コード例 #1
0
void SequenceInstanceNumeric::Initialize(const SequenceDictionary &dictionary,
                                         SequenceInstance* instance) {
  TokenDictionary *token_dictionary = dictionary.GetTokenDictionary();
  int length = instance->size();
  int i;
  int id;

  int prefix_length = FLAGS_prefix_length;
  int suffix_length = FLAGS_suffix_length;
  bool form_case_sensitive = FLAGS_form_case_sensitive;

  Clear();

  form_ids_.resize(length);
  prefix_ids_.resize(length);
  suffix_ids_.resize(length);
  shape_ids_.resize(length);
  has_digit_.resize(length);
  has_upper_.resize(length);
  has_hyphen_.resize(length);
  all_digits_.resize(length);
  all_digits_with_punctuation_.resize(length);
  all_upper_.resize(length);
  first_upper_.resize(length);
  tag_ids_.resize(length);

  for (i = 0; i < length; i++) {
    std::string form = instance->GetForm(i);
    if (!form_case_sensitive) {
      transform(form.begin(), form.end(), form.begin(), ::tolower);
    }
    id = token_dictionary->GetFormId(form);
    CHECK_LT(id, 0xffff);
    if (id < 0) id = TOKEN_UNKNOWN;
    form_ids_[i] = id;

    prefix_ids_[i].resize(prefix_length);
    for (int l = 0; l < prefix_length; ++l) {
      std::string prefix = form.substr(0, l + 1);
      id = token_dictionary->GetPrefixId(prefix);
      CHECK_LT(id, 0xffff);
      if (id < 0) id = TOKEN_UNKNOWN;
      prefix_ids_[i][l] = id;
    }

    suffix_ids_[i].resize(suffix_length);
    for (int l = 0; l < suffix_length; ++l) {
      int start = form.length() - l - 1;
      if (start < 0) start = 0;
      std::string suffix = form.substr(start, l + 1);
      id = token_dictionary->GetSuffixId(suffix);
      CHECK_LT(id, 0xffff);
      if (id < 0) id = TOKEN_UNKNOWN;
      suffix_ids_[i][l] = id;
    }

    // Compute and store the word shape.
    std::string shape;
    dictionary.GetTokenDictionary()->GetWordShape(instance->GetForm(i), &shape);
    int shape_id = dictionary.GetTokenDictionary()->GetShapeId(shape);
    CHECK_LT(shape_id, 0xffff);
    if (shape_id < 0) shape_id = kUnknownShape;
    shape_ids_[i] = shape_id;

    // Compute and store various flags.
    const char* word = instance->GetForm(i).c_str();
    int word_length = instance->GetForm(i).length();
    int num_digits = CountDigits(word, word_length);

    has_digit_[i] = (num_digits > 0);
    has_upper_[i] = HasUpperCaseLetters(word, word_length);
    has_hyphen_[i] = HasHyphen(word, word_length);
    all_digits_[i] = AllDigits(word, word_length);
    all_digits_with_punctuation_[i] = AllDigitsWithPunctuation(word, word_length);
    all_upper_[i] = AllUpperCase(word, word_length);
    first_upper_[i] = IsCapitalized(word, word_length);

    //id = token_dictionary->GetPosTagId(instance->GetTag(i));
    id = dictionary.GetTagAlphabet().Lookup(instance->GetTag(i));
    //CHECK_LT(id, 0xff);
    //CHECK_GE(id, 0);
    if (id < 0) {
      id = TOKEN_UNKNOWN;
      VLOG(2) << "Unknown tag: " << instance->GetTag(i);
    }
    tag_ids_[i] = id;
  }
}
コード例 #2
0
void CoreferenceSentenceNumeric::Initialize(
    const CoreferenceDictionary &dictionary,
    CoreferenceSentence* instance,
    bool add_gold_mentions,
    std::map<std::string, int> *coreference_span_names) {
  TokenDictionary *token_dictionary = dictionary.GetTokenDictionary();
  DependencyDictionary *dependency_dictionary =
    dictionary.GetDependencyDictionary();
  SemanticDictionary *semantic_dictionary =
    dictionary.GetSemanticDictionary();
  SemanticInstance *semantic_instance =
    static_cast<SemanticInstance*>(instance);

  Clear();

  SemanticInstanceNumeric::Initialize(*semantic_dictionary,
                                      semantic_instance);

  const std::vector<EntitySpan*> &entity_spans = instance->GetEntitySpans();
  entity_spans_.resize(entity_spans.size());
  for (int k = 0; k < entity_spans.size(); ++k) {
    int start = entity_spans[k]->start();
    int end = entity_spans[k]->end();
    const std::string &name = entity_spans[k]->name();
    int id = dictionary.GetEntityAlphabet().Lookup(name);
    CHECK_LT(id, 0xff);
    if (id < 0) id = kUnknownEntity;
    entity_spans_[k] = new NumericSpan(start, end, id);
  }

  const std::vector<NamedSpan*> &constituent_spans =
    instance->GetConstituentSpans();
  constituent_spans_.resize(constituent_spans.size());
  for (int k = 0; k < constituent_spans.size(); ++k) {
    int start = constituent_spans[k]->start();
    int end = constituent_spans[k]->end();
    const std::string &name = constituent_spans[k]->name();
    int id = dictionary.GetConstituentAlphabet().Lookup(name);
    CHECK_LT(id, 0xffff);
    if (id < 0) id = kUnknownConstituent;
    constituent_spans_[k] = new NumericSpan(start, end, id);
  }

  //std::map<std::string, int> span_names;
  const std::vector<NamedSpan*> &coreference_spans =
    instance->GetCoreferenceSpans();
  coreference_spans_.resize(coreference_spans.size());
  for (int k = 0; k < coreference_spans.size(); ++k) {
    int start = coreference_spans[k]->start();
    int end = coreference_spans[k]->end();
    const std::string &name = coreference_spans[k]->name();
    std::map<std::string, int>::const_iterator it =
      coreference_span_names->find(name);
    int id = -1;
    if (it == coreference_span_names->end()) {
      id = coreference_span_names->size();
      (*coreference_span_names)[name] = id;
    } else {
      id = it->second;
    }
    coreference_spans_[k] = new NumericSpan(start, end, id);
  }

  int length = instance->size();
  first_upper_.resize(length);
  for (int i = 0; i < length; ++i) {
    const char* word = instance->GetForm(i).c_str();
    int word_length = instance->GetForm(i).length();
    first_upper_[i] = IsCapitalized(word, word_length);
  }

  // Generate candidate mentions.
  GenerateMentions(dictionary, instance);

  //LOG(INFO) << mentions_.size() << " found in sentence.";

  // Add gold mentions as candidates (only for training).
  if (add_gold_mentions) AddGoldMentions(dictionary, instance);


#if 0
  LOG(INFO) << mentions_.size() << " found in sentence.";
  for (int i = 0; i < mentions_.size(); ++i) {
    LOG(INFO) << "Mention "
              << mentions_[i]->start() << " "
              << mentions_[i]->end();

  }
#endif

#if 0
  // Print mention information (for debugging purposes).
  LOG(INFO) << mentions_.size() << " found in sentence.";
  for (int i = 0; i < mentions_.size(); ++i) {
    mentions_[i]->Print(dictionary, instance);
  }
#endif
}