// Builds the word and lower-case-word alphabets from the training corpus.
// The corpus is scanned once; every form (and its lower-cased variant) is
// inserted into the corresponding alphabet and its occurrence count is
// accumulated in a parallel frequency vector.
void CoreferenceDictionary::CreateWordDictionaries(
    CoreferenceSentenceReader *reader) {
  LOG(INFO) << "Creating word dictionary...";
  // Occurrence counts, indexed by alphabet id (parallel to the alphabets).
  std::vector<int> word_freqs;
  std::vector<int> word_lower_freqs;
  /*
  string special_symbols[NUM_SPECIAL_TOKENS];
  special_symbols[TOKEN_UNKNOWN] = kTokenUnknown;
  special_symbols[TOKEN_START] = kTokenStart;
  special_symbols[TOKEN_STOP] = kTokenStop;
  for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
    word_alphabet.Insert(special_symbols[i]);
    word_freqs.push_back(0);
    word_lower_alphabet.Insert(special_symbols[i]);
    word_lower_freqs.push_back(0);
  }
  */

  // Single pass over the training data, growing both alphabets and
  // counting the frequencies.
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  CoreferenceSentence *sentence =
      static_cast<CoreferenceSentence*>(reader->GetNext());
  while (sentence != NULL) {
    for (int t = 0; t < sentence->size(); ++t) {
      // Surface form and its lower-cased variant.
      std::string form = sentence->GetForm(t);
      std::string lower_form(form);
      transform(lower_form.begin(), lower_form.end(), lower_form.begin(),
                ::tolower);

      // Register the original-case form and bump its count.
      int id = word_alphabet_.Insert(form);
      if (id >= word_freqs.size()) {
        CHECK_EQ(id, word_freqs.size());
        word_freqs.push_back(0);
      }
      ++word_freqs[id];

      // Register the lower-case form and bump its count.
      // TODO(atm): "sanitize" words, by escaping digit sequences:
      // word = re.sub('[\d]+', '#', word.lower())
      id = word_lower_alphabet_.Insert(lower_form);
      if (id >= word_lower_freqs.size()) {
        CHECK_EQ(id, word_lower_freqs.size());
        word_lower_freqs.push_back(0);
      }
      ++word_lower_freqs[id];
    }
    delete sentence;
    sentence = static_cast<CoreferenceSentence*>(reader->GetNext());
  }
  reader->Close();

  // Freeze the alphabets: no new words may be added after training scan.
  word_alphabet_.StopGrowth();
  word_lower_alphabet_.StopGrowth();
  LOG(INFO) << "Number of words: " << word_alphabet_.size();
  LOG(INFO) << "Number of lower-case words: " << word_lower_alphabet_.size();
}
void DependencyInstanceNumeric::Initialize( const DependencyDictionary &dictionary, DependencyInstance* instance) { TokenDictionary *token_dictionary = dictionary.GetTokenDictionary(); int length = instance->size(); int i; int id; int prefix_length = FLAGS_prefix_length; int suffix_length = FLAGS_suffix_length; bool form_case_sensitive = FLAGS_form_case_sensitive; Clear(); form_ids_.resize(length); form_lower_ids_.resize(length); lemma_ids_.resize(length); prefix_ids_.resize(length); suffix_ids_.resize(length); feats_ids_.resize(length); pos_ids_.resize(length); cpos_ids_.resize(length); //shapes_.resize(length); is_noun_.resize(length); is_verb_.resize(length); is_punc_.resize(length); is_coord_.resize(length); heads_.resize(length); relations_.resize(length); for (i = 0; i < length; i++) { std::string form = instance->GetForm(i); std::string form_lower(form); transform(form_lower.begin(), form_lower.end(), form_lower.begin(), ::tolower); if (!form_case_sensitive) form = form_lower; id = token_dictionary->GetFormId(form); CHECK_LT(id, 0xffff); if (id < 0) id = TOKEN_UNKNOWN; form_ids_[i] = id; id = token_dictionary->GetFormLowerId(form_lower); CHECK_LT(id, 0xffff); if (id < 0) id = TOKEN_UNKNOWN; form_lower_ids_[i] = id; id = token_dictionary->GetLemmaId(instance->GetLemma(i)); CHECK_LT(id, 0xffff); if (id < 0) id = TOKEN_UNKNOWN; lemma_ids_[i] = id; std::string prefix = form.substr(0, prefix_length); id = token_dictionary->GetPrefixId(prefix); CHECK_LT(id, 0xffff); if (id < 0) id = TOKEN_UNKNOWN; prefix_ids_[i] = id; int start = form.length() - suffix_length; if (start < 0) start = 0; std::string suffix = form.substr(start, suffix_length); id = token_dictionary->GetSuffixId(suffix); CHECK_LT(id, 0xffff); if (id < 0) id = TOKEN_UNKNOWN; suffix_ids_[i] = id; id = token_dictionary->GetPosTagId(instance->GetPosTag(i)); CHECK_LT(id, 0xff); if (id < 0) id = TOKEN_UNKNOWN; pos_ids_[i] = id; id = token_dictionary->GetCoarsePosTagId(instance->GetCoarsePosTag(i)); 
CHECK_LT(id, 0xff); if (id < 0) id = TOKEN_UNKNOWN; cpos_ids_[i] = id; feats_ids_[i].resize(instance->GetNumMorphFeatures(i)); for (int j = 0; j < instance->GetNumMorphFeatures(i); ++j) { id = token_dictionary->GetMorphFeatureId(instance->GetMorphFeature(i, j)); CHECK_LT(id, 0xffff); if (id < 0) id = TOKEN_UNKNOWN; feats_ids_[i][j] = id; } //GetWordShape(instance->GetForm(i), &shapes_[i]); // Check whether the word is a noun, verb, punctuation or coordination. // Note: this depends on the POS tag string. // This procedure is taken from EGSTRA // (http://groups.csail.mit.edu/nlp/egstra/). is_noun_[i] = false; is_verb_[i] = false; is_punc_[i] = false; is_coord_[i] = false; const char* tag = instance->GetPosTag(i).c_str(); if (tag[0] == 'v' || tag[0] == 'V') { is_verb_[i] = true; } else if (tag[0] == 'n' || tag[0] == 'N') { is_noun_[i] = true; } else if (strcmp(tag, "Punc") == 0 || strcmp(tag, "$,") == 0 || strcmp(tag, "$.") == 0 || strcmp(tag, "PUNC") == 0 || strcmp(tag, "punc") == 0 || strcmp(tag, "F") == 0 || strcmp(tag, "IK") == 0 || strcmp(tag, "XP") == 0 || strcmp(tag, ",") == 0 || strcmp(tag, ";") == 0) { is_punc_[i] = true; } else if (strcmp(tag, "Conj") == 0 || strcmp(tag, "KON") == 0 || strcmp(tag, "conj") == 0 || strcmp(tag, "Conjunction") == 0 || strcmp(tag, "CC") == 0 || strcmp(tag, "cc") == 0) { is_coord_[i] = true; } heads_[i] = instance->GetHead(i); relations_[i] = dictionary.GetLabelAlphabet().Lookup( instance->GetDependencyRelation(i)); } }
void EntityTokenDictionary::Initialize(EntityReader *reader) { SetTokenDictionaryFlagValues(); LOG(INFO) << "Creating token dictionary..."; std::vector<int> form_freqs; std::vector<int> form_lower_freqs; std::vector<int> shape_freqs; std::vector<int> pos_freqs; Alphabet form_alphabet; Alphabet form_lower_alphabet; Alphabet shape_alphabet; Alphabet pos_alphabet; std::string special_symbols[NUM_SPECIAL_TOKENS]; special_symbols[TOKEN_UNKNOWN] = kTokenUnknown; special_symbols[TOKEN_START] = kTokenStart; special_symbols[TOKEN_STOP] = kTokenStop; for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { prefix_alphabet_.Insert(special_symbols[i]); suffix_alphabet_.Insert(special_symbols[i]); form_alphabet.Insert(special_symbols[i]); form_lower_alphabet.Insert(special_symbols[i]); shape_alphabet.Insert(special_symbols[i]); pos_alphabet.Insert(special_symbols[i]); // Counts of special symbols are set to -1: form_freqs.push_back(-1); form_lower_freqs.push_back(-1); shape_freqs.push_back(-1); pos_freqs.push_back(-1); } // Go through the corpus and build the dictionaries, // counting the frequencies. reader->Open(pipe_->GetOptions()->GetTrainingFilePath()); EntityInstance *instance = static_cast<EntityInstance*>(reader->GetNext()); while (instance != NULL) { int instance_length = instance->size(); for (int i = 0; i < instance_length; ++i) { int id; // Add form to alphabet. std::string form = instance->GetForm(i); std::string form_lower(form); std::transform(form_lower.begin(), form_lower.end(), form_lower.begin(), ::tolower); if (!form_case_sensitive) form = form_lower; id = form_alphabet.Insert(form); if (id >= form_freqs.size()) { CHECK_EQ(id, form_freqs.size()); form_freqs.push_back(0); } ++form_freqs[id]; // Add lower-case form to the alphabet. id = form_lower_alphabet.Insert(form_lower); if (id >= form_lower_freqs.size()) { CHECK_EQ(id, form_lower_freqs.size()); form_lower_freqs.push_back(0); } ++form_lower_freqs[id]; // Add prefix/suffix to alphabet. 
std::string prefix = form.substr(0, prefix_length); id = prefix_alphabet_.Insert(prefix); int start = form.length() - suffix_length; if (start < 0) start = 0; std::string suffix = form.substr(start, suffix_length); id = suffix_alphabet_.Insert(suffix); // Add shape to alphabet. std::string shape; GetWordShape(instance->GetForm(i), &shape); id = shape_alphabet.Insert(shape); if (id >= shape_freqs.size()) { CHECK_EQ(id, shape_freqs.size()); shape_freqs.push_back(0); } ++shape_freqs[id]; // Add POS to alphabet. id = pos_alphabet.Insert(instance->GetPosTag(i)); if (id >= pos_freqs.size()) { CHECK_EQ(id, pos_freqs.size()); pos_freqs.push_back(0); } ++pos_freqs[id]; } delete instance; instance = static_cast<EntityInstance*>(reader->GetNext()); } reader->Close(); // Now adjust the cutoffs if necessary. while (true) { form_alphabet_.clear(); for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { form_alphabet_.Insert(special_symbols[i]); } for (Alphabet::iterator iter = form_alphabet.begin(); iter != form_alphabet.end(); ++iter) { if (form_freqs[iter->second] > form_cutoff) { form_alphabet_.Insert(iter->first); } } if (form_alphabet_.size() < kMaxFormAlphabetSize) break; ++form_cutoff; LOG(INFO) << "Incrementing form cutoff to " << form_cutoff << "..."; } while (true) { form_lower_alphabet_.clear(); for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { form_lower_alphabet_.Insert(special_symbols[i]); } for (Alphabet::iterator iter = form_lower_alphabet.begin(); iter != form_lower_alphabet.end(); ++iter) { if (form_lower_freqs[iter->second] > form_lower_cutoff) { form_lower_alphabet_.Insert(iter->first); } } if (form_lower_alphabet_.size() < kMaxFormAlphabetSize) break; ++form_lower_cutoff; LOG(INFO) << "Incrementing lower-case form cutoff to " << form_lower_cutoff << "..."; } while (true) { shape_alphabet_.clear(); for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { shape_alphabet_.Insert(special_symbols[i]); } for (Alphabet::iterator iter = shape_alphabet.begin(); iter != 
shape_alphabet.end(); ++iter) { if (shape_freqs[iter->second] > shape_cutoff) { shape_alphabet_.Insert(iter->first); } } if (shape_alphabet_.size() < kMaxShapeAlphabetSize) break; ++shape_cutoff; LOG(INFO) << "Incrementing shape cutoff to " << shape_cutoff << "..."; } while (true) { pos_alphabet_.clear(); for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) { pos_alphabet_.Insert(special_symbols[i]); } for (const auto& pos_token : pos_alphabet) { if (pos_freqs[pos_token.second] > pos_cutoff) { pos_alphabet_.Insert(pos_token.first); } } if (pos_alphabet_.size() < kMaxPosAlphabetSize) break; ++pos_cutoff; LOG(INFO) << "Incrementing POS cutoff to " << pos_cutoff << "..."; } form_alphabet_.StopGrowth(); form_lower_alphabet_.StopGrowth(); shape_alphabet_.StopGrowth(); lemma_alphabet_.StopGrowth(); prefix_alphabet_.StopGrowth(); suffix_alphabet_.StopGrowth(); feats_alphabet_.StopGrowth(); pos_alphabet_.StopGrowth(); cpos_alphabet_.StopGrowth(); LOG(INFO) << "Number of forms: " << form_alphabet_.size() << endl << "Number of lower-case forms: " << form_lower_alphabet_.size() << endl << "Number of prefixes: " << prefix_alphabet_.size() << endl << "Number of suffixes: " << suffix_alphabet_.size() << endl << "Number of word shapes: " << shape_alphabet_.size() << endl << "Number of pos: " << pos_alphabet_.size(); CHECK_LT(form_alphabet_.size(), 0xffff); CHECK_LT(form_lower_alphabet_.size(), 0xffff); CHECK_LT(shape_alphabet_.size(), 0xffff); CHECK_LT(lemma_alphabet_.size(), 0xffff); CHECK_LT(prefix_alphabet_.size(), 0xffff); CHECK_LT(suffix_alphabet_.size(), 0xffff); CHECK_LT(feats_alphabet_.size(), 0xffff); CHECK_LT(pos_alphabet_.size(), 0xff); CHECK_LT(cpos_alphabet_.size(), 0xff); #ifndef NDEBUG BuildNames(); #endif }