void TokenDictionary::InitializeFromSequenceReader(SequenceReader *reader) {
  LOG(INFO) << "Creating token dictionary...";

  int form_cutoff = FLAGS_form_cutoff;
  int form_lower_cutoff = FLAGS_form_cutoff;
  int shape_cutoff = 0;
  int prefix_length = FLAGS_prefix_length;
  int suffix_length = FLAGS_suffix_length;
  bool form_case_sensitive = FLAGS_form_case_sensitive;

  std::vector<int> form_freqs;
  std::vector<int> form_lower_freqs;
  std::vector<int> shape_freqs;
  Alphabet form_alphabet;
  Alphabet form_lower_alphabet;
  Alphabet shape_alphabet;

  std::string special_symbols[NUM_SPECIAL_TOKENS];
  special_symbols[TOKEN_UNKNOWN] = kTokenUnknown;
  special_symbols[TOKEN_START] = kTokenStart;
  special_symbols[TOKEN_STOP] = kTokenStop;

  for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
    prefix_alphabet_.Insert(special_symbols[i]);
    suffix_alphabet_.Insert(special_symbols[i]);
    form_alphabet.Insert(special_symbols[i]);
    form_lower_alphabet.Insert(special_symbols[i]);
    shape_alphabet.Insert(special_symbols[i]);

    // Counts of special symbols are set to -1:
    form_freqs.push_back(-1);
    form_lower_freqs.push_back(-1);
    shape_freqs.push_back(-1);
  }

  // Go through the corpus and build the dictionaries,
  // counting the frequencies.
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  SequenceInstance *instance =
      static_cast<SequenceInstance*>(reader->GetNext());
  while (instance != NULL) {
    int instance_length = instance->size();
    for (int i = 0; i < instance_length; ++i) {
      int id;

      // Add form to alphabet.
      std::string form = instance->GetForm(i);
      std::string form_lower(form);
      transform(form_lower.begin(), form_lower.end(),
                form_lower.begin(), ::tolower);
      if (!form_case_sensitive) form = form_lower;
      id = form_alphabet.Insert(form);
      if (id >= form_freqs.size()) {
        CHECK_EQ(id, form_freqs.size());
        form_freqs.push_back(0);
      }
      ++form_freqs[id];

      // Add lower-case form to the alphabet.
      id = form_lower_alphabet.Insert(form_lower);
      if (id >= form_lower_freqs.size()) {
        CHECK_EQ(id, form_lower_freqs.size());
        form_lower_freqs.push_back(0);
      }
      ++form_lower_freqs[id];

      // Add prefix/suffix to alphabet.
      std::string prefix = form.substr(0, prefix_length);
      id = prefix_alphabet_.Insert(prefix);
      int start = form.length() - suffix_length;
      if (start < 0) start = 0;
      std::string suffix = form.substr(start, suffix_length);
      id = suffix_alphabet_.Insert(suffix);

      // Add shape to alphabet.
      std::string shape;
      GetWordShape(instance->GetForm(i), &shape);
      id = shape_alphabet.Insert(shape);
      if (id >= shape_freqs.size()) {
        CHECK_EQ(id, shape_freqs.size());
        shape_freqs.push_back(0);
      }
      ++shape_freqs[id];
    }
    delete instance;
    instance = static_cast<SequenceInstance*>(reader->GetNext());
  }
  reader->Close();

  // Now adjust the cutoffs if necessary.
  // Keep raising the form cutoff (discarding forms whose frequency does not
  // exceed it) until the alphabet fits below the maximum allowed size.
  while (true) {
    form_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = form_alphabet.begin();
         iter != form_alphabet.end();
         ++iter) {
      if (form_freqs[iter->second] > form_cutoff) {
        form_alphabet_.Insert(iter->first);
      }
    }
    if (form_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_cutoff;
    LOG(INFO) << "Incrementing form cutoff to " << form_cutoff << "...";
  }

  // Same procedure for the lower-cased forms.
  while (true) {
    form_lower_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_lower_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = form_lower_alphabet.begin();
         iter != form_lower_alphabet.end();
         ++iter) {
      if (form_lower_freqs[iter->second] > form_lower_cutoff) {
        form_lower_alphabet_.Insert(iter->first);
      }
    }
    if (form_lower_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_lower_cutoff;
    LOG(INFO) << "Incrementing lower-case form cutoff to "
              << form_lower_cutoff << "...";
  }

  // Same procedure for the word shapes.
  while (true) {
    shape_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      shape_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = shape_alphabet.begin();
         iter != shape_alphabet.end();
         ++iter) {
      if (shape_freqs[iter->second] > shape_cutoff) {
        shape_alphabet_.Insert(iter->first);
      }
    }
    if (shape_alphabet_.size() < kMaxShapeAlphabetSize) break;
    ++shape_cutoff;
    LOG(INFO) << "Incrementing shape cutoff to " << shape_cutoff << "...";
  }

  form_alphabet_.StopGrowth();
  form_lower_alphabet_.StopGrowth();
  shape_alphabet_.StopGrowth();
  lemma_alphabet_.StopGrowth();
  prefix_alphabet_.StopGrowth();
  suffix_alphabet_.StopGrowth();
  feats_alphabet_.StopGrowth();
  pos_alphabet_.StopGrowth();
  cpos_alphabet_.StopGrowth();

  LOG(INFO) << "Number of forms: " << form_alphabet_.size() << endl
            << "Number of prefixes: " << prefix_alphabet_.size() << endl
            << "Number of suffixes: " << suffix_alphabet_.size() << endl
            << "Number of word shapes: " << shape_alphabet_.size();

  CHECK_LT(form_alphabet_.size(), 0xffff);
  CHECK_LT(form_lower_alphabet_.size(), 0xffff);
  CHECK_LT(shape_alphabet_.size(), 0xffff);
  CHECK_LT(lemma_alphabet_.size(), 0xffff);
  CHECK_LT(prefix_alphabet_.size(), 0xffff);
  CHECK_LT(suffix_alphabet_.size(), 0xffff);
  CHECK_LT(feats_alphabet_.size(), 0xffff);
  CHECK_LT(pos_alphabet_.size(), 0xff);
  CHECK_LT(cpos_alphabet_.size(), 0xff);
}
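// Illustrative sketch only (not part of the original code): the shape strings
// inserted above come from GetWordShape(), which is defined elsewhere in this
// class. A common way such word-shape functions are implemented is to map
// every character to a class symbol and collapse repeated symbols; the helper
// below is a hypothetical example of that idea and may differ from the actual
// GetWordShape() used here. It assumes <string> and <cctype> are available.
static void ExampleWordShapeSketch(const std::string &word,
                                   std::string *shape) {
  shape->clear();
  for (size_t i = 0; i < word.length(); ++i) {
    char c = word[i];
    char symbol;
    if (::isupper(c)) symbol = 'A';       // Upper-case letter.
    else if (::islower(c)) symbol = 'a';  // Lower-case letter.
    else if (::isdigit(c)) symbol = '0';  // Digit.
    else symbol = c;                      // Punctuation and everything else.
    // Collapse runs of the same character class into a single symbol,
    // e.g. "McDonald's" -> "AaAa'a" and "1984" -> "0".
    if (shape->empty() || (*shape)[shape->length() - 1] != symbol) {
      shape->push_back(symbol);
    }
  }
}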
void TaggerDictionary::CreateTagDictionary(SequenceReader *reader) {
  SequenceDictionary::CreateTagDictionary(reader);

  LOG(INFO) << "Creating word-tag dictionary...";

  bool form_case_sensitive = FLAGS_form_case_sensitive;

  // Go through the corpus and build the existing tags for each word.
  word_tags_.clear();
  word_tags_.resize(token_dictionary_->GetNumForms());
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  SequenceInstance *instance =
      static_cast<SequenceInstance*>(reader->GetNext());
  while (instance != NULL) {
    int instance_length = instance->size();
    for (int i = 0; i < instance_length; ++i) {
      int id;
      string form = instance->GetForm(i);
      if (!form_case_sensitive) {
        transform(form.begin(), form.end(), form.begin(), ::tolower);
      }
      int word_id = token_dictionary_->GetFormId(form);
      //CHECK_GE(word_id, 0);
      id = tag_alphabet_.Lookup(instance->GetTag(i));
      CHECK_GE(id, 0);
      // Insert new tag in the set of word tags, if it is not there
      // already. NOTE: this is inefficient, maybe we should be using a
      // different data structure.
      if (word_id >= 0) {
        vector<int> &tags = word_tags_[word_id];
        int j;
        for (j = 0; j < tags.size(); ++j) {
          if (tags[j] == id) break;
        }
        if (j == tags.size()) tags.push_back(id);
      }
    }
    delete instance;
    instance = static_cast<SequenceInstance*>(reader->GetNext());
  }
  reader->Close();

  // If there is a list of possible tags for the unknown words, load it.
  TaggerOptions *options =
      static_cast<TaggerOptions*>(pipe_->GetOptions());
  if (options->GetUnknownWordTagsFilePath().size() == 0) {
    // No file provided: allow any tag for unknown words.
    for (int i = 0; i < tag_alphabet_.size(); ++i) {
      unknown_word_tags_.push_back(i);
    }
  } else {
    LOG(INFO) << "Loading file with unknown word tags...";
    std::ifstream is;
    is.open(options->GetUnknownWordTagsFilePath().c_str(), ifstream::in);
    CHECK(is.good()) << "Could not open "
                     << options->GetUnknownWordTagsFilePath() << ".";
    vector<vector<string> > sentence_fields;
    string line;
    if (is.is_open()) {
      while (!is.eof()) {
        getline(is, line);
        if (line.size() == 0) break;
        int tagid = tag_alphabet_.Lookup(line);
        CHECK(tagid >= 0) << "Tag " << line << " does not exist.";
        unknown_word_tags_.push_back(tagid);
        LOG(INFO) << "Unknown word tag: " << line;
      }
    }
  }
  LOG(INFO) << "Number of unknown word tags: " << unknown_word_tags_.size();
}
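// For reference, the unknown-word-tags file loaded above is plain text with
// one tag per line; reading stops at the first empty line, and every tag must
// already exist in tag_alphabet_ or the CHECK fails. A hypothetical example
// (the tag names below are illustrative, not taken from any particular tag
// set):
//
//   NN
//   NNP
//   JJ
//   CD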