void bilou_ner_trainer::train(ner_id id, int stages, const network_parameters& parameters, const tagger& tagger,
                              istream& features, istream& train, istream& heldout, ostream& os) {
  if (stages <= 0) runtime_failure("Cannot train NER with <= 0 stages!");
  if (stages >= 256) runtime_failure("Cannot train NER with >= 256 stages!");

  // Load training and possibly also heldout data
  entity_map entities;
  vector<labelled_sentence> train_data;
  cerr << "Loading train data: ";
  load_data(train, tagger, train_data, entities, true);
  cerr << "done, " << train_data.size() << " sentences" << endl;
  cerr << "Found " << entities.size() << " annotated entity types." << endl;

  vector<labelled_sentence> heldout_data;
  if (heldout) {
    cerr << "Loading heldout data: ";
    load_data(heldout, tagger, heldout_data, entities, false);
    cerr << "done, " << heldout_data.size() << " sentences" << endl;
  }

  // Parse feature templates
  feature_templates templates;
  unique_ptr<tokenizer> tokenizer(bilou_ner::new_tokenizer(id));
  cerr << "Parsing feature templates: ";
  templates.parse(features, entities, nlp_pipeline(tokenizer.get(), &tagger));
  cerr << "done" << endl;

  // Train the required number of stages
  vector<network_classifier> networks(stages);
  for (auto&& network : networks) {
    // Generate features
    cerr << "Generating features: ";
    vector<classifier_instance> train_instances, heldout_instances;
    generate_instances(train_data, templates, train_instances, true);
    generate_instances(heldout_data, templates, heldout_instances, false);
    cerr << "done" << endl;

    // Train the classifier network of this stage
    cerr << "Training network classifier." << endl;
    if (!network.train(templates.get_total_features(), bilou_entity::total(entities.size()),
                       train_instances, heldout_instances, parameters, true))
      runtime_failure("Cannot train the network classifier!");

    // Use the trained classifier to compute previous_stage
    compute_previous_stage(train_data, templates, network);
    compute_previous_stage(heldout_data, templates, network);
  }

  // Encode the recognizer
  cerr << "Encoding the recognizer." << endl;
  if (!entities.save(os)) runtime_failure("Cannot save entity map!");
  if (!templates.save(os)) runtime_failure("Cannot save feature templates!");
  if (!os.put(stages)) runtime_failure("Cannot save number of stages!");
  for (auto&& network : networks)
    if (!network.save(os)) runtime_failure("Cannot save classifier network!");
}
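// Note on the trained model layout: the stages are trained sequentially,
// and after each stage compute_previous_stage stores the stage's
// predictions in the sentences so that the feature templates of the next
// stage can condition on them. The serialized recognizer then consists of
// the entity map, the feature templates, a single byte with the number of
// stages (hence the < 256 check above), and the networks in stage order.
// A hypothetical loader would read the components back in the same order
// (the load() counterparts sketched below are assumed for illustration and
// are not part of this file):
//
//   entity_map entities;
//   feature_templates templates;
//   if (!entities.load(is) || !templates.load(is)) return false;
//   vector<network_classifier> networks(is.get());
//   for (auto&& network : networks)
//     if (!network.load(is)) return false;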
void bilou_ner_trainer::load_data(istream& is, const tagger& tagger, vector<labelled_sentence>& data,
                                  entity_map& entity_map, bool add_entities) {
  vector<string> words, entities;
  vector<string_piece> forms;

  data.clear();

  string line;
  vector<string> tokens;
  for (bool eof; true; ) {
    eof = !getline(is, line);

    if (eof || line.empty()) {
      if (!words.empty()) {
        // Tag the sentence
        forms.clear();
        for (auto&& word : words)
          forms.emplace_back(word);
        data.emplace_back();
        auto& sentence = data.back();
        tagger.tag(forms, sentence.sentence);

        // Clear previous_stage
        sentence.sentence.clear_previous_stage();

        // Decode the entity names and ranges
        for (unsigned i = 0; i < entities.size(); i++)
          if (entities[i] == "_" || entities[i] == "O")
            sentence.outcomes.emplace_back(bilou_entity::O);
          else if (entities[i].size() >= 3 && (entities[i].compare(0, 2, "I-") == 0 || entities[i].compare(0, 2, "B-") == 0)) {
            bool has_prev = i > 0 && entities[i][0] == 'I' &&
                entities[i-1].compare(1, string::npos, entities[i], 1, string::npos) == 0;
            bool has_next = i+1 < entities.size() && entities[i+1][0] != 'B' &&
                entities[i+1].compare(1, string::npos, entities[i], 1, string::npos) == 0;
            entity_type entity = entity_map.parse(entities[i].c_str() + 2, add_entities);
            sentence.outcomes.emplace_back(!has_prev && !has_next ? bilou_entity::U(entity) :
                                           !has_prev && has_next ? bilou_entity::B(entity) :
                                           has_prev && has_next ? bilou_entity::I : bilou_entity::L);
          } else {
            runtime_failure("Cannot parse entity type " << entities[i] << "!");
          }

        // Start a new sentence
        words.clear();
        entities.clear();
      }
      if (eof) break;
    } else {
      split(line, '\t', tokens);
      if (tokens.size() != 2) runtime_failure("The NER data line '" << line << "' does not contain two columns!");
      words.emplace_back(tokens[0]);
      entities.emplace_back(tokens[1]);
    }
  }
}
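// load_data above reads CoNLL-style vertical input: one token per line with
// two tab-separated columns (the form and its entity label), and a blank
// line terminating each sentence. Labels "_" and "O" mark tokens outside
// any entity, "B-TYPE" starts an entity and "I-TYPE" continues one. A
// hypothetical fragment (the entity types are illustrative only; the
// columns are separated by a tab):
//
//   John     B-PER
//   Smith    I-PER
//   visited  O
//   Prague   B-LOC
//
// These BIO labels are converted to BILOU outcomes: a single-token entity
// becomes U(type), the first token of a multi-token entity B(type), inner
// tokens I, and the last token L.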
void feature_templates::parse(istream& is, entity_map& entities, const nlp_pipeline& pipeline) {
  sentence_processors.clear();
  entity_processors.clear();
  total_features = 1; // An omnipresent feature used in process_sentence

  string line;
  vector<string> tokens;
  while (getline(is, line)) {
    // Ignore empty lines and lines commented with #
    if (line.empty() || line[0] == '#') continue;

    split(line, ' ', tokens);
    if (tokens.size() < 1) runtime_failure("Bad line '" << line << "' of feature templates file!");

    vector<string> token0_parts;
    split(tokens[0], '/', token0_parts);
    if (token0_parts.size() < 1 || token0_parts.size() > 2)
      runtime_failure("Bad feature template description at line '" << line << "' of feature templates file!");

    string template_name = token0_parts[0];
    int window = token0_parts.size() > 1 ? parse_int(token0_parts[1].c_str(), "feature_template_window") : 0;
    vector<string> args;
    for (unsigned i = 1; i < tokens.size(); i++)
      args.emplace_back(tokens[i]);

    // Try sentence processor
    auto* maybe_sentence_processor = sentence_processor::create(template_name);
    if (maybe_sentence_processor) {
      if (!maybe_sentence_processor->parse(window, args, entities, &total_features, pipeline))
        runtime_failure("Cannot initialize feature template sentence processor '" << template_name << "' from line '" << line << "' of feature templates file!");
      sentence_processors.emplace_back(template_name, maybe_sentence_processor);
      continue;
    }

    // Try entity processor
    auto* maybe_entity_processor = entity_processor::create(template_name);
    if (maybe_entity_processor) {
      if (window) cerr << "Ignoring window of " << window << " specified in entity_processor '" << template_name << "'." << endl;
      if (!maybe_entity_processor->parse(args, entities))
        runtime_failure("Cannot initialize feature template entity processor '" << template_name << "' from line '" << line << "' of feature templates file!");
      entity_processors.emplace_back(template_name, maybe_entity_processor);
      continue;
    }

    // Fail
    runtime_failure("Cannot create feature template '" << template_name << "' from line '" << line << "' of feature templates file!");
  }
}
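// Each line of the feature templates file parsed above therefore consists
// of a template name, optionally followed by "/window", and then
// space-separated template arguments; empty lines and lines starting with
// "#" are ignored. A hypothetical templates file (the template names below
// are examples only and must correspond to registered sentence or entity
// processors):
//
//   # sentence processors with window 2
//   Form/2
//   Lemma/2
//   # an entity processor; a "/window" suffix would be ignored with a warning
//   Gazetteers gazetteers.txt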
void derivator_dictionary_encoder::encode(istream& is, istream& dictionary, bool verbose, ostream& os) {
  // Load the morphology
  cerr << "Loading morphology: ";
  auto dictionary_start = dictionary.tellg();
  unique_ptr<morpho> morpho(morpho::load(dictionary));
  if (!morpho) runtime_failure("Cannot load morpho model from given file!");
  if (morpho->get_derivator()) runtime_failure("The given morpho model already has a derivator!");
  auto dictionary_end = dictionary.tellg();
  cerr << "done" << endl;

  // Load the derivator
  cerr << "Loading derivator data: ";
  struct lemma_info {
    string sense;
    string comment;
    string parent;
    set<string> parents;
    unsigned children;
    unsigned mark;

    lemma_info(const string& sense = string(), const string& comment = string())
        : sense(sense), comment(comment), children(0), mark(0) {}
  };
  map<string, lemma_info> derinet;

  string line;
  string part_lid, lemma_lid, lemma_comment;
  vector<string> tokens;
  vector<string> parts;
  unordered_map<string, lemma_info> matched[2];
  vector<tagged_lemma_forms> matched_lemmas_forms;
  while (getline(is, line)) {
    split(line, '\t', tokens);
    if (tokens.size() != 2) runtime_failure("Expected two tab separated columns on derivator line '" << line << "'!");

    // Generate all possible lemmas and parents
    for (int i = 0; i < 2; i++) {
      split(tokens[i], ' ', parts);
      if (parts.size() > 2) runtime_failure("The derivator lemma description '" << tokens[i] << "' contains two or more spaces!");
      bool is_lemma_id = parts.size() == 1;

      part_lid.assign(parts[0], 0, morpho->lemma_id_len(parts[0]));
      morpho->generate(parts[0], is_lemma_id ? nullptr : parts[1].c_str(), morpho::NO_GUESSER, matched_lemmas_forms);

      matched[i].clear();
      for (auto&& lemma_forms : matched_lemmas_forms) {
        lemma_lid.assign(lemma_forms.lemma, 0, morpho->lemma_id_len(lemma_forms.lemma));
        if (!is_lemma_id || part_lid == lemma_lid) {
          // Choose only the shortest lemma comment for the lemma id of lemma_forms.lemma
          lemma_comment.assign(lemma_forms.lemma, lemma_lid.size(), string::npos);
          auto it = matched[i].emplace(lemma_lid, lemma_info(lemma_lid.substr(morpho->raw_lemma_len(lemma_lid)), lemma_comment));
          if (!it.second && (lemma_comment.size() < it.first->second.comment.size() ||
                             (lemma_comment.size() == it.first->second.comment.size() && lemma_comment < it.first->second.comment)))
            it.first->second.comment.assign(lemma_comment);
        }
      }
    }
    if (matched[0].empty() || matched[1].empty()) {
      if (verbose)
        cerr << "Could not match a lemma from line '" << line << "', skipping." << endl;
      continue;
    }

    // Store the possible parents
    derinet.insert(matched[0].begin(), matched[0].end());
    derinet.insert(matched[1].begin(), matched[1].end());
    for (auto&& lemma : matched[0])
      for (auto&& parent : matched[1])
        derinet[lemma.first].parents.insert(parent.first);
  }
  cerr << "done" << endl;

  // Choose a unique parent for every lemma
  for (auto&& lemma : derinet)
    if (!lemma.second.parents.empty()) {
      // Try locating the lexicographically smallest parent with the same sense
      for (auto&& parent : lemma.second.parents)
        if (derinet[parent].sense == lemma.second.sense) {
          lemma.second.parent.assign(parent);
          break;
        }

      // Otherwise, choose the lexicographically smallest parent
      if (lemma.second.parent.empty())
        lemma.second.parent.assign(*lemma.second.parents.begin());

      // Add this edge also to the parent
      derinet[lemma.second.parent].children++;

      if (verbose)
        cerr << lemma.first << lemma.second.comment << " -> " << lemma.second.parent << derinet[lemma.second.parent].comment << endl;
    }

  // Make sure the derinet contains no cycles
  unsigned mark = 0;
  for (auto&& lemma : derinet) {
    lemma.second.mark = ++mark;
    for (auto node = derinet.find(lemma.first); !node->second.parent.empty(); ) {
      node = derinet.find(node->second.parent);
      if (node->second.mark) {
        if (node->second.mark == mark)
          runtime_failure("The derivator data contains a cycle with lemma '" << lemma.first << "'!");
        break;
      }
      node->second.mark = mark;
    }
  }

  // Encode the derivator
  cerr << "Encoding derivator: ";
  os.put(morpho_ids::DERIVATOR_DICTIONARY);
  binary_encoder enc;

  vector<int> lengths;
  for (auto&& lemma : derinet) {
    if (lemma.first.size() >= lengths.size()) lengths.resize(lemma.first.size() + 1);
    lengths[lemma.first.size()]++;
  }
  enc.add_1B(lengths.size());
  for (auto&& length : lengths)
    enc.add_4B(length);

  enc.add_4B(derinet.size());
  string prev = "";
  for (auto&& lemma : derinet) {
    int cpl = 0;
    while (prev[cpl] && prev[cpl] == lemma.first[cpl]) cpl++;
    enc.add_1B(prev.size() - cpl);
    enc.add_1B(lemma.first.size() - cpl);
    enc.add_data(lemma.first.c_str() + cpl);

    enc.add_1B(lemma.second.comment.size());
    enc.add_data(lemma.second.comment);

    enc.add_2B(lemma.second.children);

    if (lemma.second.parent.empty()) {
      enc.add_1B(0);
    } else {
      // Find the longest common substring of the lemma and its parent
      unsigned best_lemma_from = 0, best_parent_from = 0, best_len = 0;
      for (unsigned lemma_from = 0; lemma_from < lemma.first.size(); lemma_from++)
        for (unsigned parent_from = 0; parent_from < lemma.second.parent.size(); parent_from++) {
          unsigned len = 0;
          while (lemma_from + len < lemma.first.size() && parent_from + len < lemma.second.parent.size() &&
                 lemma.first[lemma_from+len] == lemma.second.parent[parent_from+len])
            len++;
          if (len > best_len) best_lemma_from = lemma_from, best_parent_from = parent_from, best_len = len;
        }

      enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 };
      enc.add_1B(REMOVE_START * (best_lemma_from > 0) +
                 REMOVE_END * (best_lemma_from + best_len < lemma.first.size()) +
                 ADD_START * (best_parent_from > 0) +
                 ADD_END * (best_parent_from + best_len < lemma.second.parent.size()));
      if (best_lemma_from > 0) enc.add_1B(best_lemma_from);
      if (best_lemma_from + best_len < lemma.first.size()) enc.add_1B(lemma.first.size() - best_lemma_from - best_len);
      if (best_parent_from > 0) {
        enc.add_1B(best_parent_from);
        enc.add_data(string_piece(lemma.second.parent.c_str(), best_parent_from));
      }
      if (best_parent_from + best_len < lemma.second.parent.size()) {
        enc.add_1B(lemma.second.parent.size() - best_parent_from - best_len);
        enc.add_data(lemma.second.parent.c_str() + best_parent_from + best_len);
      }
    }

    prev.assign(lemma.first);
  }
  compressor::save(os, enc);

  // Append the morphology after the derivator dictionary model
  if (!dictionary.seekg(dictionary_start, dictionary.beg)) runtime_failure("Cannot seek in the morpho model!");
  for (auto length = dictionary_end - dictionary_start; length; length--)
    os.put(dictionary.get());
  cerr << "done" << endl;
}
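// Notes on the encoding produced above. Lemmas are written in sorted order
// (std::map iteration) and delta-encoded against the previous lemma: the
// number of trailing characters to drop from the previous lemma, the length
// of the new suffix, and the suffix itself. For each parent link, the
// longest common substring of lemma and parent is found and only the
// differing affixes are stored: a flag byte records which edits apply
// (REMOVE_START/REMOVE_END drop characters from the beginning/end of the
// lemma, ADD_START/ADD_END prepend/append parent characters), followed by
// the edit lengths and the added substrings; a lemma without a parent is
// encoded as a single zero flag byte. For example, with lemma "učitel" and
// parent "učit" the longest common substring is "učit", so only REMOVE_END
// with length 2 ("el") is emitted.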