Example #1
void bilou_ner_trainer::train(ner_id id, int stages, const network_parameters& parameters, const tagger& tagger,
                              istream& features, istream& train, istream& heldout, ostream& os) {
  if (stages <= 0) runtime_failure("Cannot train NER with <= 0 stages!");
  if (stages >= 256) runtime_failure("Cannot train NER with >= 256 stages!");

  // Load training and possibly also heldout data
  entity_map entities;
  vector<labelled_sentence> train_data;
  cerr << "Loading train data: ";
  load_data(train, tagger, train_data, entities, true);
  cerr << "done, " << train_data.size() << " sentences" << endl;
  cerr << "Found " << entities.size() << " annotated entity types." << endl;

  vector<labelled_sentence> heldout_data;
  if (heldout) {
    cerr << "Loading heldout data: ";
    load_data(heldout, tagger, heldout_data, entities, false);
    cerr << "done, " << heldout_data.size() << " sentences" << endl;
  }

  // Parse feature templates
  feature_templates templates;
  unique_ptr<tokenizer> tokenizer(bilou_ner::new_tokenizer(id));
  cerr << "Parsing feature templates: ";
  templates.parse(features, entities, nlp_pipeline(tokenizer.get(), &tagger));
  cerr << "done" << endl;

  // Train required number of stages
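  // The stages are stacked: each stage's predictions are fed to the next
  // stage as previous_stage features via compute_previous_stage below.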
  vector<network_classifier> networks(stages);

  for (auto&& network : networks) {
    // Generate features
    cerr << "Generating features: ";
    vector<classifier_instance> train_instances, heldout_instances;
    generate_instances(train_data, templates, train_instances, true);
    generate_instances(heldout_data, templates, heldout_instances, false);
    cerr << "done" << endl;

    // Train the network classifier for this stage
    cerr << "Training network classifier." << endl;
    if (!network.train(templates.get_total_features(), bilou_entity::total(entities.size()), train_instances, heldout_instances, parameters, true))
      runtime_failure("Cannot train the network classifier!");

    // Use the trained classifier to compute previous_stage
    compute_previous_stage(train_data, templates, network);
    compute_previous_stage(heldout_data, templates, network);
  }

  // Encode the recognizer
  cerr << "Encoding the recognizer." << endl;
  if (!entities.save(os)) runtime_failure("Cannot save entity map!");
  if (!templates.save(os)) runtime_failure("Cannot save feature templates!");
  if (!os.put(stages)) runtime_failure("Cannot save number of stages!");
  for (auto&& network : networks)
    if (!network.save(os)) runtime_failure("Cannot save classifier network!");
}
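A minimal driver for this trainer might look as follows. This is only a sketch: the file names, the stage count, and the default-constructed network_parameters are illustrative assumptions, not part of the API shown above.

// Hypothetical driver for bilou_ner_trainer::train (all file names and
// parameter values are illustrative only).
int run_training(ner_id id, const tagger& tagger) {
  ifstream features("features.tpl"), train("train.conll"), heldout("heldout.conll");
  if (!features || !train) return 1;   // a missing heldout file is fine: train() checks the stream state

  network_parameters parameters{};     // assumed defaults; tune for the task
  ofstream model("ner.model", ofstream::binary);

  // Two stacked stages: the second stage sees the first stage's predictions.
  bilou_ner_trainer::train(id, 2, parameters, tagger, features, train, heldout, model);
  return !model;
}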
Example #2
void bilou_ner_trainer::load_data(istream& is, const tagger& tagger, vector<labelled_sentence>& data, entity_map& entity_map, bool add_entities) {
  vector<string> words, entities;
  vector<string_piece> forms;

  data.clear();

  string line;
  vector<string> tokens;
  for (bool eof; true; ) {
    eof = !getline(is, line);
    if (eof || line.empty()) {
      if (!words.empty()) {
        // Tag the sentence
        forms.clear();
        for (auto&& word : words)
          forms.emplace_back(word);
        data.emplace_back();
        auto& sentence = data.back();
        tagger.tag(forms, sentence.sentence);

        // Clear previous_stage
        sentence.sentence.clear_previous_stage();

        // Decode the entity names and ranges
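        // The input uses BIO tags; has_prev/has_next below check whether the
        // neighbouring token continues the same entity type, which selects
        // the BILOU variant (Begin/Inside/Last/Unit) of each outcome.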
        for (unsigned i = 0; i < entities.size(); i++)
          if (entities[i] == "_" || entities[i] == "O")
            sentence.outcomes.emplace_back(bilou_entity::O);
          else if (entities[i].size() >= 3 && (entities[i].compare(0, 2, "I-") == 0 || entities[i].compare(0, 2, "B-") == 0)) {
            bool has_prev = i > 0 && entities[i][0] == 'I' && entities[i-1].compare(1, string::npos, entities[i], 1, string::npos) == 0;
            bool has_next = i+1 < entities.size() && entities[i+1][0] != 'B' && entities[i+1].compare(1, string::npos, entities[i], 1, string::npos) == 0;
            entity_type entity = entity_map.parse(entities[i].c_str() + 2, add_entities);
            sentence.outcomes.emplace_back(!has_prev && !has_next ? bilou_entity::U(entity) :
                                           !has_prev && has_next ? bilou_entity::B(entity) :
                                           has_prev && has_next ? bilou_entity::I(entity) : bilou_entity::L(entity));
          }
          else
            runtime_failure("Cannot parse entity type " << entities[i] << "!");

        // Start a new sentence
        words.clear();
        entities.clear();
      }
      if (eof) break;
    } else {
      split(line, '\t', tokens);
      if (tokens.size() != 2) runtime_failure("The NER data line '" << line << "' does not contain two columns!");
      words.emplace_back(tokens[0]);
      entities.emplace_back(tokens[1]);
    }
  }
}
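load_data thus consumes a plain two-column format: one token per line as form<TAB>entity, with BIO-style entity tags ("O" or "_" for no entity, "B-TYPE"/"I-TYPE" otherwise) and a blank line terminating each sentence. The BIO tags are remapped to BILOU on the fly; the following standalone sketch reproduces that remapping on plain string tags (the function name is invented for illustration).

// Standalone sketch of the BIO -> BILOU remapping done in load_data,
// operating on string tags instead of bilou_entity ids.
vector<string> bio_to_bilou(const vector<string>& tags) {
  // Compares the entity type (the part after "B-"/"I-") of tags i and j.
  auto same_type = [&tags](size_t i, size_t j) {
    return tags[i].compare(1, string::npos, tags[j], 1, string::npos) == 0;
  };

  vector<string> bilou;
  for (size_t i = 0; i < tags.size(); i++) {
    if (tags[i] == "_" || tags[i] == "O") { bilou.push_back("O"); continue; }
    bool has_prev = i > 0 && tags[i][0] == 'I' && same_type(i, i - 1);
    bool has_next = i + 1 < tags.size() && tags[i + 1][0] != 'B' && same_type(i, i + 1);
    string type = tags[i].substr(2);
    bilou.push_back(!has_prev && !has_next ? "U-" + type :
                    !has_prev &&  has_next ? "B-" + type :
                     has_prev &&  has_next ? "I-" + type : "L-" + type);
  }
  return bilou;
}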
Example #3
void feature_templates::parse(istream& is, entity_map& entities) {
  sentence_processors.clear();
  entity_processors.clear();
  total_features = 1; // An omnipresent feature used in process_sentence

  string line;
  vector<string> tokens;
  while (getline(is, line)) {
    // Ignore empty lines and lines commented with #
    if (line.empty() || line[0] == '#') continue;

    split(line, ' ', tokens);
    if (tokens.size() < 1) runtime_failure("Bad line '" << line << "' of feature templates file!");

    vector<string> token0_parts;
    split(tokens[0], '/', token0_parts);
    if (token0_parts.size() < 1 || token0_parts.size() > 2) runtime_failure("Bad feature template description at line '" << line << "' of feature templates file!");

    string template_name = token0_parts[0];
    int window = token0_parts.size() > 1 ? parse_int(token0_parts[1].c_str(), "feature_template_window") : 0;
    vector<string> args;
    for (unsigned i = 1; i < tokens.size(); i++)
      args.emplace_back(tokens[i]);

    // Try sentence processor
    auto* maybe_sentence_processor = sentence_processor::create(template_name);
    if (maybe_sentence_processor) {
      if (!maybe_sentence_processor->parse(window, args, entities, &total_features)) runtime_failure("Cannot initialize feature template sentence processor '" << template_name << "' from line '" << line << "' of feature templates file!");
      sentence_processors.emplace_back(template_name, maybe_sentence_processor);
      continue;
    }

    // Try entity processor
    auto* maybe_entity_processor = entity_processor::create(template_name);
    if (maybe_entity_processor) {
      if (window) cerr << "Ignoring window of " << window << " specified in entity_processor '" << template_name << "'." << endl;
      if (!maybe_entity_processor->parse(args, entities)) runtime_failure("Cannot initialize feature template entity processor '" << template_name << "' from line '" << line << "' of feature templates file!");
      entity_processors.emplace_back(template_name, maybe_entity_processor);
      continue;
    }

    // Fail
    runtime_failure("Cannot create feature template '" << template_name << "' from line '" << line << "' of feature templates file!");
  }
}
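The accepted templates file is therefore line-oriented: blank lines and lines starting with # are skipped, and each remaining line names a processor, optionally with a /window suffix, followed by space-separated arguments. A hypothetical file could look like this (the processor and file names are made up; only the syntax is what parse expects):

# sentence processors applied in a context window of 2 tokens
form/2
lemma/2
# an entity processor; any /window suffix would be ignored with a warning
gazetteers gazetteers.txt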
Example #4
void derivator_dictionary_encoder::encode(istream& is, istream& dictionary, bool verbose, ostream& os) {
  // Load the morphology
  cerr << "Loading morphology: ";
  auto dictionary_start = dictionary.tellg();
  unique_ptr<morpho> morpho(morpho::load(dictionary));
  if (!morpho) runtime_failure("Cannot load morpho model from given file!");
  if (morpho->get_derivator()) runtime_failure("The given morpho model already has a derivator!");
  auto dictionary_end = dictionary.tellg();
  cerr << "done" << endl;

  // Load the derivator
  cerr << "Loading derivator data: ";

  struct lemma_info {
    string sense;
    string comment;
    string parent;
    set<string> parents;
    unsigned children;
    unsigned mark;

    lemma_info(const string& sense = string(), const string& comment = string())
        : sense(sense), comment(comment), children(0), mark(0) {}
  };
  map<string, lemma_info> derinet;

  string line;
  string part_lid, lemma_lid, lemma_comment;
  vector<string> tokens;
  vector<string> parts;
  unordered_map<string, lemma_info> matched[2];
  vector<tagged_lemma_forms> matched_lemmas_forms;
  while (getline(is, line)) {
    split(line, '\t', tokens);
    if (tokens.size() != 2) runtime_failure("Expected two tab separated columns on derivator line '" << line << "'!");

    // Generate all possible lemmas and parents
    for (int i = 0; i < 2; i++) {
      split(tokens[i], ' ', parts);
      if (parts.size() > 2) runtime_failure("The derivator lemma description '" << tokens[i] << "' contains two or more spaces!");
      bool is_lemma_id = parts.size() == 1;

      part_lid.assign(parts[0], 0, morpho->lemma_id_len(parts[0]));
      morpho->generate(parts[0], is_lemma_id ? nullptr : parts[1].c_str(), morpho::NO_GUESSER, matched_lemmas_forms);

      matched[i].clear();
      for (auto&& lemma_forms : matched_lemmas_forms) {
        lemma_lid.assign(lemma_forms.lemma, 0, morpho->lemma_id_len(lemma_forms.lemma));

        if (!is_lemma_id || part_lid == lemma_lid) {
          // Choose only the shortest lemma comment for the lemma id of lemma_form.lemma
          lemma_comment.assign(lemma_forms.lemma, lemma_lid.size(), string::npos);
          auto it = matched[i].emplace(lemma_lid, lemma_info(lemma_lid.substr(morpho->raw_lemma_len(lemma_lid)), lemma_comment));
          if (!it.second &&
              (lemma_comment.size() < it.first->second.comment.size() ||
               (lemma_comment.size() == it.first->second.comment.size() && lemma_comment < it.first->second.comment)))
            it.first->second.comment.assign(lemma_comment);
        }
      }
    }
    if (matched[0].empty() || matched[1].empty()) {
      if (verbose)
        cerr << "Could not match a lemma from line '" << line << "', skipping." << endl;
      continue;
    }

    // Store the possible parents
    derinet.insert(matched[0].begin(), matched[0].end());
    derinet.insert(matched[1].begin(), matched[1].end());
    for (auto&& lemma : matched[0])
      for (auto&& parent : matched[1])
        derinet[lemma.first].parents.insert(parent.first);
  }
  cerr << "done" << endl;

  // Choose unique parent for every lemma
  for (auto&& lemma : derinet)
    if (!lemma.second.parents.empty()) {
      // Try locating lexicographically smallest parent with the same sense
      for (auto&& parent : lemma.second.parents)
        if (derinet[parent].sense == lemma.second.sense) {
          lemma.second.parent.assign(parent);
          break;
        }

      // Otherwise, choose the lexicographically smallest parent
      if (lemma.second.parent.empty())
          lemma.second.parent.assign(*lemma.second.parents.begin());

      // Add this edge also to the parent
      derinet[lemma.second.parent].children++;

      if (verbose)
        cerr << lemma.first << lemma.second.comment << " -> " << lemma.second.parent << derinet[lemma.second.parent].comment << endl;
    }

  // Make sure the derinet contains no cycles
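  // Every lemma starts a walk towards the root under a fresh mark; reaching
  // the current mark again means a cycle, while an older mark means the rest
  // of the path was already checked.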
  unsigned mark = 0;
  for (auto&& lemma : derinet) {
    lemma.second.mark = ++mark;
    for (auto node = derinet.find(lemma.first); !node->second.parent.empty(); ) {
      node = derinet.find(node->second.parent);
      if (node->second.mark) {
        if (node->second.mark == mark)
          runtime_failure("The derivator data contains a cycle with lemma '" << lemma.first << "'!");
        break;
      }
      node->second.mark = mark;
    }
  }

  // Encode the derivator
  cerr << "Encoding derivator: ";
  os.put(morpho_ids::DERIVATOR_DICTIONARY);

  binary_encoder enc;

  vector<int> lengths;
  for (auto&& lemma : derinet) {
    if (lemma.first.size() >= lengths.size())
      lengths.resize(lemma.first.size() + 1);
    lengths[lemma.first.size()]++;
  }
  enc.add_1B(lengths.size());
  for (auto&& length : lengths)
    enc.add_4B(length);

  enc.add_4B(derinet.size());
  string prev = "";
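  // The lemmas are front-coded: derinet is sorted, so each entry only stores
  // how many characters to drop from the end of the previous lemma and the
  // new suffix to append after their common prefix.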
  for (auto&& lemma : derinet) {
    int cpl = 0;
    while (prev[cpl] && prev[cpl] == lemma.first[cpl]) cpl++;
    enc.add_1B(prev.size() - cpl);
    enc.add_1B(lemma.first.size() - cpl);
    enc.add_data(lemma.first.c_str() + cpl);

    enc.add_1B(lemma.second.comment.size());
    enc.add_data(lemma.second.comment);

    enc.add_2B(lemma.second.children);

    if (lemma.second.parent.empty()) {
      enc.add_1B(0);
    } else {
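      // The parent is stored as a diff against the lemma: find their longest
      // common substring, then record which of the four edits apply (remove
      // lemma prefix/suffix, add parent prefix/suffix) plus the edit data.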
      unsigned best_lemma_from = 0, best_parent_from = 0, best_len = 0;
      for (unsigned lemma_from = 0; lemma_from < lemma.first.size(); lemma_from++)
        for (unsigned parent_from = 0; parent_from < lemma.second.parent.size(); parent_from++) {
          unsigned len = 0;
          while (lemma_from + len < lemma.first.size() &&
                 parent_from + len < lemma.second.parent.size() &&
                 lemma.first[lemma_from+len] == lemma.second.parent[parent_from+len])
            len++;
          if (len > best_len) best_lemma_from = lemma_from, best_parent_from = parent_from, best_len = len;
        }

      enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 };
      enc.add_1B(REMOVE_START * (best_lemma_from>0) + REMOVE_END * (best_lemma_from+best_len<lemma.first.size()) +
             ADD_START * (best_parent_from>0) + ADD_END * (best_parent_from+best_len<lemma.second.parent.size()));
      if (best_lemma_from > 0) enc.add_1B(best_lemma_from);
      if (best_lemma_from + best_len < lemma.first.size()) enc.add_1B(lemma.first.size() - best_lemma_from - best_len);
      if (best_parent_from > 0) {
        enc.add_1B(best_parent_from);
        enc.add_data(string_piece(lemma.second.parent.c_str(), best_parent_from));
      }
      if (best_parent_from + best_len < lemma.second.parent.size()) {
        enc.add_1B(lemma.second.parent.size() - best_parent_from - best_len);
        enc.add_data(lemma.second.parent.c_str() + best_parent_from + best_len);
      }
    }

    prev.assign(lemma.first);
  }
  compressor::save(os, enc);

  // Append the morphology after the derivator dictionary model
  if (!dictionary.seekg(dictionary_start, dictionary.beg)) runtime_failure("Cannot seek in the morpho model!");
  for (auto length = dictionary_end - dictionary_start; length; length--)
    os.put(dictionary.get());

  cerr << "done" << endl;
}
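The lemma-to-parent diff computed inside encode can be isolated as follows; the struct and function names are invented, but the search and the flag byte mirror the REMOVE_START/REMOVE_END/ADD_START/ADD_END logic above.

// Standalone sketch of the parent diff used by the encoder (invented names).
struct parent_diff {
  unsigned lemma_from = 0, parent_from = 0, len = 0;
  unsigned char flags = 0;  // same bits as the enum in encode()
};

parent_diff diff_parent(const string& lemma, const string& parent) {
  parent_diff d;
  // Quadratic longest-common-substring search; fine for word-sized strings.
  for (unsigned i = 0; i < lemma.size(); i++)
    for (unsigned j = 0; j < parent.size(); j++) {
      unsigned len = 0;
      while (i + len < lemma.size() && j + len < parent.size() &&
             lemma[i + len] == parent[j + len])
        len++;
      if (len > d.len) d.lemma_from = i, d.parent_from = j, d.len = len;
    }
  d.flags = 1 * (d.lemma_from > 0)                        // REMOVE_START
          + 2 * (d.lemma_from + d.len < lemma.size())     // REMOVE_END
          + 4 * (d.parent_from > 0)                       // ADD_START
          + 8 * (d.parent_from + d.len < parent.size());  // ADD_END
  return d;
}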