Пример #1
0
// Reads a textual prefix guesser description from `is` and writes its binary
// form to `enc`.
//
// Input: lines with exactly two tab-separated columns, a prefix and a tag
// filter. Lines before the first empty line are stored as "initial" prefixes,
// lines after it as "middle" prefixes. Any other line which does not have
// two columns (including a further empty line) is reported through
// training_failure.
//
// Output: the number of distinct tag filters (1B), every filter as its length
// (1B) followed by its bytes, and then the initial and the middle prefix maps,
// whose values are 4B bit masks of the filters seen with the given prefix.
void morpho_prefix_guesser_encoder::encode(istream& is, binary_encoder& enc) {
  vector<string> filters;
  unordered_map<string, uint32_t> filters_map;
  unordered_map<string, uint32_t> prefixes_initial;
  unordered_map<string, uint32_t> prefixes_middle;
  auto* prefixes_current = &prefixes_initial;

  // Load prefix guesser
  string line;
  vector<string> tokens;
  while (getline(is, line)) {
    // The first empty line separates initial prefixes from middle prefixes.
    if (line.empty() && prefixes_current == &prefixes_initial) {
      prefixes_current = &prefixes_middle;
      continue;
    }
    split(line, '\t', tokens);
    if (tokens.size() != 2) training_failure("Line " << line << " in prefix guesser prefixes file does not contain two columns!");

    // Every distinct tag filter owns one bit of a 32-bit mask. The limit is
    // checked before shifting, because shifting by 32 or more is undefined
    // behaviour and cannot be detected reliably from the result.
    auto it = filters_map.find(tokens[1]);
    if (it == filters_map.end()) {
      if (filters.size() >= 32) training_failure("Too much different tag filters in the prefix guesser when adding tag filter '" << tokens[1] << "'!");
      it = filters_map.emplace(tokens[1], uint32_t(1) << filters.size()).first;
      filters.emplace_back(tokens[1]);
    }

    (*prefixes_current)[tokens[0]] |= it->second;
  }

  // Encode prefix guesser
  enc.add_1B(filters.size());
  for (auto&& filter : filters) {
    enc.add_1B(filter.size());
    enc.add_data(filter);
  }

  persistent_unordered_map(prefixes_initial, 5, true, false, [](binary_encoder& enc, uint32_t mask) { enc.add_4B(mask); }).save(enc);
  persistent_unordered_map(prefixes_middle, 5, true, false, [](binary_encoder& enc, uint32_t mask) { enc.add_4B(mask); }).save(enc);
}
// Reads a textual statistical guesser description from `f` and writes its
// binary form to `enc`.
//
// Input:
//   - the first line contains the default tag;
//   - every following line consists of an odd number (at least three) of
//     tab-separated columns. The first column holds two space-separated
//     affixes (called prefix_suffix in the error messages); the remaining
//     columns come in pairs: a replacement rule with four space-separated
//     fields, and a space-separated list of tags.
//
// Output: the number of distinct tags (2B), every tag as its length (1B)
// followed by its bytes, the index of the default tag (2B), and a map from
// "<reversed second affix> <first affix>" to the rules of that affix pair.
// Every map value is encoded as its byte length (2B) followed by: the rule
// count (1B) and, for each rule, four length-prefixed (1B) strings, the tag
// count (1B) and the tag indices (2B each).
//
// Malformed input is reported through runtime_errorf.
void morpho_statistical_guesser_encoder::encode(FILE* f, binary_encoder& enc) {
  unordered_map<string, vector<pair<vector<string>, vector<int>>>> statistical_guesser;
  vector<string> tags;
  unordered_map<string, int> tags_map;

  // Load statistical guesser
  string line;
  vector<string> tokens;
  if (!getline(f, line)) runtime_errorf("Missing first line with default tag in statistical guesser file");
  // A tag gets the next free index on first use; emplace returns the existing
  // index otherwise, so an index equal to tags.size() marks a new tag.
  int statistical_guesser_default = tags_map.emplace(line.data(), tags.size()).first->second;
  if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data());

  while (getline(f, line)) {
    split(line, '\t', tokens);
    if (tokens.size() < 3 || (tokens.size() % 2) != 1) runtime_errorf("Cannot parse line %s in statistical guesser file!", line.c_str());

    vector<string> affixes;
    split(tokens[0], ' ', affixes);
    if (affixes.size() != 2) runtime_errorf("Cannot parse prefix_suffix '%s' in statistical guesser file!", tokens[0].c_str());
    // The second affix is stored reversed (byte-wise) and placed first in the key.
    reverse(affixes[1].begin(), affixes[1].end());

    auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]];
    // Columns 1, 3, 5, ... are replacement rules, each followed by its tags.
    // The column count is odd, so tokens[i+1] always exists.
    for (unsigned i = 1; i < tokens.size(); i+= 2) {
      vector<string> replacements;
      split(tokens[i], ' ', replacements);
      if (replacements.size() != 4) runtime_errorf("Cannot parse replacement rule '%s' in statistical guesser file!", tokens[i].c_str());

      vector<string> rule_tags;
      split(tokens[i+1], ' ', rule_tags);
      vector<int> decoded_tags;
      for (auto&& rule_tag : rule_tags) {
        int tag = tags_map.emplace(rule_tag, tags.size()).first->second;
        if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag);
        decoded_tags.emplace_back(tag);
      }

      rules.emplace_back(replacements, decoded_tags);
    }
  }

  // Encode statistical guesser
  enc.add_2B(tags.size());
  for (auto&& tag : tags) {
    enc.add_1B(tag.size());
    enc.add_str(tag);
  }
  enc.add_2B(statistical_guesser_default);

  // The rules are taken by const reference, so that encoding a map entry does
  // not deep-copy all its rule strings and tag vectors.
  persistent_unordered_map(statistical_guesser, 5, true, false, [](binary_encoder& enc, const vector<pair<vector<string>, vector<int>>>& rules) {
    // The value is serialized into a temporary encoder first, because its
    // total byte length has to be written before the value itself.
    binary_encoder e;
    e.add_1B(rules.size());
    for (auto&& rule : rules) {
      if (rule.first.size() != 4) runtime_errorf("Replacement rule not of size 4 in statistical guesser!");
      for (auto&& affix : rule.first) {
        e.add_1B(affix.size());
        e.add_str(affix);
      }
      e.add_1B(rule.second.size());
      for (auto&& tag : rule.second)
        e.add_2B(tag);
    }
    enc.add_2B(e.data.size());
    enc.add_data(e.data);
  }).save(enc);
}