// Serialize this processor: the context window size followed by the feature
// map (bucket count, element count, then the entries themselves).
void sentence_processor::save(binary_encoder& enc) {
  enc.add_4B(window);
  enc.add_4B(map.bucket_count());
  enc.add_4B(map.size());

  // Copy the unordered map into a vector and sort it so the serialized
  // output is deterministic regardless of hash iteration order.
  vector<pair<string, ner_feature>> entries(map.begin(), map.end());
  sort(entries.begin(), entries.end());

  for (auto&& entry : entries) {
    enc.add_str(entry.first);
    enc.add_4B(entry.second);
  }
}
// Read a textual prefix guesser description from `is` and serialize it into `enc`.
//
// Input format: two blank-line-separated sections (word-initial prefixes, then
// word-middle prefixes); each non-empty line is "prefix\ttag_filter". Each
// distinct tag filter gets one bit in a 32-bit mask, so at most 32 distinct
// filters are supported.
void morpho_prefix_guesser_encoder::encode(istream& is, binary_encoder& enc) {
  vector<string> filters;
  unordered_map<string, uint32_t> filters_map;
  unordered_map<string, uint32_t> prefixes_initial;
  unordered_map<string, uint32_t> prefixes_middle;
  auto* prefixes_current = &prefixes_initial;

  // Load prefix guesser
  string line;
  vector<string> tokens;
  while (getline(is, line)) {
    // The first empty line switches from the initial-prefix section to the
    // middle-prefix section.
    if (line.empty() && prefixes_current == &prefixes_initial) {
      prefixes_current = &prefixes_middle;
      continue;
    }
    split(line, '\t', tokens);
    if (tokens.size() != 2)
      training_failure("Line " << line << " in prefix guesser prefixes file does not contain two columns!");

    // Assign each distinct tag filter a unique one-bit mask. Compute the mask
    // as uint32_t and map the 33rd-and-later filters to 0 explicitly: the
    // original `1<<filters.size()` was UB at size() >= 31 (signed overflow,
    // then shift >= width) and on common targets silently wrapped to bit 0,
    // so the "too many filters" check below could never fire.
    auto it = filters_map.emplace(tokens[1], filters.size() < 32 ? uint32_t(1) << filters.size() : uint32_t(0));
    if (it.second) filters.emplace_back(tokens[1]);
    auto filter = it.first->second;
    if (!filter)
      training_failure("Too much different tag filters in the prefix guesser when adding tag filter '" << tokens[1] << "'!");

    // A prefix may carry several filters; OR their bits together.
    (*prefixes_current)[tokens[0]] |= filter;
  }

  // Encode prefix guesser: filter list first, then the two prefix maps.
  enc.add_1B(filters.size());
  for (auto&& filter : filters) {
    enc.add_1B(filter.size());
    enc.add_data(filter);
  }
  persistent_unordered_map(prefixes_initial, 5, true, false, [](binary_encoder& enc, uint32_t mask) {
    enc.add_4B(mask);
  }).save(enc);
  persistent_unordered_map(prefixes_middle, 5, true, false, [](binary_encoder& enc, uint32_t mask) {
    enc.add_4B(mask);
  }).save(enc);
}
void morpho_statistical_guesser_encoder::encode(FILE* f, binary_encoder& enc) { unordered_map<string, vector<pair<vector<string>, vector<int>>>> statistical_guesser; vector<string> tags; unordered_map<string, int> tags_map; // Load statistical guesser string line; vector<string> tokens; if (!getline(f, line)) runtime_errorf("Missing first line with default tag in statistical guesser file"); int statistical_guesser_default = tags_map.emplace(line.data(), tags.size()).first->second; if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data()); while (getline(f, line)) { split(line, '\t', tokens); if (tokens.size() < 3 || (tokens.size() % 2) != 1) runtime_errorf("Cannot parse line %s in statistical guesser file!", line.c_str()); vector<string> affixes; split(tokens[0], ' ', affixes); if (affixes.size() != 2) runtime_errorf("Cannot parse prefix_suffix '%s' in statistical guesser file!", tokens[0].c_str()); reverse(affixes[1].begin(), affixes[1].end()); auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]]; for (unsigned i = 1; i < tokens.size(); i+= 2) { vector<string> replacements; split(tokens[i], ' ', replacements); if (replacements.size() != 4) runtime_errorf("Cannot parse replacement rule '%s' in statistical guesser file!", tokens[i].c_str()); vector<string> rule_tags; split(tokens[i+1], ' ', rule_tags); vector<int> decoded_tags; for (auto&& rule_tag : rule_tags) { int tag = tags_map.emplace(rule_tag, tags.size()).first->second; if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag); decoded_tags.emplace_back(tag); } rules.emplace_back(replacements, decoded_tags); } } // Encode statistical guesser enc.add_2B(tags.size()); for (auto&& tag : tags) { enc.add_1B(tag.size()); enc.add_str(tag); } enc.add_2B(statistical_guesser_default); persistent_unordered_map(statistical_guesser, 5, true, false, [](binary_encoder& enc, vector<pair<vector<string>, vector<int>>> rules) { binary_encoder e; e.add_1B(rules.size()); for (auto&& 
rule : rules) { if (rule.first.size() != 4) runtime_errorf("Replacement rule not of size 4 in statistical guesser!"); for (auto&& affix : rule.first) { e.add_1B(affix.size()); e.add_str(affix); } e.add_1B(rule.second.size()); for (auto&& tag : rule.second) e.add_2B(tag); } enc.add_2B(e.data.size()); enc.add_data(e.data); }).save(enc); }