void tag_xml(istream& is, ostream& os, const tagger& tagger, tokenizer& tokenizer, const tagset_converter& tagset_converter, const derivation_formatter& derivation, morpho::guesser_mode guesser) { string para; vector<string_piece> forms; vector<tagged_lemma> tags; while (getpara(is, para)) { // Tokenize and tag tokenizer.set_text(para); const char* unprinted = para.c_str(); while (tokenizer.next_sentence(&forms, nullptr)) { tagger.tag(forms, tags, guesser); for (unsigned i = 0; i < forms.size(); i++) { tagset_converter.convert(tags[i]); derivation.format_derivation(tags[i].lemma); os << xml_encoded(string_piece(unprinted, forms[i].str - unprinted)); if (!i) os << "<sentence>"; os << "<token lemma=\"" << xml_encoded(tags[i].lemma, true) << "\" tag=\"" << xml_encoded(tags[i].tag, true) << "\">" << xml_encoded(forms[i]) << "</token>"; if (i + 1 == forms.size()) os << "</sentence>"; unprinted = forms[i].str + forms[i].len; } } os << xml_encoded(string_piece(unprinted, para.c_str() + para.size() - unprinted)) << flush; } }
// Recognize named entities and print them in vertical format: one entity per
// line as "token_ids<TAB>type<TAB>entity text", where token ids are 1-based
// and keep counting across sentences of the whole input.
void recognize_vertical(istream& is, ostream& os, const ner& recognizer, tokenizer& tokenizer) {
  string paragraph;
  vector<string_piece> tokens;
  vector<named_entity> found;
  unsigned token_offset = 0;  // number of ids already consumed by previous sentences
  string ids, text;

  while (getpara(is, paragraph)) {
    // Process the paragraph one sentence at a time.
    tokenizer.set_text(paragraph);
    while (tokenizer.next_sentence(&tokens, nullptr)) {
      recognizer.recognize(tokens, found);
      sort_entities(found);

      for (auto&& entity : found) {
        ids.clear();
        text.clear();
        // Build the comma-separated id list and the space-joined surface text.
        for (auto token = entity.start; token < entity.start + entity.length; token++) {
          if (token != entity.start) {
            ids += ',';
            text += ' ';
          }
          ids += to_string(token_offset + token + 1);
          text.append(tokens[token].str, tokens[token].len);
        }
        os << ids << '\t' << entity.type << '\t' << text << '\n';
      }
      os << flush;
      // Advance the global numbering; the extra one presumably accounts for
      // the sentence separator in the vertical numbering scheme.
      token_offset += tokens.size() + 1;
    }
  }
}
// Tokenize the input and print it vertically: one token per line, with an
// empty line after every sentence. Output is flushed per sentence.
void tokenize_vertical(istream& is, ostream& os, tokenizer& tokenizer) {
  string paragraph;
  vector<string_piece> tokens;

  while (getpara(is, paragraph)) {
    tokenizer.set_text(paragraph);
    while (tokenizer.next_sentence(&tokens, nullptr)) {
      for (size_t token = 0; token < tokens.size(); token++)
        os << tokens[token] << '\n';
      // Blank line terminates the sentence.
      os << '\n' << flush;
    }
  }
}
void tag_vertical(istream& is, ostream& os, const tagger& tagger, tokenizer& tokenizer, const tagset_converter& tagset_converter, const derivation_formatter& derivation, morpho::guesser_mode guesser) { string para; vector<string_piece> forms; vector<tagged_lemma> tags; while (getpara(is, para)) { // Tokenize and tag tokenizer.set_text(para); while (tokenizer.next_sentence(&forms, nullptr)) { tagger.tag(forms, tags, guesser); for (unsigned i = 0; i < tags.size(); i++) { tagset_converter.convert(tags[i]); derivation.format_derivation(tags[i].lemma); os << forms[i] << '\t' << tags[i].lemma << '\t' << tags[i].tag << '\n'; } os << endl; } } }
// Tokenize the input and emit it as inline XML: sentences wrapped in
// <sentence>, tokens in <token>, with all original inter-token text echoed
// xml-encoded, so the input can be reconstructed from the output.
static void tokenize_xml(istream& is, ostream& os, tokenizer& tokenizer) {
  string paragraph;
  vector<string_piece> tokens;

  while (getpara(is, paragraph)) {
    tokenizer.set_text(paragraph);
    // Everything before this pointer has already been written out.
    const char* printed_to = paragraph.c_str();
    while (tokenizer.next_sentence(&tokens, nullptr)) {
      for (unsigned token = 0; token < tokens.size(); token++) {
        // Echo the raw text between the previous token and this one.
        if (printed_to < tokens[token].str)
          os << xml_encoded(string_piece(printed_to, tokens[token].str - printed_to));
        if (token == 0) os << "<sentence>";
        os << "<token>" << xml_encoded(tokens[token]) << "</token>";
        if (token + 1 == tokens.size()) os << "</sentence>";
        printed_to = tokens[token].str + tokens[token].len;
      }
    }
    // Echo the trailing text after the last token of the paragraph.
    const char* paragraph_end = paragraph.c_str() + paragraph.size();
    if (printed_to < paragraph_end)
      os << xml_encoded(string_piece(printed_to, paragraph_end - printed_to));
    os << flush;
  }
}
// Recognize named entities in raw text and emit inline XML: sentences wrapped
// in <sentence>, tokens in <token>, and every entity in a (possibly nested)
// <ne type="..."> element, with all original inter-token text echoed
// xml-encoded so the input is reconstructible from the output.
void recognize_untokenized(istream& is, ostream& os, const ner& recognizer, tokenizer& tokenizer) {
  string para;
  vector<string_piece> forms;
  vector<named_entity> entities;
  // Stack of indices of the last token of each currently open <ne> element.
  // NOTE(review): correct nesting relies on sort_entities() ordering entities
  // so that an outer entity precedes the inner ones it contains — confirm
  // against sort_entities' contract.
  vector<size_t> entity_ends;
  while (getpara(is, para)) {
    // Tokenize the text and find named entities
    tokenizer.set_text(para);
    // Everything before this pointer has already been written out.
    const char* unprinted = para.c_str();
    while (tokenizer.next_sentence(&forms, nullptr)) {
      recognizer.recognize(forms, entities);
      sort_entities(entities);
      // i walks tokens; e walks the sorted entities in lockstep.
      for (unsigned i = 0, e = 0; i < forms.size(); i++) {
        // Echo the raw text between the previous token and this one.
        if (unprinted < forms[i].str) os << xml_encoded(string_piece(unprinted, forms[i].str - unprinted));
        if (i == 0) os << "<sentence>";
        // Open entities starting at current token
        for (; e < entities.size() && entities[e].start == i; e++) {
          os << "<ne type=\"" << xml_encoded(entities[e].type, true) << "\">";
          // Remember at which token this entity must be closed.
          entity_ends.push_back(entities[e].start + entities[e].length - 1);
        }
        // The token itself
        os << "<token>" << xml_encoded(forms[i]) << "</token>";
        // Close entities ending after current token
        while (!entity_ends.empty() && entity_ends.back() == i) {
          os << "</ne>";
          entity_ends.pop_back();
        }
        if (i + 1 == forms.size()) os << "</sentence>";
        unprinted = forms[i].str + forms[i].len;
      }
    }
    // Write rest of the text (should be just spaces)
    if (unprinted < para.c_str() + para.size()) os << xml_encoded(string_piece(unprinted, para.c_str() + para.size() - unprinted));
    os << flush;
  }
}
// Recognize named entities and print them in a CoNLL-like format: one
// "form<TAB>label" line per token, where the label is B-TYPE for the first
// token of an entity, I-TYPE for its continuation, and '_' outside entities.
// Only the outermost entity of a nested group is emitted: while a token is
// inside an entity (in_entity > 0), entities starting there are skipped.
void recognize_conll(istream& is, ostream& os, const ner& recognizer, tokenizer& tokenizer) {
  string para;
  vector<string_piece> forms;
  vector<named_entity> entities;

  while (getpara(is, para)) {
    // Tokenize and recognize entities sentence by sentence.
    tokenizer.set_text(para);
    while (tokenizer.next_sentence(&forms, nullptr)) {
      recognizer.recognize(forms, entities);
      sort_entities(entities);

      string entity_type;
      unsigned in_entity = 0;     // tokens remaining in the currently open entity
      bool entity_start = false;  // fixed: was uninitialized in the original
      for (unsigned i = 0, e = 0; i < forms.size(); i++) {
        // Open a new entity only when not already inside one (outermost wins).
        if (!in_entity && e < entities.size() && entities[e].start == i) {
          in_entity = entities[e].length;
          entity_start = true;
          entity_type = entities[e].type;
          e++;
        }
        os << forms[i] << '\t';
        if (in_entity) {
          os << (entity_start ? "B-" : "I-") << entity_type;
          entity_start = false;
          in_entity--;
        } else {
          os << '_';
        }
        os << '\n';
      }
      // Blank line terminates the sentence.
      os << '\n' << flush;
    }
  }
}