void depth_featurizer::tree_tokenize(corpus::document& doc, const parser::parse_tree& tree) const
{
    // Compute the height of the parse tree with a visitor and record it
    // as a single categorical feature of the form "depth-<height>".
    height_visitor visitor;
    const auto height = tree.visit(visitor);
    doc.increment("depth-" + std::to_string(height), 1);
}
void depth_analyzer::tree_tokenize(corpus::document& doc, const parse_tree& tree)
{
    // The feature id for this tree is simply its height, stringified;
    // bump that feature's count on the document by one.
    const auto tree_height = parse_tree::height(tree);
    doc.increment(std::to_string(tree_height), 1);
}
void ngram_pos_analyzer::tokenize(corpus::document& doc)
{
    // Tokenize the document, split the token stream into sentences,
    // POS-tag each sentence with the CRF, and count n-grams of tags.

    // first, get tokens
    stream_->set_content(get_content(doc));
    std::vector<sequence::sequence> sentences;
    sequence::sequence seq;

    // put tokens into sequences, excluding sentence markers
    while (*stream_)
    {
        auto next = stream_->next();
        if (next.empty() || next == " " || next == "<s>")
            continue;

        if (next == "</s>")
        {
            sentences.emplace_back(std::move(seq));
            // FIX: re-initialize instead of reusing the moved-from
            // sequence; a moved-from object is only guaranteed to be in
            // a valid-but-unspecified state, so adding observations to
            // it for the next sentence was relying on unspecified
            // behavior.
            seq = sequence::sequence{};
        }
        else
        {
            // tag is unknown until the CRF tags the sentence below
            seq.add_observation(
                {sequence::symbol_t{next}, sequence::tag_t{"[unknown]"}});
        }
    }

    auto tagger = crf_->make_tagger();
    for (auto& sent : sentences)
    {
        // generate CRF features
        seq_analyzer_.analyze(sent);

        // POS-tag sentence
        tagger.tag(sent);

        // create n-grams of tags: walk each window of n_value() labels
        // ending at position i and join them oldest-first with '_'
        for (size_t i = n_value() - 1; i < sent.size(); ++i)
        {
            std::string combined = seq_analyzer_.tag(sent[i].label());
            for (size_t j = 1; j < n_value(); ++j)
            {
                std::string prev = seq_analyzer_.tag(sent[i - j].label());
                combined = prev + "_" + combined;
            }
            doc.increment(combined, 1);
        }
    }
}