Esempio n. 1
0
/**
 * Records a single feature describing the given parse tree in the document.
 *
 * A height_visitor is applied to the tree; the value it yields is converted
 * to text, prefixed with "depth-", and the count for that feature name is
 * incremented by one.
 *
 * @param doc The document whose feature counts are updated
 * @param tree The parse tree the visitor is run on
 */
void depth_featurizer::tree_tokenize(corpus::document& doc,
                                     const parser::parse_tree& tree) const
{
    height_visitor visitor;
    const auto visited = tree.visit(visitor);

    // Build the feature name in two steps: fixed prefix, then the value.
    std::string feature{"depth-"};
    feature += std::to_string(visited);

    doc.increment(feature, 1);
}
Esempio n. 2
0
/**
 * Records the height of the given parse tree as a feature of the document.
 *
 * The feature name is the decimal text of parse_tree::height(tree) with no
 * prefix; its count in the document is incremented by one.
 *
 * @param doc The document whose feature counts are updated
 * @param tree The parse tree whose height is measured
 */
void depth_analyzer::tree_tokenize(corpus::document& doc,
                                   const parse_tree& tree)
{
    // Convert through size_t first so the unsigned overload of
    // std::to_string is the one that formats the value.
    const size_t tree_height = parse_tree::height(tree);
    std::string feature{std::to_string(tree_height)};

    doc.increment(feature, 1);
}
Esempio n. 3
0
/**
 * Counts n-grams of tags over each sentence of the document.
 *
 * The document's content is run through the token stream and split into
 * one sequence per sentence. Each sentence is passed to the sequence
 * analyzer and the CRF tagger, and then every window of n_value()
 * consecutive labels is joined with "_" (in sentence order) and counted
 * once in the document.
 *
 * @param doc The document to read content from and record features in
 */
void ngram_pos_analyzer::tokenize(corpus::document& doc)
{
    // first, get tokens
    stream_->set_content(get_content(doc));
    std::vector<sequence::sequence> sentences;
    sequence::sequence current;

    // Put tokens into sequences, excluding sentence markers. Empty tokens,
    // single-space tokens and "<s>" are skipped; "</s>" closes the sentence
    // being built. Tokens that follow the final "</s>" are never added to
    // `sentences`.
    while (*stream_)
    {
        auto token = stream_->next();
        if (token.empty() || token == " " || token == "<s>")
            continue;

        if (token == "</s>")
        {
            sentences.emplace_back(std::move(current));
            // A moved-from object is only guaranteed to be valid, not
            // empty, so start the next sentence from a fresh sequence
            // instead of relying on the moved-from state.
            current = sequence::sequence{};
        }
        else
        {
            // Every observation starts with a placeholder tag; the tagger
            // below assigns the real one.
            current.add_observation(
                {sequence::symbol_t{token}, sequence::tag_t{"[unknown]"}});
        }
    }

    auto tagger = crf_->make_tagger();
    for (auto& sentence : sentences)
    {
        // generate CRF features
        seq_analyzer_.analyze(sentence);

        // POS-tag sentence
        tagger.tag(sentence);

        // Create ngrams. Starting at index n_value() - 1 ensures every
        // position visited has enough predecessors for a full window, so
        // sentences shorter than that contribute nothing.
        for (size_t i = n_value() - 1; i < sentence.size(); ++i)
        {
            std::string combined = seq_analyzer_.tag(sentence[i].label());

            // Walk backwards from position i, prepending each earlier tag
            // so the final string reads in sentence order.
            for (size_t j = 1; j < n_value(); ++j)
            {
                std::string earlier
                    = seq_analyzer_.tag(sentence[i - j].label());
                combined = earlier + "_" + combined;
            }

            doc.increment(combined, 1);
        }
    }
}