Example #1
std::string get_content(const corpus::document& doc)
{
    if (!doc.contains_content())
        throw analyzer_exception{
            "document content was not populated for analysis"};

    return utf::to_utf8(doc.content(), doc.encoding());
}
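A minimal sketch of a call site for this helper, assuming analyzer_exception derives from std::exception; the surrounding function is illustrative, not part of the library.

#include <iostream>

void analyze_safely(const corpus::document& doc)
{
    try
    {
        auto text = get_content(doc); // UTF-8 text, or throws if unpopulated
        // ... hand `text` to a token stream here ...
    }
    catch (const analyzer_exception& ex)
    {
        std::cerr << "skipping document: " << ex.what() << '\n';
    }
}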
Example #2
void depth_featurizer::tree_tokenize(corpus::document& doc,
                                     const parser::parse_tree& tree) const
{
    height_visitor vtor;
    auto rep = "depth-" + std::to_string(tree.visit(vtor));
    doc.increment(rep, 1);
}
Example #3
void depth_analyzer::tree_tokenize(corpus::document& doc,
                                   const parse_tree& tree)
{
    size_t h = parse_tree::height(tree);
    std::string representation = std::to_string(h);
    doc.increment(representation, 1);
}
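Both variants above reduce a parse tree to its height and emit that number as a feature. A self-contained sketch of the underlying height computation on a toy tree type; MeTA's parse_tree and height_visitor are not reproduced here.

#include <algorithm>
#include <cstddef>
#include <memory>
#include <vector>

// toy tree node, standing in for a parse-tree node
struct node
{
    std::vector<std::unique_ptr<node>> children;
};

// a leaf has height 0; otherwise the height is 1 + the tallest child
std::size_t height(const node& n)
{
    std::size_t h = 0;
    for (const auto& child : n.children)
        h = std::max(h, 1 + height(*child));
    return h;
}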
Example #4
template <class Analyzer>
void check_analyzer_expected(Analyzer& ana, corpus::document doc,
                             uint64_t num_unique, uint64_t length)
{
    ana.tokenize(doc);
    ASSERT_EQUAL(doc.counts().size(), num_unique);
    ASSERT_EQUAL(doc.length(), length);
    ASSERT_EQUAL(doc.id(), 47ul);
    if (doc.contains_content())
    {
        ASSERT_EQUAL(doc.path(), "/home/person/filename.txt");
        ASSERT_EQUAL(doc.name(), "filename.txt");
    }
    else
    {
        ASSERT_EQUAL(doc.path(), "../data/sample-document.txt");
        ASSERT_EQUAL(doc.name(), "sample-document.txt");
    }
}
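A hypothetical call site for the helper above; the corpus::document constructor and content() setter shown are assumptions about the surrounding API, chosen so the id, path, and name assertions in the content branch hold.

#include <cstdint>

template <class Analyzer>
void run_case(Analyzer& ana, uint64_t expected_unique, uint64_t expected_length)
{
    // populated content exercises the first assertion branch above
    corpus::document doc{"/home/person/filename.txt", doc_id{47}};
    doc.content("some text to tokenize");
    check_analyzer_expected(ana, doc, expected_unique, expected_length);
}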
Example #5
void ngram_pos_analyzer::tokenize(corpus::document& doc)
{
    // first, get tokens
    stream_->set_content(get_content(doc));
    std::vector<sequence::sequence> sentences;
    sequence::sequence seq;

    // put tokens into sequences, excluding sentence markers
    while (*stream_)
    {
        auto next = stream_->next();
        if (next.empty() || next == " " || next == "<s>")
            continue;

        if (next == "</s>")
            // end of sentence: hand off seq (the moved-from sequence is
            // assumed to be left empty, ready to collect the next sentence)
            sentences.emplace_back(std::move(seq));
        else
            seq.add_observation(
                {sequence::symbol_t{next}, sequence::tag_t{"[unknown]"}});
    }

    auto tagger = crf_->make_tagger();
    for (auto& seq : sentences)
    {
        // generate CRF features
        seq_analyzer_.analyze(seq);

        // POS-tag sentence
        tagger.tag(seq);

        // create ngrams
        for (size_t i = n_value() - 1; i < seq.size(); ++i)
        {
            std::string combined = seq_analyzer_.tag(seq[i].label());
            for (size_t j = 1; j < n_value(); ++j)
            {
                std::string next = seq_analyzer_.tag(seq[i - j].label());
                combined = next + "_" + combined;
            }

            doc.increment(combined, 1);
        }
    }
}
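The innermost loop above stitches the current POS tag together with the n - 1 preceding tags, oldest first. A self-contained sketch of the same windowing over plain strings, assuming n >= 1:

#include <cstddef>
#include <string>
#include <vector>

// builds "t[i-n+1]_..._t[i]" for every length-n window, mirroring the
// combined-string construction in the tokenize() loop above
std::vector<std::string> ngram_features(const std::vector<std::string>& tags,
                                        std::size_t n)
{
    std::vector<std::string> feats;
    for (std::size_t i = n - 1; i < tags.size(); ++i)
    {
        std::string combined = tags[i];
        for (std::size_t j = 1; j < n; ++j)
            combined = tags[i - j] + "_" + combined;
        feats.push_back(std::move(combined));
    }
    return feats;
}

For instance, ngram_features({"DT", "JJ", "NN"}, 2) yields {"DT_JJ", "JJ_NN"}.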
Example #6
std::vector<std::pair<doc_id, double>>
ranker::score(inverted_index& idx, corpus::document& query,
              uint64_t num_results /* = 10 */,
              const std::function<bool(doc_id d_id)>& filter /* return true */)
{
    if (query.counts().empty())
        idx.tokenize(query);

    score_data sd{idx,            idx.avg_doc_length(),
                  idx.num_docs(), idx.total_corpus_terms(),
                  query};

    // resets every element to the lowest() sentinel and (if necessary) resizes
    // the vector; this avoids constructing a new vector for each query against
    // the same index
    results_.assign(sd.num_docs, std::numeric_limits<double>::lowest());

    for (auto& tpair : query.counts())
    {
        term_id t_id{idx.get_term_id(tpair.first)};
        auto pdata = idx.search_primary(t_id);
        sd.doc_count = pdata->counts().size();
        sd.t_id = t_id;
        sd.query_term_count = tpair.second;
        sd.corpus_term_count = idx.total_num_occurences(sd.t_id);
        for (auto& dpair : pdata->counts())
        {
            sd.d_id = dpair.first;
            sd.doc_term_count = dpair.second;
            sd.doc_size = idx.doc_size(dpair.first);
            sd.doc_unique_terms = idx.unique_terms(dpair.first);

            // if this is the first time we've seen this document, compute
            // its initial score
            if (results_[dpair.first] == std::numeric_limits<double>::lowest())
                results_[dpair.first] = initial_score(sd);

            results_[dpair.first] += score_one(sd);
        }
    }

    using doc_pair = std::pair<doc_id, double>;
    auto doc_pair_comp = [](const doc_pair& a, const doc_pair& b)
    { return a.second > b.second; };

    std::priority_queue<doc_pair,
                        std::vector<doc_pair>,
                        decltype(doc_pair_comp)> pq{doc_pair_comp};
    for (uint64_t id = 0; id < results_.size(); ++id)
    {
        if (!filter(doc_id{id}))
            continue;

        pq.emplace(doc_id{id}, results_[id]);
        if (pq.size() > num_results)
            pq.pop();
    }

    std::vector<doc_pair> sorted;
    while (!pq.empty())
    {
        sorted.emplace_back(pq.top());
        pq.pop();
    }
    std::reverse(sorted.begin(), sorted.end());

    return sorted;
}
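The comparator a.second > b.second makes the priority queue a min-heap on score, so popping whenever the size exceeds num_results always evicts the current worst candidate and retains the top scorers. A self-contained sketch of the same bounded top-k pattern:

#include <algorithm>
#include <cstdint>
#include <queue>
#include <utility>
#include <vector>

// returns the k highest-scoring (id, score) pairs, best first
std::vector<std::pair<uint64_t, double>>
top_k(const std::vector<double>& scores, uint64_t k)
{
    using pair_t = std::pair<uint64_t, double>;
    auto cmp = [](const pair_t& a, const pair_t& b)
    { return a.second > b.second; }; // min-heap: lowest score on top
    std::priority_queue<pair_t, std::vector<pair_t>, decltype(cmp)> pq{cmp};

    for (uint64_t id = 0; id < scores.size(); ++id)
    {
        pq.emplace(id, scores[id]);
        if (pq.size() > k)
            pq.pop(); // evict the current minimum
    }

    std::vector<pair_t> sorted;
    while (!pq.empty())
    {
        sorted.push_back(pq.top());
        pq.pop();
    }
    std::reverse(sorted.begin(), sorted.end()); // descending by score
    return sorted;
}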