std::string get_content(const corpus::document& doc)
{
    if (!doc.contains_content())
        throw analyzer_exception{
            "document content was not populated for analysis"};

    return utf::to_utf8(doc.content(), doc.encoding());
}
void depth_featurizer::tree_tokenize(corpus::document& doc,
                                     const parser::parse_tree& tree) const
{
    height_visitor vtor;
    auto rep = "depth-" + std::to_string(tree.visit(vtor));
    doc.increment(rep, 1);
}
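// Illustration only: the quantity that the height visitor above (and
// parse_tree::height below) computes. This standalone sketch uses a
// hypothetical toy_node type, not the library's parse tree API.
#include <algorithm>
#include <cstddef>
#include <memory>
#include <vector>

struct toy_node
{
    std::vector<std::unique_ptr<toy_node>> children;
};

// a leaf has height 0; an internal node is one more than its tallest child
size_t toy_height(const toy_node& node)
{
    size_t max_child = 0;
    for (const auto& child : node.children)
        max_child = std::max(max_child, 1 + toy_height(*child));
    return max_child;
}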
void depth_analyzer::tree_tokenize(corpus::document& doc,
                                   const parse_tree& tree)
{
    size_t h = parse_tree::height(tree);
    std::string representation = std::to_string(h);
    doc.increment(representation, 1);
}
template <class Analyzer>
void check_analyzer_expected(Analyzer& ana, corpus::document doc,
                             uint64_t num_unique, uint64_t length)
{
    ana.tokenize(doc);
    ASSERT_EQUAL(doc.counts().size(), num_unique);
    ASSERT_EQUAL(doc.length(), length);
    ASSERT_EQUAL(doc.id(), 47ul);

    if (doc.contains_content())
    {
        ASSERT_EQUAL(doc.path(), "/home/person/filename.txt");
        ASSERT_EQUAL(doc.name(), "filename.txt");
    }
    else
    {
        ASSERT_EQUAL(doc.path(), "../data/sample-document.txt");
        ASSERT_EQUAL(doc.name(), "sample-document.txt");
    }
}
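// Hypothetical usage sketch: how a test case might drive the helper above.
// make_sample_doc() and the expected-count parameters are placeholders for
// whatever the real test fixture provides; they are not from the test suite.
template <class Analyzer>
void run_sample_checks(Analyzer& ana, uint64_t expected_unique,
                       uint64_t expected_length)
{
    // one document read from disk, one with its content pre-populated; both
    // should tokenize to the same counts
    check_analyzer_expected(ana, make_sample_doc(/*load_content=*/false),
                            expected_unique, expected_length);
    check_analyzer_expected(ana, make_sample_doc(/*load_content=*/true),
                            expected_unique, expected_length);
}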
void ngram_pos_analyzer::tokenize(corpus::document& doc)
{
    // first, get tokens
    stream_->set_content(get_content(doc));
    std::vector<sequence::sequence> sentences;
    sequence::sequence seq;

    // put tokens into sequences, excluding sentence markers
    while (*stream_)
    {
        auto next = stream_->next();
        if (next.empty() || next == " " || next == "<s>")
            continue;

        if (next == "</s>")
        {
            sentences.emplace_back(std::move(seq));
            // reset for the next sentence; a moved-from sequence is in an
            // unspecified state
            seq = sequence::sequence{};
        }
        else
        {
            seq.add_observation(
                {sequence::symbol_t{next}, sequence::tag_t{"[unknown]"}});
        }
    }

    auto tagger = crf_->make_tagger();
    for (auto& seq : sentences)
    {
        // generate CRF features
        seq_analyzer_.analyze(seq);

        // POS-tag sentence
        tagger.tag(seq);

        // create ngrams
        for (size_t i = n_value() - 1; i < seq.size(); ++i)
        {
            std::string combined = seq_analyzer_.tag(seq[i].label());
            for (size_t j = 1; j < n_value(); ++j)
            {
                std::string next = seq_analyzer_.tag(seq[i - j].label());
                combined = next + "_" + combined;
            }
            doc.increment(combined, 1);
        }
    }
}
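// Illustration only: the n-gram window logic from the loop above, applied to
// a plain vector of already-assigned POS tags (the real code pulls tags from
// the CRF-tagged sequence via seq_analyzer_.tag()).
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> tags = {"DT", "NN", "VB", "DT", "NN"};
    const size_t n = 3;

    // slide a window of size n over the tag sequence, joining with '_'
    for (size_t i = n - 1; i < tags.size(); ++i)
    {
        std::string combined = tags[i];
        for (size_t j = 1; j < n; ++j)
            combined = tags[i - j] + "_" + combined;
        std::cout << combined << "\n"; // DT_NN_VB, NN_VB_DT, VB_DT_NN
    }
    return 0;
}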
std::vector<std::pair<doc_id, double>>
ranker::score(inverted_index& idx, corpus::document& query,
              uint64_t num_results /* = 10 */,
              const std::function<bool(doc_id d_id)>& filter /* return true */)
{
    if (query.counts().empty())
        idx.tokenize(query);

    score_data sd{idx, idx.avg_doc_length(), idx.num_docs(),
                  idx.total_corpus_terms(), query};

    // resets every element to the lowest double and (if necessary) resizes
    // the vector; this eliminates constructing a new vector for each query
    // on the same index
    results_.assign(sd.num_docs, std::numeric_limits<double>::lowest());

    for (auto& tpair : query.counts())
    {
        term_id t_id{idx.get_term_id(tpair.first)};
        auto pdata = idx.search_primary(t_id);

        sd.doc_count = pdata->counts().size();
        sd.t_id = t_id;
        sd.query_term_count = tpair.second;
        sd.corpus_term_count = idx.total_num_occurences(sd.t_id);

        for (auto& dpair : pdata->counts())
        {
            sd.d_id = dpair.first;
            sd.doc_term_count = dpair.second;
            sd.doc_size = idx.doc_size(dpair.first);
            sd.doc_unique_terms = idx.unique_terms(dpair.first);

            // if this is the first time we've seen this document, compute
            // its initial score
            if (results_[dpair.first] == std::numeric_limits<double>::lowest())
                results_[dpair.first] = initial_score(sd);

            results_[dpair.first] += score_one(sd);
        }
    }

    using doc_pair = std::pair<doc_id, double>;
    auto doc_pair_comp = [](const doc_pair& a, const doc_pair& b)
    {
        return a.second > b.second;
    };

    // min-heap on score: keeps only the num_results best-scoring documents
    std::priority_queue<doc_pair, std::vector<doc_pair>,
                        decltype(doc_pair_comp)> pq{doc_pair_comp};
    for (uint64_t id = 0; id < results_.size(); ++id)
    {
        if (!filter(doc_id{id}))
            continue;

        pq.emplace(doc_id{id}, results_[id]);
        if (pq.size() > num_results)
            pq.pop();
    }

    // drain the heap (ascending by score) and reverse to get descending order
    std::vector<doc_pair> sorted;
    while (!pq.empty())
    {
        sorted.emplace_back(pq.top());
        pq.pop();
    }
    std::reverse(sorted.begin(), sorted.end());

    return sorted;
}
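// Hedged usage sketch: how a caller might rank documents with score() above.
// It assumes the surrounding library's headers are included and that
// corpus::document exposes a content() setter; treat the exact signatures as
// assumptions rather than verified API.
std::vector<std::pair<doc_id, double>>
run_query(inverted_index& idx, ranker& rnk, const std::string& text)
{
    corpus::document query;
    query.content(text); // score() tokenizes the query lazily if counts are empty

    // keep the 5 best-scoring documents; the filter here accepts every
    // candidate, mirroring the default behavior
    return rnk.score(idx, query, 5, [](doc_id) { return true; });
}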