Example #1
0
bool SimpleIndex::addDoc(const string &doc_name, const ValueMap &doc_term_counts){
	if (doc_dict.getId(doc_name) != -1){
		return false;
	}
	int doc_id = doc_dict.getId(doc_name, true);

	doc_info_list.push_back(DocInfo());
	DocInfo &doc_info = doc_info_list.back();

	for (shared_ptr<ConstValueIterator> value_itr = doc_term_counts.const_iterator(); value_itr->ok(); value_itr->next()){
		const int term_id = value_itr->id();
		const double term_count = value_itr->get();
		doc_info.doc_length += term_count;
		doc_info.term_list.push_back(make_pair(term_id, term_count));

		map<int, TermInfo>::iterator itr;
		tie(itr, tuples::ignore) = term_info_map.insert(make_pair(term_id, TermInfo()));
		itr->second.doc_list.push_back(make_pair(doc_id, term_count));
	}

	return true;
}
Example #2
0
void SimpleKLRetriever::retrieve(const SimpleIndex &index, const ValueMap &query_term_counts, std::vector<std::pair<int, double> > &ranking) const {
	ranking.clear();

	vector<double> doc_scores(index.getDocCount() + 1);
	fill(doc_scores.begin(), doc_scores.end(), 0.0);

	double query_length = 0.0;
	double col_likelihood = 0.0;
	for (shared_ptr<ConstValueIterator> value_itr = query_term_counts.const_iterator(); value_itr->ok(); value_itr->next()){
		const int term_id = value_itr->id();
		const double query_term_count = value_itr->get();

		query_length += query_term_count;
		const double col_prob = getColProb(term_id);
		col_likelihood += query_term_count * log(col_prob);

		const vector<pair<int, float> > *doc_list = index.getDocList(term_id);
		if (doc_list) {
			typedef pair<int, float> P;
			BOOST_FOREACH(const P &p, *doc_list) {
				const int doc_id = p.first;
				const float doc_term_count = p.second;
				doc_scores[doc_id] += query_term_count * log(1.0 + doc_term_count / dir_prior / col_prob);
			}
		}
	}

	for (int doc_id = 1; doc_id <= index.getDocCount(); ++ doc_id){
		if (doc_scores[doc_id] > 0.0){
			double score = doc_scores[doc_id] + col_likelihood;
			score /= query_length;
			score += log(dir_prior / (index.getDocLength(doc_id) + dir_prior));
			ranking.push_back(make_pair(doc_id, score));
		}
	}
	sort(ranking.begin(), ranking.end(), util::cmp2ndReverse<int, double>);
}