bool SimpleIndex::addDoc(const string &doc_name, const ValueMap &doc_term_counts){ if (doc_dict.getId(doc_name) != -1){ return false; } int doc_id = doc_dict.getId(doc_name, true); doc_info_list.push_back(DocInfo()); DocInfo &doc_info = doc_info_list.back(); for (shared_ptr<ConstValueIterator> value_itr = doc_term_counts.const_iterator(); value_itr->ok(); value_itr->next()){ const int term_id = value_itr->id(); const double term_count = value_itr->get(); doc_info.doc_length += term_count; doc_info.term_list.push_back(make_pair(term_id, term_count)); map<int, TermInfo>::iterator itr; tie(itr, tuples::ignore) = term_info_map.insert(make_pair(term_id, TermInfo())); itr->second.doc_list.push_back(make_pair(doc_id, term_count)); } return true; }
void SimpleKLRetriever::retrieve(const SimpleIndex &index, const ValueMap &query_term_counts, std::vector<std::pair<int, double> > &ranking) const { ranking.clear(); vector<double> doc_scores(index.getDocCount() + 1); fill(doc_scores.begin(), doc_scores.end(), 0.0); double query_length = 0.0; double col_likelihood = 0.0; for (shared_ptr<ConstValueIterator> value_itr = query_term_counts.const_iterator(); value_itr->ok(); value_itr->next()){ const int term_id = value_itr->id(); const double query_term_count = value_itr->get(); query_length += query_term_count; const double col_prob = getColProb(term_id); col_likelihood += query_term_count * log(col_prob); const vector<pair<int, float> > *doc_list = index.getDocList(term_id); if (doc_list) { typedef pair<int, float> P; BOOST_FOREACH(const P &p, *doc_list) { const int doc_id = p.first; const float doc_term_count = p.second; doc_scores[doc_id] += query_term_count * log(1.0 + doc_term_count / dir_prior / col_prob); } } } for (int doc_id = 1; doc_id <= index.getDocCount(); ++ doc_id){ if (doc_scores[doc_id] > 0.0){ double score = doc_scores[doc_id] + col_likelihood; score /= query_length; score += log(dir_prior / (index.getDocLength(doc_id) + dir_prior)); ranking.push_back(make_pair(doc_id, score)); } } sort(ranking.begin(), ranking.end(), util::cmp2ndReverse<int, double>); }