static void printClarity( const std::string& query, indri::api::QueryEnvironment & env, const std::vector<indri::query::RelevanceModel::Gram*>& grams, int numTerms ) { int count = 0; double sum=0, ln_Pr=0; for( size_t j=0; j< numTerms && j < grams.size(); j++ ) { std::string t = grams[j]->terms[0]; count++; // query-clarity = SUM_w{P(w|Q)*log(P(w|Q)/P(w))} // P(w)=cf(w)/|C| // the relevance model uses stemmed terms, so use stemCount double pw = ((double)env.stemCount(t)/(double)env.termCount()); // P(w|Q) is a prob computed by any model, e.g. relevance models double pwq = grams[j]->weight; sum += pwq; ln_Pr += (pwq)*log(pwq/pw); } std::cout << "# query: " << query << " = " << count << " " << (ln_Pr/(sum ? sum : 1.0)/log(2.0)) << std::endl; for( size_t j=0; j< numTerms && j < grams.size(); j++ ) { std::string t = grams[j]->terms[0]; double pw = ((double)env.stemCount(t)/(double)env.termCount()); std::cout << t << " " << (grams[j]->weight*log(grams[j]->weight/ // the relevance model uses stemmed terms, so use stemCount ((double)env.stemCount(t)/ (double)env.termCount())))/log(2.0) << std::endl; } }
// Looks up `_term` via environment.termCount(_term) and hands the resulting
// count back to R through Rcpp::wrap.
SEXP collFreq(string _term)
{
    // Converted to long before wrapping, exactly as the value was stored before.
    const long frequency = static_cast<long>(environment.termCount(_term));
    return Rcpp::wrap(frequency);
}
// Returns the value of environment.termCount() (no-argument form) to R,
// converted to long and passed through Rcpp::wrap.
SEXP getTermCount()
{
    const long total = static_cast<long>(environment.termCount());
    return Rcpp::wrap(total);
}