static void printClarity( const std::string& query, indri::api::QueryEnvironment & env, const std::vector<indri::query::RelevanceModel::Gram*>& grams, int numTerms ) { int count = 0; double sum=0, ln_Pr=0; for( size_t j=0; j< numTerms && j < grams.size(); j++ ) { std::string t = grams[j]->terms[0]; count++; // query-clarity = SUM_w{P(w|Q)*log(P(w|Q)/P(w))} // P(w)=cf(w)/|C| // the relevance model uses stemmed terms, so use stemCount double pw = ((double)env.stemCount(t)/(double)env.termCount()); // P(w|Q) is a prob computed by any model, e.g. relevance models double pwq = grams[j]->weight; sum += pwq; ln_Pr += (pwq)*log(pwq/pw); } std::cout << "# query: " << query << " = " << count << " " << (ln_Pr/(sum ? sum : 1.0)/log(2.0)) << std::endl; for( size_t j=0; j< numTerms && j < grams.size(); j++ ) { std::string t = grams[j]->terms[0]; double pw = ((double)env.stemCount(t)/(double)env.termCount()); std::cout << t << " " << (grams[j]->weight*log(grams[j]->weight/ // the relevance model uses stemmed terms, so use stemCount ((double)env.stemCount(t)/ (double)env.termCount())))/log(2.0) << std::endl; } }
void buildStats() { HGram::iterator iter; resultsData.tfMatrix = arma::zeros<arma::mat>(results.size(), _gramTable.size()); // Initialize the resultsData.dfVector.set_size(_gramTable.size()); resultsData.ctfVector.set_size(_gramTable.size()); int tmpTermID = -1; for( iter = _gramTable.begin(); iter != _gramTable.end(); iter++ ) { double gramCount = 0; ++tmpTermID; Gram* gram = *iter->first; GramCounts* gramCounts = *iter->second; gram->internal_termID = tmpTermID; terms.push_back(gram->term); if( resultsData.queryStems.find(gram->term) != resultsData.queryStems.end() ) resultsData.queryStemIndex[gram->term] = tmpTermID; resultsData.ctfVector(tmpTermID) = environment.stemCount(gram->term); resultsData.dfVector(tmpTermID) = environment.documentStemCount(gram->term); size_t c, r; for( r = 0, c = 0; r < results.size() && c < gramCounts->counts.size(); r++ ) { if( gramCounts->counts[c].first == r ) { resultsData.tfMatrix(r, tmpTermID) = gramCounts->counts[c].second; c++; } } } _gramTable.clear(); }
void matIR::QueryStats::init(const std::string& query, indri::api::QueryEnvironment& environment) { // Extract only the terms from the query and add to the vector indri::api::QueryParserWrapper *parser = indri::api::QueryParserFactory::get(query, "indri"); indri::lang::ScoredExtentNode* rootNode = parser->query(); indri::lang::RawScorerNodeExtractor extractor; rootNode->walk(extractor); std::vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes(); for (int i = 0; i < scorerNodes.size(); i++){ std::string qterm = environment.stemTerm(scorerNodes[i]->queryText()); queryString.push_back(qterm); if(environment.stemCount(qterm) == 0) continue; if( _queryTokens.find(qterm) == _queryTokens.end() ) _queryTokens.insert(make_pair( qterm, 1)); else _queryTokens[qterm] += 1; } // Initialize vectors _query_collectionFrequency.set_size(_queryTokens.size()); _query_documentFrequency.set_size(_queryTokens.size()); // Now obtain the statistics int i = 0; map<std::string, int>::const_iterator iter; for (iter=_queryTokens.begin(); iter != _queryTokens.end(); ++iter) { std::string stem = environment.stemTerm(iter->first); _query_collectionFrequency(i) = (double) environment.stemCount(stem); _query_documentFrequency(i) = (double) environment.documentStemCount(stem); ++i; } }
/**
 * Record the stemmed terms of an Indri query in resultData.
 *
 * The query is parsed with the "indri" parser and each raw scorer node is
 * stemmed through the environment. Stems with a zero collection count are
 * skipped. The first time a stem is seen it is inserted into
 * resultData.queryStems with a count of 1 and appended to
 * resultData.queryStemOrder; on later occurrences only its count is increased.
 *
 * @param environment  supplies stemming and collection counts
 * @param resultData   receives queryStems / queryStemOrder updates
 * @param query        query text to parse
 */
void updateQueryDetails(indri::api::QueryEnvironment& environment, Results& resultData, string query){
  indri::api::QueryParserWrapper *parser = indri::api::QueryParserFactory::get(query, "indri");
  indri::lang::ScoredExtentNode* rootNode = parser->query();

  indri::lang::RawScorerNodeExtractor extractor;
  rootNode->walk(extractor);
  vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes();

  for (size_t n = 0; n < scorerNodes.size(); ++n) {
    string stem = environment.stemTerm(scorerNodes[n]->queryText());

    // Terms that never occur in the collection are ignored.
    if (environment.stemCount(stem) == 0)
      continue;

    // Repeated stem: bump its count and move on.
    if (resultData.queryStems.find(stem) != resultData.queryStems.end()) {
      resultData.queryStems[stem] += 1;
      continue;
    }

    // First occurrence: register it and remember the order of appearance.
    resultData.queryStems.insert(make_pair(stem, 1));
    resultData.queryStemOrder.push_back(stem);
  }
}