Beispiel #1
0
// Prints the clarity score of a query followed by the per-term contributions.
//
//   query-clarity = SUM_w{ P(w|Q) * log2( P(w|Q) / P(w) ) }
//
// where P(w|Q) is the weight carried by each gram (a probability computed by
// any model, e.g. relevance models) and P(w) = cf(w) / |C| is estimated from
// the collection via env.stemCount() / env.termCount().
//
// query    - query text, echoed in the header line.
// env      - query environment used for the collection statistics.
// grams    - weighted grams; only the first term of each gram is used.
// numTerms - maximum number of grams to consider.
//
// Output (std::cout):
//   "# query: <query> = <terms considered> <clarity normalised by weight sum>"
//   then one "<term> <contribution in bits>" line per considered gram.
//
// A gram whose weight or collection probability is not positive contributes 0
// (the usual 0*log(0) = 0 convention) instead of turning the total into
// NaN/inf.
static void printClarity( const std::string& query, 
                          indri::api::QueryEnvironment & env, 
                          const std::vector<indri::query::RelevanceModel::Gram*>& grams, int numTerms ) {

  // Same bound as comparing a size_t index against numTerms directly: the int
  // is converted to size_t, so a negative numTerms selects every gram.
  const size_t requested = static_cast<size_t>( numTerms );
  const size_t limit = requested < grams.size() ? requested : grams.size();

  // |C| does not change between terms, so ask for it once (and not at all
  // when there is nothing to score).
  const double collectionSize = limit ? static_cast<double>( env.termCount() ) : 0.0;
  const double ln2 = log(2.0);

  // Natural-log contribution of every considered term, kept so that the
  // per-term listing below does not have to query the index a second time.
  std::vector<double> contributions;
  contributions.reserve( limit );

  double sum = 0, ln_Pr = 0;
  for( size_t j = 0; j < limit; j++ ) {
    const std::string& t = grams[j]->terms[0];
    // the relevance model uses stemmed terms, so use stemCount
    const double pw = static_cast<double>( env.stemCount(t) ) / collectionSize;
    const double pwq = grams[j]->weight;
    const double contribution = ( pwq > 0 && pw > 0 ) ? pwq * log( pwq / pw ) : 0.0;
    sum += pwq;
    ln_Pr += contribution;
    contributions.push_back( contribution );
  }

  // Normalise by the total weight (unless it is zero) and convert to bits.
  std::cout << "# query: " << query <<  " = " << limit << " " 
            << (ln_Pr/(sum ? sum : 1.0)/ln2) << std::endl;

  for( size_t j = 0; j < limit; j++ ) {
    std::cout << grams[j]->terms[0] << " " << contributions[j]/ln2 << '\n';
  }
  std::cout.flush();
}
Beispiel #2
0
    void buildStats() {
        HGram::iterator iter;
        resultsData.tfMatrix = arma::zeros<arma::mat>(results.size(),
                                                      _gramTable.size());
        // Initialize the
        resultsData.dfVector.set_size(_gramTable.size());
        resultsData.ctfVector.set_size(_gramTable.size());

        int tmpTermID = -1;
        for( iter = _gramTable.begin(); iter != _gramTable.end(); iter++ ) {
            double gramCount = 0;
            ++tmpTermID;
            Gram* gram = *iter->first;
            GramCounts* gramCounts = *iter->second;
            gram->internal_termID = tmpTermID;
            terms.push_back(gram->term);
             if( resultsData.queryStems.find(gram->term) != resultsData.queryStems.end() )
                resultsData.queryStemIndex[gram->term] = tmpTermID;

            resultsData.ctfVector(tmpTermID) = environment.stemCount(gram->term);
            resultsData.dfVector(tmpTermID) =  environment.documentStemCount(gram->term);
            size_t c, r;
            for( r = 0, c = 0; r < results.size() && c < gramCounts->counts.size(); r++ ) {
                if( gramCounts->counts[c].first == r ) {
                    resultsData.tfMatrix(r, tmpTermID) = gramCounts->counts[c].second;
                    c++;
                }
            }
        }
        _gramTable.clear();
    }
Beispiel #3
0
void matIR::QueryStats::init(const std::string& query, indri::api::QueryEnvironment& environment)
{

    // Extract only the terms from the query and add to the vector
    indri::api::QueryParserWrapper *parser = indri::api::QueryParserFactory::get(query, "indri");
    indri::lang::ScoredExtentNode* rootNode = parser->query();
    indri::lang::RawScorerNodeExtractor extractor;
    rootNode->walk(extractor);
    std::vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes();

    for (int i = 0; i < scorerNodes.size(); i++){
        std::string qterm = environment.stemTerm(scorerNodes[i]->queryText());
        queryString.push_back(qterm);
        if(environment.stemCount(qterm) == 0)
            continue;
        if( _queryTokens.find(qterm) == _queryTokens.end() )
            _queryTokens.insert(make_pair( qterm, 1));
        else
            _queryTokens[qterm] += 1;
    }

    // Initialize vectors


    _query_collectionFrequency.set_size(_queryTokens.size());
    _query_documentFrequency.set_size(_queryTokens.size());



    // Now obtain the statistics
    int i = 0;
    map<std::string, int>::const_iterator iter;
    for (iter=_queryTokens.begin(); iter != _queryTokens.end(); ++iter) {
        std::string stem = environment.stemTerm(iter->first);
        _query_collectionFrequency(i) = (double) environment.stemCount(stem);
        _query_documentFrequency(i) = (double) environment.documentStemCount(stem);
        ++i;

    }
}
Beispiel #4
0
    // Records the stemmed terms of `query` in `resultData`.
    //
    // environment - query environment used for stemming and collection counts.
    // resultData  - updated in place: queryStems maps each stem to the number
    //               of times it occurs in the query, and queryStemOrder lists
    //               the distinct stems in order of first appearance.
    // query       - query text, parsed with the "indri" query parser.
    //
    // Terms whose stem does not occur in the collection (stemCount == 0) are
    // ignored.
    void updateQueryDetails(indri::api::QueryEnvironment& environment,
                            Results& resultData,
                            string query){

        // The factory hands back a heap-allocated wrapper that the caller must
        // delete (see the Indri QueryParserFactory reference). This local
        // owner releases it on every exit path, including when parsing throws.
        struct ParserOwner {
            indri::api::QueryParserWrapper* parser;
            explicit ParserOwner(indri::api::QueryParserWrapper* p) : parser(p) {}
            ~ParserOwner() { delete parser; }
        };
        ParserOwner owner(indri::api::QueryParserFactory::get(query, "indri"));

        // NOTE(review): the nodes reached through rootNode are presumably
        // owned by the parser, so they are only used while `owner` is alive
        // - confirm.
        indri::lang::ScoredExtentNode* rootNode = owner.parser->query();
        indri::lang::RawScorerNodeExtractor extractor;
        rootNode->walk(extractor);
        vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes();

        for (size_t i = 0; i < scorerNodes.size(); ++i){
            const string qterm = environment.stemTerm(scorerNodes[i]->queryText());
            if(environment.stemCount(qterm) == 0)
                continue;
            if( resultData.queryStems.find(qterm) == resultData.queryStems.end() ){
                // First occurrence: start the count and remember the order.
                resultData.queryStems.insert(make_pair( qterm, 1));
                resultData.queryStemOrder.push_back(qterm);
            }
            else
                resultData.queryStems[qterm] += 1;
        }
    }