void buildStats() { HGram::iterator iter; resultsData.tfMatrix = arma::zeros<arma::mat>(results.size(), _gramTable.size()); // Initialize the resultsData.dfVector.set_size(_gramTable.size()); resultsData.ctfVector.set_size(_gramTable.size()); int tmpTermID = -1; for( iter = _gramTable.begin(); iter != _gramTable.end(); iter++ ) { double gramCount = 0; ++tmpTermID; Gram* gram = *iter->first; GramCounts* gramCounts = *iter->second; gram->internal_termID = tmpTermID; terms.push_back(gram->term); if( resultsData.queryStems.find(gram->term) != resultsData.queryStems.end() ) resultsData.queryStemIndex[gram->term] = tmpTermID; resultsData.ctfVector(tmpTermID) = environment.stemCount(gram->term); resultsData.dfVector(tmpTermID) = environment.documentStemCount(gram->term); size_t c, r; for( r = 0, c = 0; r < results.size() && c < gramCounts->counts.size(); r++ ) { if( gramCounts->counts[c].first == r ) { resultsData.tfMatrix(r, tmpTermID) = gramCounts->counts[c].second; c++; } } } _gramTable.clear(); }
void matIR::QueryStats::init(const std::string& query, indri::api::QueryEnvironment& environment) { // Extract only the terms from the query and add to the vector indri::api::QueryParserWrapper *parser = indri::api::QueryParserFactory::get(query, "indri"); indri::lang::ScoredExtentNode* rootNode = parser->query(); indri::lang::RawScorerNodeExtractor extractor; rootNode->walk(extractor); std::vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes(); for (int i = 0; i < scorerNodes.size(); i++){ std::string qterm = environment.stemTerm(scorerNodes[i]->queryText()); queryString.push_back(qterm); if(environment.stemCount(qterm) == 0) continue; if( _queryTokens.find(qterm) == _queryTokens.end() ) _queryTokens.insert(make_pair( qterm, 1)); else _queryTokens[qterm] += 1; } // Initialize vectors _query_collectionFrequency.set_size(_queryTokens.size()); _query_documentFrequency.set_size(_queryTokens.size()); // Now obtain the statistics int i = 0; map<std::string, int>::const_iterator iter; for (iter=_queryTokens.begin(); iter != _queryTokens.end(); ++iter) { std::string stem = environment.stemTerm(iter->first); _query_collectionFrequency(i) = (double) environment.stemCount(stem); _query_documentFrequency(i) = (double) environment.documentStemCount(stem); ++i; } }