예제 #1
0
    void buildStats() {
        HGram::iterator iter;
        resultsData.tfMatrix = arma::zeros<arma::mat>(results.size(),
                                                      _gramTable.size());
        // Initialize the
        resultsData.dfVector.set_size(_gramTable.size());
        resultsData.ctfVector.set_size(_gramTable.size());

        int tmpTermID = -1;
        for( iter = _gramTable.begin(); iter != _gramTable.end(); iter++ ) {
            double gramCount = 0;
            ++tmpTermID;
            Gram* gram = *iter->first;
            GramCounts* gramCounts = *iter->second;
            gram->internal_termID = tmpTermID;
            terms.push_back(gram->term);
             if( resultsData.queryStems.find(gram->term) != resultsData.queryStems.end() )
                resultsData.queryStemIndex[gram->term] = tmpTermID;

            resultsData.ctfVector(tmpTermID) = environment.stemCount(gram->term);
            resultsData.dfVector(tmpTermID) =  environment.documentStemCount(gram->term);
            size_t c, r;
            for( r = 0, c = 0; r < results.size() && c < gramCounts->counts.size(); r++ ) {
                if( gramCounts->counts[c].first == r ) {
                    resultsData.tfMatrix(r, tmpTermID) = gramCounts->counts[c].second;
                    c++;
                }
            }
        }
        _gramTable.clear();
    }
예제 #2
0
void matIR::QueryStats::init(const std::string& query, indri::api::QueryEnvironment& environment)
{

    // Extract only the terms from the query and add to the vector
    indri::api::QueryParserWrapper *parser = indri::api::QueryParserFactory::get(query, "indri");
    indri::lang::ScoredExtentNode* rootNode = parser->query();
    indri::lang::RawScorerNodeExtractor extractor;
    rootNode->walk(extractor);
    std::vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes();

    for (int i = 0; i < scorerNodes.size(); i++){
        std::string qterm = environment.stemTerm(scorerNodes[i]->queryText());
        queryString.push_back(qterm);
        if(environment.stemCount(qterm) == 0)
            continue;
        if( _queryTokens.find(qterm) == _queryTokens.end() )
            _queryTokens.insert(make_pair( qterm, 1));
        else
            _queryTokens[qterm] += 1;
    }

    // Initialize vectors


    _query_collectionFrequency.set_size(_queryTokens.size());
    _query_documentFrequency.set_size(_queryTokens.size());



    // Now obtain the statistics
    int i = 0;
    map<std::string, int>::const_iterator iter;
    for (iter=_queryTokens.begin(); iter != _queryTokens.end(); ++iter) {
        std::string stem = environment.stemTerm(iter->first);
        _query_collectionFrequency(i) = (double) environment.stemCount(stem);
        _query_documentFrequency(i) = (double) environment.documentStemCount(stem);
        ++i;

    }
}