Пример #1
0
    // Builds the document-term matrix of the current result set as an R
    // numeric matrix whose dimnames are (extDocIDs, terms), i.e. one row per
    // document and one column per term.
    //
    // termWeighting selects how resultsData.tfMatrix is weighted:
    //   "tf"            - the matrix is returned as stored;
    //   "tf_normalized" - every row is divided by its own row sum;
    //   "tfidf"         - every column is scaled by
    //                     log((documentCount + 1) / (df + 0.5)).
    // Any other value (including "idf", which was only ever an empty stub)
    // raises an R error via Rcpp::stop. Previously those cases ran off the
    // end of this non-void function, which is undefined behaviour.
    SEXP getDocTermMatrix(string termWeighting){
        arma::mat weighted = resultsData.tfMatrix;

        if(termWeighting == "tf"){
            // Raw counts: nothing to rescale.
        }else if(termWeighting == "tf_normalized"){
            // Rows are documents (see dimnames below), so a document's length
            // is the sum across its row (dim = 1), not a column sum.
            // NOTE(review): a document whose row sums to zero yields NaN here.
            arma::vec docLen = arma::sum(weighted, 1);
            weighted.each_col() /= docLen;
        }else if(termWeighting == "tfidf"){
            arma::vec idf = arma::log((environment.documentCount() + 1) /
                    (resultsData.dfVector + 0.5));
            // idf has one entry per term; transpose it so it lines up with a row.
            weighted.each_row() %= idf.t();
        }else{
            Rcpp::stop("getDocTermMatrix: unsupported termWeighting '" +
                       termWeighting + "'");
        }

        NumericMatrix d = Rcpp::wrap(weighted);
        d.attr("dimnames") = Rcpp::List::create(extDocIDs, terms);
        return d;
    }
Пример #2
0
    // Returns an R data frame of per-term statistics with three columns:
    //   "DocFreq" - resultsData.dfVector,
    //   "IDF"     - element-wise log((documentCount + 1) / (DocFreq + 0.5)),
    //   "cTF"     - resultsData.ctfVector.
    // The data frame's row.names attribute is set to `terms`.
    SEXP getTermStats(){
        arma::vec idf = arma::log((environment.documentCount() + 1) /
                (resultsData.dfVector + 0.5));

        DataFrame d = DataFrame::create(Named("DocFreq")=resultsData.dfVector,
                                        Named("IDF")=idf,
                                        Named("cTF")=resultsData.ctfVector);
        d.attr("row.names") = terms;
        return d;
    }
Пример #3
0
 // Returns environment.documentCount(), narrowed to long, as an R value.
 SEXP getDocCount() {
     return Rcpp::wrap(static_cast<long>(environment.documentCount()));
 }
Пример #4
0
 // Returns environment.documentCount(_term), narrowed to long, as an R value.
 SEXP docFreq(string _term) {
     return Rcpp::wrap(static_cast<long>(environment.documentCount(_term)));
 }
Пример #5
0
// Computes a min-max normalised relative frequency for every concept that
// appears in scoredConcepts_norm.
//
// For each entry of scoredConcepts_norm the concept name is the second
// element of the mapped pair. Its relative frequency is
// conceptsFrq[name] / env.documentCount(); a concept missing from conceptsFrq
// counts as 0. The relative frequencies are then rescaled to
// (cf - min_cf) / (max_cf - min_cf), where max_cf starts from 0 and min_cf
// from +infinity.
//
// Edge cases:
//   - empty scoredConcepts_norm -> empty map, the environment is not queried;
//   - documentCount() == 0      -> every relative frequency is taken as 0;
//   - max_cf == min_cf          -> every normalised value is 0 instead of the
//                                  NaN that 0/0 would produce.
//
// Returns a map from concept name to normalised frequency. Progress is
// logged to cout for every concept.
map<string, double> indri::query::ConceptSelectorFuns::normConceptScoreFreq(indri::api::QueryEnvironment & env,
                                                            map<string, int> conceptsFrq,
                                                            multimap<double, pair<string, string> > scoredConcepts_norm )
{
    map<string, double> normalized;
    if (scoredConcepts_norm.empty())
        return normalized;

    // The collection size does not change while we iterate, so query it once.
    const double docCount = double(env.documentCount());

    // extract concept freq for the concepts exist in scoredConcepts_norm and store it in a container
    map<string, double> relativeFrq;
    double max_cf = 0;
    double min_cf = std::numeric_limits<double>::infinity();
    for (const auto & scored : scoredConcepts_norm)
    {
        const string & conceptName = scored.second.second;

        // find() rather than operator[] so unknown concepts are not inserted.
        const auto itFrq = conceptsFrq.find(conceptName);
        const int rawFrq = (itFrq != conceptsFrq.end()) ? itFrq->second : 0;

        // Guard against an empty index, which would otherwise divide by zero.
        const double cf = (docCount > 0) ? rawFrq / docCount : 0.0;
        relativeFrq[conceptName] = cf;
        max_cf = max(max_cf, cf);
        min_cf = min(min_cf, cf);

        cout << "indri::query::ConceptSelectorFuns::normConceptScoreFreq: conceptsFrqExtr_norm: min_cf, max_cf: " << min_cf << ", " << max_cf << endl;
        cout << "indri::query::ConceptSelectorFuns::normConceptScoreFreq: scoredConcepts_norm: " << conceptName << " " << scored.first << endl;
        cout << "indri::query::ConceptSelectorFuns::normConceptScoreFreq: conceptsFrq: " << conceptName << " " << rawFrq << endl;
    }

    // normalize the extracted concepts' scores
    const double range = max_cf - min_cf;
    for (const auto & entry : relativeFrq)
    {
        // A zero range means all frequencies are equal; map them to 0.
        const double scaled = (range > 0) ? (entry.second - min_cf) / range : 0.0;
        normalized[entry.first] = scaled;
        cout << "indri::query::ConceptSelectorFuns::normConceptScoreFreq: conceptsFrqExtr_norm1: " << entry.first << " " << scaled << endl;
    }
    return normalized;
}