/**
 * Returns the term-frequency matrix of the current results as an R numeric
 * matrix, optionally re-weighted.
 *
 * The returned matrix carries dimnames built from (extDocIDs, terms).
 *
 * @param termWeighting  one of:
 *   - "tf"            : resultsData.tfMatrix as stored;
 *   - "tf_normalized" : every row divided element-wise by the column sums
 *                       of the matrix;
 *   - "tfidf"         : every row multiplied element-wise by
 *                       log((N + 1) / (df + 0.5)), where N is
 *                       environment.documentCount() and df is
 *                       resultsData.dfVector.
 * @return the weighted matrix as a SEXP.
 * @throws an R error (via Rcpp::stop) for "idf", which has no implementation,
 *         and for any other unrecognised weighting. Previously these cases ran
 *         off the end of the function without returning a value, which is
 *         undefined behaviour.
 */
SEXP getDocTermMatrix(string termWeighting){
  Rcpp::List dimnms = Rcpp::List::create(extDocIDs, terms);

  if(termWeighting == "tf"){
    NumericMatrix d = Rcpp::wrap(resultsData.tfMatrix);
    d.attr("dimnames") = dimnms;
    return d;
  }

  if(termWeighting == "tf_normalized"){
    arma::mat tfnorm = resultsData.tfMatrix;
    // NOTE(review): arma::sum(X, 0) yields one total per COLUMN. The original
    // code named this vector "docLen"; if rows are documents (as the dimnames
    // order suggests), a per-document length would be arma::sum(X, 1) applied
    // with each_col(). Behaviour is kept as-is here -- confirm the intent.
    arma::rowvec colTotals = arma::sum(tfnorm, 0);
    tfnorm.each_row() /= colTotals;
    NumericMatrix d = Rcpp::wrap(tfnorm);
    d.attr("dimnames") = dimnms;
    return d;
  }

  if(termWeighting == "tfidf"){
    arma::mat tfidfMat = resultsData.tfMatrix;
    // The +1 / +0.5 terms keep the ratio finite when df is zero.
    arma::vec idf = arma::log((environment.documentCount() + 1) / (resultsData.dfVector + 0.5));
    tfidfMat.each_row() %= idf.t();
    NumericMatrix d = Rcpp::wrap(tfidfMat);
    d.attr("dimnames") = dimnms;
    return d;
  }

  if(termWeighting == "idf"){
    Rcpp::stop("getDocTermMatrix: termWeighting 'idf' is not implemented");
  }

  Rcpp::stop("getDocTermMatrix: unsupported termWeighting '" + termWeighting +
             "' (expected 'tf', 'tf_normalized' or 'tfidf')");
  return R_NilValue; // not reached: Rcpp::stop throws
}
/**
 * Builds a data frame of per-term statistics for the current results.
 *
 * Columns:
 *   - DocFreq : resultsData.dfVector
 *   - IDF     : log((N + 1) / (DocFreq + 0.5)), with N taken from
 *               environment.documentCount()
 *   - cTF     : resultsData.ctfVector
 *
 * The "row.names" attribute is set to `terms`.
 *
 * @return the data frame as a SEXP.
 */
SEXP getTermStats(){
  // The +1 / +0.5 terms keep the ratio finite when a document frequency is zero.
  arma::vec idf = arma::log((environment.documentCount() + 1) / (resultsData.dfVector + 0.5));

  DataFrame d = DataFrame::create(Named("DocFreq") = resultsData.dfVector,
                                  Named("IDF")     = idf,
                                  Named("cTF")     = resultsData.ctfVector);
  d.attr("row.names") = terms;
  return d;
}
/**
 * Returns the value of environment.documentCount() wrapped as an R object.
 */
SEXP getDocCount() {
  const long totalDocs = environment.documentCount();
  return Rcpp::wrap(totalDocs);
}
/**
 * Returns the value of environment.documentCount(_term) wrapped as an R
 * object -- presumably the number of documents containing the term.
 *
 * @param _term  the term passed through to environment.documentCount().
 */
SEXP docFreq(string _term) {
  const long termDocs = environment.documentCount(_term);
  return Rcpp::wrap(termDocs);
}
/**
 * Min-max normalises the relative frequency of every concept that appears in
 * scoredConcepts_norm.
 *
 * For each entry of scoredConcepts_norm the concept name is the second string
 * of the mapped pair. Its relative frequency is
 * conceptsFrq[name] / env.documentCount(); those values are then rescaled to
 * (value - min) / (max - min).
 *
 * @param env                  query environment providing documentCount().
 * @param conceptsFrq          concept name -> raw frequency. Taken by value, so
 *                             the default 0 that operator[] inserts for an
 *                             unknown concept only touches this local copy.
 * @param scoredConcepts_norm  score -> (string, concept name) entries selecting
 *                             which concepts to process.
 * @return concept name -> normalised frequency. Empty when scoredConcepts_norm
 *         is empty. When the frequencies have no positive spread (a single
 *         concept, or all values equal) every concept maps to 0.0 instead of
 *         the NaN that dividing by a zero range used to produce.
 */
map<string, double> indri::query::ConceptSelectorFuns::normConceptScoreFreq(indri::api::QueryEnvironment & env,
                                                                            map<string, int> conceptsFrq,
                                                                            multimap<double, pair<string, string> > scoredConcepts_norm )
{
    // Nothing selected: return early without querying the environment.
    if (scoredConcepts_norm.empty())
    {
        return map<string, double>();
    }

    // Loop-invariant, so ask the environment only once.
    const double docCount = double(env.documentCount());

    // extract concept freq for the concepts exist in scoredConcepts_norm and store it in a container
    map<string, double> conceptsFrqExtr_norm;
    double max_cf = 0;
    double min_cf = std::numeric_limits<double>::infinity();
    for (auto itSc = scoredConcepts_norm.begin(); itSc != scoredConcepts_norm.end(); ++itSc)
    {
        const string & conceptName = (itSc->second).second;
        const int rawFrq = conceptsFrq[conceptName];
        const double relFrq = rawFrq / docCount;

        conceptsFrqExtr_norm[conceptName] = relFrq;
        max_cf = max(max_cf, relFrq);
        min_cf = min(min_cf, relFrq);

        cout << "indri::query::ConceptSelectorFuns::normConceptScoreFreq: conceptsFrqExtr_norm: min_cf, max_cf: " << min_cf << ", " << max_cf << endl;
        cout << "indri::query::ConceptSelectorFuns::normConceptScoreFreq: scoredConcepts_norm: " << conceptName << " " << itSc->first << endl;
        cout << "indri::query::ConceptSelectorFuns::normConceptScoreFreq: conceptsFrq: " << conceptName << " " << rawFrq << endl;
    }

    // normalize the extracted concepts' scores
    const double range = max_cf - min_cf;
    map<string, double> conceptsFrqExtr_norm1;
    for (auto itCf = conceptsFrqExtr_norm.begin(); itCf != conceptsFrqExtr_norm.end(); ++itCf)
    {
        // A zero (or non-finite) range would make the division yield NaN;
        // fall back to 0.0 in that case.
        const double normalized = (range > 0) ? (itCf->second - min_cf) / range : 0.0;
        conceptsFrqExtr_norm1[itCf->first] = normalized;
        cout << "indri::query::ConceptSelectorFuns::normConceptScoreFreq: conceptsFrqExtr_norm1: " << itCf->first << " " << normalized << endl;
    }

    return conceptsFrqExtr_norm1;
}