// Runs an annotated query against the index and caches the ranked results in
// the object's members (documentIDs, scores, extDocIDs, qa, results). When
// `stats` is true it also builds the per-term statistics consumed later by
// getDocTermMatrix()/getTermStats(). Returns R TRUE on completion.
// NOTE(review): the previous `qa` annotation object is overwritten here
// without being deleted — confirm its cleanup happens elsewhere.
SEXP generateResults(string _qno, string _query, int _documentLimit, bool stats) {
  // Reset all per-query state so repeated calls do not accumulate data.
  resultsData = resultsData_nullCopy;
  documentIDs.clear();
  scores.clear();
  extDocIDs.clear();
  terms.clear();
  _gramTable.clear();
  results.clear();
  qno = _qno;
  query = _query;
  documentLimit = _documentLimit;
  // Annotated query so snippet generation can reuse the match data in `qa`.
  qa = environment.runAnnotatedQuery(query, _documentLimit);
  results = qa->getResults();
  _logtoposterior(results);
  // Extract Documents
  for (size_t i = 0; i < results.size(); i++){
    documentIDs.push_back(results[i].document);
    scores.push_back(results[i].score);
  }
  // Resolve external document identifiers ("docno" metadata) for the hits.
  extDocIDs = environment.documentMetadata(documentIDs, "docno");
  if(stats){
    updateQueryDetails(environment, resultsData, query);
    countGrams();
    buildStats();
  }
  return Rcpp::wrap(true);
}
// Converts the raw per-term counts accumulated by countGrams() into the
// statistics used by getDocTermMatrix()/getTermStats(): a dense
// (results x terms) term-frequency matrix plus per-term collection-frequency
// (ctf) and document-frequency (df) vectors. Consumes and clears _gramTable.
void buildStats() {
  HGram::iterator iter;
  // Initialize the matrix/vectors to the final term-count dimensions.
  resultsData.tfMatrix = arma::zeros<arma::mat>(results.size(), _gramTable.size());
  resultsData.dfVector.set_size(_gramTable.size());
  resultsData.ctfVector.set_size(_gramTable.size());
  int tmpTermID = -1;
  for( iter = _gramTable.begin(); iter != _gramTable.end(); iter++ ) {
    double gramCount = 0; // NOTE(review): never used — candidate for removal.
    ++tmpTermID; // column index assigned to this term in tfMatrix
    // The hash-table iterator yields pointers to key and value.
    Gram* gram = *iter->first;
    GramCounts* gramCounts = *iter->second;
    gram->internal_termID = tmpTermID;
    terms.push_back(gram->term);
    // Remember which column each query stem landed in for later lookups.
    if( resultsData.queryStems.find(gram->term) != resultsData.queryStems.end() )
      resultsData.queryStemIndex[gram->term] = tmpTermID;
    resultsData.ctfVector(tmpTermID) = environment.stemCount(gram->term);
    resultsData.dfVector(tmpTermID) = environment.documentStemCount(gram->term);
    size_t c, r;
    // `counts` is a sparse list ordered by result index; walk both in lockstep
    // and copy each (result, count) pair into the dense matrix column.
    for( r = 0, c = 0; r < results.size() && c < gramCounts->counts.size(); r++ ) {
      if( gramCounts->counts[c].first == r ) {
        resultsData.tfMatrix(r, tmpTermID) = gramCounts->counts[c].second;
        c++;
      }
    }
  }
  _gramTable.clear();
}
// Runs a query and returns a TREC-style R data frame with one row per
// retrieved document: topic, "Q0", external docID, rank, score, and run ID.
//
// BUG FIXES vs. the original:
//  * the QueryAnnotation returned by runAnnotatedQuery() was never deleted
//    (leak per call) — it is freed once the results have been copied out;
//  * the q0/rank/runID columns were sized by _documentLimit while docID and
//    score were sized by the actual result count, so any query returning
//    fewer than _documentLimit hits produced unequal column lengths;
//  * an unused res_qno vector (built from the member `qno`, not `_qno`) has
//    been removed.
SEXP runQuery(string _qno, string _query, int _documentLimit, string _runid="default"){
  indri::api::QueryAnnotation* qa = environment.runAnnotatedQuery(_query, _documentLimit);
  std::vector<indri::api::ScoredExtentResult> results = qa->getResults();
  // Extract internal document ids and retrieval scores.
  std::vector<lemur::api::DOCID_T> documentIDs;
  std::vector<double> scores;
  documentIDs.reserve(results.size());
  scores.reserve(results.size());
  for (size_t i = 0; i < results.size(); i++){
    documentIDs.push_back(results[i].document);
    scores.push_back(results[i].score);
  }
  delete qa; // runAnnotatedQuery transfers ownership to the caller
  // Size every column by the number of hits actually returned.
  const int numResults = static_cast<int>(results.size());
  vector<string> res_q0(numResults, "Q0");
  vector<string> res_runid(numResults, _runid);
  std::vector<string> extDocIDs = environment.documentMetadata(documentIDs, "docno");
  // seq_len(0) is empty, unlike seq(1, 0), so zero-hit queries stay valid.
  return Rcpp::DataFrame::create(
      Named("topic") = _qno,
      Named("q0") = res_q0,
      Named("docID") = wrap(extDocIDs),
      Named("rank") = Rcpp::seq_len(numResults),
      Named("score") = wrap(scores),
      Named("runID") = res_runid);
}
// Opens an Indri repository: attaches either a remote query server or a
// local index directory, translating any failure into an R-level error.
Index(string _indexPath, bool _server) {
  try {
    _server ? environment.addServer(_indexPath)
            : environment.addIndex(_indexPath);
  }
  catch (std::exception& stdErr) {
    forward_exception_to_r(stdErr);
  }
  catch (lemur::api::Exception& lemurErr) {
    ::Rf_error("Unable to open index");
  }
  catch (...) {
    ::Rf_error("Caught unhandled exception");
  }
}
// Scores a single candidate concept (semantic-type/string pair) for query
// `qId`: the query is reformulated with the concept added, re-run over the
// given working set of top-ranked documents, and the document scores are
// summed. Results are memoized through wsuIr::expander::Utility's cache.
// Throws runtime_error if any working-set document goes unscored.
double indri::query::ConceptSelectorFuns::findConceptScorePrf(string conceptSty, string conceptStr, string qId, std::vector<lemur::api::DOCID_T> topDocIds, indri::api::QueryEnvironment & env, indri::query::QueryReformulator * queryReformulator, vector<string> resourceNames_) {
  // runQuery the new query text on these workset of top-ranked documents
  vector<pair<string, vector<pair<string, string> > > > candConcepts_;
  vector<pair<string, string> > tmp = {make_pair(conceptSty, conceptStr)};
  candConcepts_ = {make_pair(qId, tmp )};
  // NOTE(review): writes to what appears to be shared/global state
  // (oneResourceConceptsParams) — confirm this is intended and single-threaded.
  oneResourceConceptsParams.oneResourceConcepts = candConcepts_;
  // Build the candidate query text with this one concept added.
  vector<pair<string, string> > queriesText = queryReformulator->testOneConceptAddition2OneQuery(conceptSty, conceptStr, qId, resourceNames_);
  std::vector< indri::api::ScoredExtentResult > results_;
  if(wsuIr::expander::Utility::runQuery_results_isExist(queriesText, topDocIds)) {
    // Cache hit: reuse previously computed results for this query/workset.
    results_ = wsuIr::expander::Utility::runQuery_results_get(queriesText, topDocIds);
  }
  else {
    results_ = env.runQuery(queriesText.front().second, topDocIds, topDocIds.size());
    wsuIr::expander::Utility::runQuery_results_store(queriesText, topDocIds, results_);
  }
  // Every working-set document must be scored or the sum would be biased.
  if(results_.size() != topDocIds.size())
    throw runtime_error("RunQUery.cpp: some of top-ranked documents are not scored");
  // Concept score = sum of retrieval scores over the working set.
  double conceptScore = 0;
  for(auto r: results_) {
    conceptScore += r.score;
  }
  return conceptScore;
}
// Returns the document-term matrix of the last generateResults() call under
// the requested weighting scheme ("tf", "tf_normalized", or "tfidf"), with
// external doc IDs as row names and terms as column names.
//
// BUG FIX: the original fell off the end of this non-void function for the
// never-implemented "idf" option and for any unknown weighting value —
// undefined behavior. Unsupported values now raise an R error instead.
SEXP getDocTermMatrix(string termWeighting){
  Rcpp::List dimnms = Rcpp::List::create(extDocIDs, terms);
  if(termWeighting == "tf"){
    // Raw term frequencies.
    NumericMatrix d = Rcpp::wrap(resultsData.tfMatrix);
    d.attr("dimnames") = dimnms;
    return d;
  }else if(termWeighting == "tf_normalized"){
    // Term frequencies divided by document length (column sums per row).
    arma::mat tfnorm = resultsData.tfMatrix;
    arma::rowvec docLen = arma::sum(tfnorm, 0);
    tfnorm.each_row() /= docLen;
    NumericMatrix d = Rcpp::wrap(tfnorm);
    d.attr("dimnames") = dimnms;
    return d;
  }else if(termWeighting == "tfidf"){
    // tf * idf with +1 on the document count and +0.5 smoothing on df,
    // matching getTermStats().
    arma::mat tfidfMat = resultsData.tfMatrix;
    arma::vec idf = arma::log((environment.documentCount() + 1) / (resultsData.dfVector + 0.5));
    tfidfMat.each_row() %= idf.t();
    NumericMatrix d = Rcpp::wrap(tfidfMat);
    d.attr("dimnames") = dimnms;
    return d;
  }
  // "idf" was never implemented; fail loudly rather than return garbage.
  Rcpp::stop("unsupported termWeighting: '" + termWeighting + "'");
}
// Runs the query, expanding it if necessary. Will print output as well if verbose is on. void _runQuery( std::stringstream& output, const std::string& query, const std::string &queryType, const std::vector<std::string> &workingSet, std::vector<std::string> relFBDocs ) { try { if( _printQuery ) output << "# query: " << query << std::endl; std::vector<lemur::api::DOCID_T> docids;; if (workingSet.size() > 0) docids = _environment.documentIDsFromMetadata("docno", workingSet); if (relFBDocs.size() == 0) { if( _printSnippets ) { if (workingSet.size() > 0) _annotation = _environment.runAnnotatedQuery( query, docids, _initialRequested, queryType ); else _annotation = _environment.runAnnotatedQuery( query, _initialRequested ); _results = _annotation->getResults(); } else { if (workingSet.size() > 0) _results = _environment.runQuery( query, docids, _initialRequested, queryType ); else _results = _environment.runQuery( query, _initialRequested, queryType ); } } if( _expander ) { std::vector<indri::api::ScoredExtentResult> fbDocs; if (relFBDocs.size() > 0) { docids = _environment.documentIDsFromMetadata("docno", relFBDocs); for (size_t i = 0; i < docids.size(); i++) { indri::api::ScoredExtentResult r(0.0, docids[i]); fbDocs.push_back(r); } } std::string expandedQuery; if (relFBDocs.size() != 0) expandedQuery = _expander->expand( query, fbDocs ); else expandedQuery = _expander->expand( query, _results ); if( _printQuery ) output << "# expanded: " << expandedQuery << std::endl; if (workingSet.size() > 0) { docids = _environment.documentIDsFromMetadata("docno", workingSet); _results = _environment.runQuery( expandedQuery, docids, _requested, queryType ); } else { _results = _environment.runQuery( expandedQuery, _requested, queryType ); } } } catch( lemur::api::Exception& e ) { _results.clear(); LEMUR_RETHROW(e, "QueryThread::_runQuery Exception"); } }
// Wires a query environment from a parameter set: attaches every configured
// local index and remote server, then installs optional scoring rules.
static void open_indexes( indri::api::QueryEnvironment& environment, indri::api::Parameters& param ) {
  // Local repositories listed under "index".
  if( param.exists( "index" ) ) {
    indri::api::Parameters indexes = param["index"];
    for( unsigned int idx = 0; idx < indexes.size(); ++idx )
      environment.addIndex( std::string( indexes[idx] ) );
  }
  // Remote daemons listed under "server".
  if( param.exists( "server" ) ) {
    indri::api::Parameters servers = param["server"];
    for( unsigned int idx = 0; idx < servers.size(); ++idx )
      environment.addServer( std::string( servers[idx] ) );
  }
  // Smoothing/scoring rules are optional.
  std::vector<std::string> smoothingRules;
  if( copy_parameters_to_string_vector( smoothingRules, param, "rule" ) )
    environment.setScoringRules( smoothingRules );
}
// how to just compute the clarity score without printing out the terms. static double clarity( const std::string& query, indri::api::QueryEnvironment & env, const std::vector<indri::query::RelevanceModel::Gram*>& grams, int numTerms ) { int count = 0; double sum=0, ln_Pr=0; for( size_t j=0; j< numTerms && j < grams.size(); j++ ) { std::string t = grams[j]->terms[0]; count++; // query-clarity = SUM_w{P(w|Q)*log(P(w|Q)/P(w))} // P(w)=cf(w)/|C| // the relevance model uses stemmed terms, so use stemCount double pw = ((double)env.stemCount(t)/(double)env.termCount()); // P(w|Q) is a prob computed by any model, e.g. relevance models double pwq = grams[j]->weight; sum += pwq; ln_Pr += (pwq)*log(pwq/pw); } return (ln_Pr/(sum ? sum : 1.0)/log(2.0)); }
void countGrams() { std::vector<indri::api::DocumentVector*> vectors = environment.documentVectors( documentIDs ); // for each query result for( size_t i=0; i< results.size(); i++ ) { // run through the text, extracting n-grams indri::api::ScoredExtentResult& result = results[i]; indri::api::DocumentVector* v = vectors[i]; std::vector<int>& positions = v->positions(); std::vector<std::string>& stems = v->stems(); std::vector< indri::api::DocumentVector::Field >& fields = v->fields(); if (result.end == 0) result.end = positions.size(); // for each word position in the text for( int j = result.begin; j < result.end; j++ ) { //int maxGram = std::min( _maxGrams, result.end - j ); GramCounts* newCounts = new GramCounts; bool containsOOV = false; // build the gram if( positions[ j ] == 0 || (! isValid(stems[ positions[ j ] ])) ) { containsOOV = true; continue; } newCounts->gram.term = stems[ positions[ j ] ] ; if( containsOOV ) { // if this contanied OOV, all larger n-grams // starting at this point also will delete newCounts; break; } GramCounts** gramCounts = 0; gramCounts = _gramTable.find( &newCounts->gram ); if( gramCounts == 0 ) { _gramTable.insert( &newCounts->gram, newCounts ); gramCounts = &newCounts; } else { delete newCounts; } if( (*gramCounts)->counts.size() && (*gramCounts)->counts.back().first == i ) { // we already have some counts going for this query result, so just add this one (*gramCounts)->counts.back().second++; } else { // no counts yet in this document, so add an entry (*gramCounts)->counts.push_back( std::make_pair( i, 1 ) ); } } } for (unsigned int i = 0; i < vectors.size(); i++) delete vectors[i]; }
// Builds a snippet (plain text or HTML) for every document of the last
// generateResults() call, using the cached query annotation `qa`. Returns an
// R character vector named by the external document IDs.
//
// BUG FIX: environment.documents() transfers ownership of each
// ParsedDocument* to the caller; the originals were never freed, leaking the
// full text of every result on every call. They are now deleted after use.
SEXP generateSnippets(bool html){
  vector<string> snippetString;
  vector< indri::api::ParsedDocument* > pdocuments = environment.documents(documentIDs);
  indri::api::SnippetBuilder sp(html);
  for( size_t row=0; row < documentIDs.size(); row++ )
    snippetString.push_back(sp.build(documentIDs[row], pdocuments[row], qa));
  // Release the parsed documents now that the snippets are built.
  for( size_t row=0; row < pdocuments.size(); row++ )
    delete pdocuments[row];
  CharacterVector c = wrap(snippetString);
  c.attr("names") = extDocIDs;
  return c;
}
// Attaches a remote Indri query server to the environment, translating any
// failure into an R-level error. Returns R TRUE on success.
//
// BUG FIX: the function is declared to return SEXP but had no return
// statement — undefined behavior on every successful call. It now reports
// success the same way as the other wrappers (e.g. generateResults).
SEXP addServer(string _server){
  try {
    environment.addServer(_server);
  } catch (std::exception &ex) {
    forward_exception_to_r(ex);
  } catch (lemur::api::Exception& e) {
    ::Rf_error("Unable to open index");
  } catch (...) {
    ::Rf_error("Caught unhandled exception");
  }
  return Rcpp::wrap(true);
}
void matIR::QueryStats::init(const std::string& query, indri::api::QueryEnvironment& environment) { // Extract only the terms from the query and add to the vector indri::api::QueryParserWrapper *parser = indri::api::QueryParserFactory::get(query, "indri"); indri::lang::ScoredExtentNode* rootNode = parser->query(); indri::lang::RawScorerNodeExtractor extractor; rootNode->walk(extractor); std::vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes(); for (int i = 0; i < scorerNodes.size(); i++){ std::string qterm = environment.stemTerm(scorerNodes[i]->queryText()); queryString.push_back(qterm); if(environment.stemCount(qterm) == 0) continue; if( _queryTokens.find(qterm) == _queryTokens.end() ) _queryTokens.insert(make_pair( qterm, 1)); else _queryTokens[qterm] += 1; } // Initialize vectors _query_collectionFrequency.set_size(_queryTokens.size()); _query_documentFrequency.set_size(_queryTokens.size()); // Now obtain the statistics int i = 0; map<std::string, int>::const_iterator iter; for (iter=_queryTokens.begin(); iter != _queryTokens.end(); ++iter) { std::string stem = environment.stemTerm(iter->first); _query_collectionFrequency(i) = (double) environment.stemCount(stem); _query_documentFrequency(i) = (double) environment.documentStemCount(stem); ++i; } }
// Parses `query` with the Indri query parser, stems each raw scorer term,
// and records its occurrence count in resultData.queryStems (first-seen
// order preserved in resultData.queryStemOrder). Stems with zero collection
// frequency are skipped.
//
// BUG FIX: the loop used an `int` index compared against scorerNodes.size()
// (signed/unsigned mismatch); it now uses size_t.
void updateQueryDetails(indri::api::QueryEnvironment& environment, Results& resultData, string query){
  indri::api::QueryParserWrapper *parser = indri::api::QueryParserFactory::get(query, "indri");
  indri::lang::ScoredExtentNode* rootNode = parser->query();
  indri::lang::RawScorerNodeExtractor extractor;
  rootNode->walk(extractor);
  vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes();
  for (size_t i = 0; i < scorerNodes.size(); i++){
    string qterm = environment.stemTerm(scorerNodes[i]->queryText());
    if(environment.stemCount(qterm) == 0)
      continue; // term does not occur anywhere in the collection
    if( resultData.queryStems.find(qterm) == resultData.queryStems.end() ){
      resultData.queryStems.insert(make_pair( qterm, 1));
      resultData.queryStemOrder.push_back(qterm);
    }
    else
      resultData.queryStems[qterm] += 1;
  }
  // NOTE(review): `parser` from QueryParserFactory::get is never released —
  // confirm ownership; this may leak per call.
}
// Returns a data frame of per-term statistics for the terms collected by the
// last generateResults(..., stats=TRUE) call: document frequency, smoothed
// IDF, and collection term frequency, with the terms as row names.
//
// CLEANUP: the original built a `statName` vector ("DocFreq"/"IDF"/"cTF")
// that was never used — dead code removed; the column names are supplied
// directly to DataFrame::create.
SEXP getTermStats(){
  // IDF with +1 on the document count and +0.5 smoothing on df, matching the
  // "tfidf" weighting in getDocTermMatrix().
  arma::vec idf = arma::log((environment.documentCount() + 1) / (resultsData.dfVector + 0.5));
  DataFrame d = DataFrame::create(Named("DocFreq") = resultsData.dfVector,
                                  Named("IDF") = idf,
                                  Named("cTF") = resultsData.ctfVector);
  d.attr("row.names") = terms;
  return d;
}
// Runs the query, expanding it if necessary. Will print output as well if verbose is on. void _runQuery( std::stringstream& output, const std::string& query, const std::string &queryType ) { try { if( _printQuery ) output << "# query: " << query << std::endl; if( _printSnippets ) { _annotation = _environment.runAnnotatedQuery( query, _initialRequested ); _results = _annotation->getResults(); } else { _results = _environment.runQuery( query, _initialRequested, queryType ); } if( _expander ) { std::string expandedQuery = _expander->expand( query, _results ); if( _printQuery ) output << "# expanded: " << expandedQuery << std::endl; _results = _environment.runQuery( expandedQuery, _requested, queryType ); } } catch( lemur::api::Exception& e ) { _results.clear(); LEMUR_RETHROW(e, "QueryThread::_runQuery Exception"); } }
// Installs a retrieval-model rule of the form "method:<method>,<parameters>"
// (e.g. "method:dirichlet,mu:2500") on the query environment.
// Always returns R NULL.
SEXP setScoringRules(string method, string parameters){
  const string rule = "method:" + method + "," + parameters;
  vector<string> rules(1, rule);
  environment.setScoringRules(rules);
  return R_NilValue;
}
// Scores every candidate concept for query `qId` over the top-ranked
// documents (via findConceptScorePrf) and returns the scores min-max
// normalized into [0, 1], keyed by normalized score.
//
// BUG FIXES vs. the original:
//  * max_sc was initialized to 0, but concept scores are sums of Indri
//    log-probabilities and are typically negative, so the computed maximum
//    was always 0 and the normalization was wrong. Both extrema now start
//    from +/- infinity.
//  * when all concepts score identically (max == min) the original divided
//    by zero; such scores now normalize to 0.
multimap<double, pair<string, string> > indri::query::ConceptSelectorFuns::normConceptScorePrf(
    vector<pair<string, string> > concatenatedGoodConcepts, string qId,
    vector<string> topDocsNames, indri::api::QueryEnvironment & env,
    indri::query::QueryReformulator * queryReformulator, vector<string> resourceNames_) {
  std::vector<lemur::api::DOCID_T> topDocIds = env.documentIDsFromMetadata("docno", topDocsNames);
  multimap<double, pair<string, string>, std::greater<double> > scoredConcepts_;
  for(auto concStyStrPair: concatenatedGoodConcepts) // for each extracted concept
  {
    string conceptSty = concStyStrPair.first;
    string conceptStr = concStyStrPair.second;
    double conceptScore = indri::query::ConceptSelectorFuns::findConceptScorePrf(conceptSty, conceptStr, qId, topDocIds, env, queryReformulator, resourceNames_);
    scoredConcepts_.insert(make_pair(conceptScore, make_pair(conceptSty, conceptStr)));
    cout << "indri::query::ConceptSelectorFuns::normConceptScorePrf: conceptScore = " << conceptStr << " -> " << conceptScore << endl;
  }
  // Find the score range over all concepts.
  double max_sc = -std::numeric_limits<double>::infinity();
  double min_sc = std::numeric_limits<double>::infinity();
  for (auto sc: scoredConcepts_) {
    max_sc = max(max_sc, sc.first);
    min_sc = min(min_sc, sc.first);
  }
  cout << "indri::query::ConceptSelectorFuns::normConceptScorePrf: min_sc, max_sc: " << min_sc << ", " << max_sc << endl;
  // min-max normalize scores in scoredConcepts_
  const double range = max_sc - min_sc;
  multimap<double, pair<string, string> > scoredConcepts_norm;
  for (auto itSc = scoredConcepts_.begin(); itSc != scoredConcepts_.end(); itSc++) {
    double conceptScore = (range > 0) ? (itSc->first - min_sc) / range : 0.0;
    scoredConcepts_norm.insert(make_pair(conceptScore, make_pair((itSc->second).first, (itSc->second).second)));
    cout << "indri::query::ConceptSelectorFuns::normConceptScorePrf: scoredConcepts_norm: scoredConcepts_ = " << itSc->first << endl;
    cout << "indri::query::ConceptSelectorFuns::normConceptScorePrf: scoredConcepts_norm: conceptScore = " << conceptScore << " = (" << itSc->first << " - " << min_sc << " )/( " << max_sc << " - " << min_sc << " )" << endl;
  }
  return scoredConcepts_norm;
}
// Reads "docno score" pairs from a text file, resolves each docno to an
// internal document id, and appends fixed-width (UINT32 docid, double score)
// records to `outfile`. Unknown docnos are skipped silently.
//
// BUG FIXES vs. the original:
//  * the loop tested `!in.eof()` — a malformed line sets failbit without
//    eofbit, so the loop spun forever re-reading nothing. It now loops on
//    successful extraction.
//  * the docid was stored in an `int` but written with sizeof(UINT32); a
//    UINT32 variable is used so the written width always matches the type.
void convert_docnoscore_to_binary( indri::file::File& outfile, const std::string& infile, indri::api::QueryEnvironment& env ) {
  std::ifstream in;
  std::string docnoName = "docno";
  indri::file::SequentialWriteBuffer* outb = new indri::file::SequentialWriteBuffer( outfile, 1024*1024 );
  in.open( infile.c_str(), std::ifstream::in );
  std::string docno;
  double score;
  while( in >> docno >> score ) {
    std::cout << "looking up: " << docno << " " << score << std::endl;
    std::vector<std::string> docnoValues;
    docnoValues.push_back( docno );
    std::vector<lemur::api::DOCID_T> result = env.documentIDsFromMetadata( docnoName, docnoValues );
    if( result.size() == 0 ) {
      // LEMUR_THROW( LEMUR_IO_ERROR, "No document exists with docno: " + docno );
      continue; // allow entries that don't exist and ignore silently.
    }
    UINT32 document = (UINT32) result[0];
    std::cout << document << std::endl;
    outb->write( (const void*) &document, sizeof(UINT32) );
    outb->write( (const void*) &score, sizeof(double) );
  }
  outb->flush();
  delete outb;
  in.close();
}
// Releases the query environment's open indexes/servers. Always returns TRUE.
SEXP closeIndex() {
  environment.close();
  return Rcpp::wrap(true);
}
// Collection frequency: total number of occurrences of `_term` in the index.
SEXP collFreq(string _term) {
  const long occurrences = environment.termCount(_term);
  return Rcpp::wrap(occurrences);
}
// Reads the retrieval parameters and prepares the query environment:
// background model, stopwords, scoring rules, indexes/servers, result-count
// and output-format flags, and the optional pseudo-relevance-feedback
// expander (TFIDF for baselines, relevance-model otherwise). Returns 0.
UINT64 initialize() {
  _environment.setSingleBackgroundModel( _parameters.get("singleBackgroundModel", false) );
  std::vector<std::string> stopwords;
  if( copy_parameters_to_string_vector( stopwords, _parameters, "stopper.word" ) )
    _environment.setStopwords(stopwords);
  std::vector<std::string> smoothingRules;
  if( copy_parameters_to_string_vector( smoothingRules, _parameters, "rule" ) )
    _environment.setScoringRules( smoothingRules );
  // Attach every configured local index and remote server.
  if( _parameters.exists( "index" ) ) {
    indri::api::Parameters indexes = _parameters["index"];
    for( size_t i=0; i < indexes.size(); i++ ) {
      _environment.addIndex( std::string(indexes[i]) );
    }
  }
  if( _parameters.exists( "server" ) ) {
    indri::api::Parameters servers = _parameters["server"];
    for( size_t i=0; i < servers.size(); i++ ) {
      _environment.addServer( std::string(servers[i]) );
    }
  }
  if( _parameters.exists("maxWildcardTerms") )
    _environment.setMaxWildcardTerms(_parameters.get("maxWildcardTerms", 100));
  // Result counts: _requested is the final count, _initialRequested the
  // (possibly larger) first-pass count when feedback documents are needed.
  _requested = _parameters.get( "count", 1000 );
  _initialRequested = _parameters.get( "fbDocs", _requested );
  _runID = _parameters.get( "runID", "indri" );
  // Output-format flags.
  _trecFormat = _parameters.get( "trecFormat" , false );
  _inexFormat = _parameters.exists( "inex" );
  _printQuery = _parameters.get( "printQuery", false );
  _printDocuments = _parameters.get( "printDocuments", false );
  _printPassages = _parameters.get( "printPassages", false );
  _printSnippets = _parameters.get( "printSnippets", false );
  if (_parameters.exists("baseline")) {
    // doing a baseline
    std::string baseline = _parameters["baseline"];
    _environment.setBaseline(baseline);
    // need a factory for this...
    if( _parameters.get( "fbDocs", 0 ) != 0 ) {
      // have to push the method in...
      std::string rule = "method:" + baseline;
      _parameters.set("rule", rule);
      _expander = new indri::query::TFIDFExpander( &_environment, _parameters );
    }
  } else {
    if( _parameters.get( "fbDocs", 0 ) != 0 ) {
      _expander = new indri::query::RMExpander( &_environment, _parameters );
    }
  }
  // NOTE(review): maxWildcardTerms is applied a second time here (already
  // set above when present) — redundant but harmless; confirm before removing.
  if (_parameters.exists("maxWildcardTerms")) {
    _environment.setMaxWildcardTerms((int)_parameters.get("maxWildcardTerms"));
  }
  return 0;
}
// Tears down the query thread: frees the feedback expander first (it may be
// NULL — deleting a null pointer is a no-op — and it holds a pointer into
// _environment, so it must go before the environment is closed).
void deinitialize() {
  delete _expander;
  _environment.close();
}
// Total number of documents in the collection.
SEXP getDocCount() {
  const long numDocs = environment.documentCount();
  return Rcpp::wrap(numDocs);
}
// Prints results [start, end) of _results to `output` in the configured
// format (TREC, INEX, or plain tab-separated), optionally followed by the
// full document text, the matched passage bytes, and/or a snippet per hit.
void _printResultRegion( std::stringstream& output, std::string queryIndex, int start, int end ) {
  std::vector<std::string> documentNames;
  std::vector<indri::api::ParsedDocument*> documents;
  std::vector<indri::api::ScoredExtentResult> resultSubset;
  resultSubset.assign( _results.begin() + start, _results.begin() + end );
  // Fetch document data for printing
  if( _printDocuments || _printPassages || _printSnippets ) {
    // Need document text, so we'll fetch the whole document
    documents = _environment.documents( resultSubset );
    documentNames.clear();
    for( size_t i=0; i<resultSubset.size(); i++ ) {
      indri::api::ParsedDocument* doc = documents[i];
      std::string documentName;
      // Pull the "docno" metadata field out of the parsed document.
      indri::utility::greedy_vector<indri::parse::MetadataPair>::iterator iter = std::find_if( documents[i]->metadata.begin(), documents[i]->metadata.end(), indri::parse::MetadataPair::key_equal( "docno" ) );
      if( iter != documents[i]->metadata.end() )
        documentName = (char*) iter->value;
      // store the document name in a separate vector so later code can find it
      documentNames.push_back( documentName );
    }
  } else {
    // We only want document names, so the documentMetadata call may be faster
    documentNames = _environment.documentMetadata( resultSubset, "docno" );
  }
  std::vector<std::string> pathNames;
  if ( _inexFormat ) {
    // retrieve path names
    pathNames = _environment.pathNames( resultSubset );
  }
  // Print results
  for( size_t i=0; i < resultSubset.size(); i++ ) {
    int rank = start+i+1; // ranks are 1-based and global across regions
    std::string queryNumber = queryIndex;
    if( _trecFormat ) {
      // TREC formatted output: queryNumber, Q0, documentName, rank, score, runID
      output << queryNumber << " " << "Q0 " << documentNames[i] << " " << rank << " " << resultSubset[ i ].score << " " << _runID << std::endl;
    } else if( _inexFormat ) {
      output << " <result>" << std::endl << " <file>" << documentNames[i] << "</file>" << std::endl << " <path>" << pathNames[i] << "</path>" << std::endl << " <rsv>" << resultSubset[i].score << "</rsv>" << std::endl << " </result>" << std::endl;
    } else
    {
      // score, documentName, firstWord, lastWord
      output << resultSubset[i].score << "\t" << documentNames[i] << "\t" << resultSubset[i].begin << "\t" << resultSubset[i].end << std::endl;
    }
    if( _printDocuments ) {
      output << documents[i]->text << std::endl;
    }
    if( _printPassages ) {
      // Map token positions back to byte offsets to emit the raw passage.
      int byteBegin = documents[i]->positions[ resultSubset[i].begin ].begin;
      int byteEnd = documents[i]->positions[ resultSubset[i].end-1 ].end;
      output.write( documents[i]->text + byteBegin, byteEnd - byteBegin );
      output << std::endl;
    }
    if( _printSnippets ) {
      indri::api::SnippetBuilder builder(false);
      output << builder.build( resultSubset[i].document, documents[i], _annotation ) << std::endl;
    }
    // `documents` is only populated when document text was fetched above;
    // each ParsedDocument is owned by us and freed once printed.
    if( documents.size() ) delete documents[i];
  }
}
// Total number of term occurrences in the collection.
SEXP getTermCount() {
  const long totalTerms = environment.termCount();
  return Rcpp::wrap(totalTerms);
}
// Applies the index's stemmer to a single term and returns the stem.
SEXP stemTerm(string _term) {
  const string stemmed = environment.stemTerm(_term);
  return Rcpp::wrap(stemmed);
}
// Document frequency: number of documents in which `_term` occurs.
SEXP docFreq(string _term) {
  const long numDocs = environment.documentCount(_term);
  return Rcpp::wrap(numDocs);
}
// Lengths (in indexed terms) of the documents retrieved by the last
// generateResults() call, in ranked order.
SEXP getDocumentLengths(){
  vector<int> lengths;
  lengths.reserve(documentIDs.size());
  for (size_t idx = 0; idx < documentIDs.size(); ++idx)
    lengths.push_back(environment.documentLength(documentIDs[idx]));
  return wrap(lengths);
}
// Fetches the given metadata field for every document of the last
// generateResults() call; returns an R character vector named by the
// external document IDs.
SEXP getMetaData(string metaDataKey){
  vector<string> values = environment.documentMetadata(documentIDs, metaDataKey);
  CharacterVector out = wrap(values);
  out.attr("names") = extDocIDs;
  return out;
}