Beispiel #1
0
    /**
     * Runs an annotated query and caches its results in this object.
     *
     * Stores the query number, text and limit, resets the per-query members,
     * executes the query, passes the results through _logtoposterior(), and
     * records the internal document ID, score and "docno" metadata of each hit.
     *
     * @param _qno            query number, stored in `qno`
     * @param _query          query text, stored in `query` and executed
     * @param _documentLimit  result limit handed to runAnnotatedQuery()
     * @param stats           when true, also runs updateQueryDetails(),
     *                        countGrams() and buildStats()
     * @return always an R logical TRUE
     */
    SEXP generateResults(string _qno, string _query, int _documentLimit, bool stats) {
        // Remember what was asked for.
        qno = _qno;
        query = _query;
        documentLimit = _documentLimit;

        // Drop everything left over from the previous query.
        resultsData = resultsData_nullCopy;
        results.clear();
        documentIDs.clear();
        extDocIDs.clear();
        scores.clear();
        terms.clear();
        _gramTable.clear();

        // NOTE(review): the previous value of `qa` is overwritten without being
        // released here -- confirm who owns the annotation.
        qa = environment.runAnnotatedQuery(query, _documentLimit);
        results = qa->getResults();
        _logtoposterior(results);

        // Split every hit into its internal document ID and its score.
        const size_t hitCount = results.size();
        for (size_t hit = 0; hit != hitCount; ++hit) {
            documentIDs.push_back(results[hit].document);
            scores.push_back(results[hit].score);
        }
        extDocIDs = environment.documentMetadata(documentIDs, "docno");

        if (!stats)
            return Rcpp::wrap(true);

        updateQueryDetails(environment, resultsData, query);
        countGrams();
        buildStats();
        return Rcpp::wrap(true);
    }
Beispiel #2
0
    void buildStats() {
        HGram::iterator iter;
        resultsData.tfMatrix = arma::zeros<arma::mat>(results.size(),
                                                      _gramTable.size());
        // Initialize the
        resultsData.dfVector.set_size(_gramTable.size());
        resultsData.ctfVector.set_size(_gramTable.size());

        int tmpTermID = -1;
        for( iter = _gramTable.begin(); iter != _gramTable.end(); iter++ ) {
            double gramCount = 0;
            ++tmpTermID;
            Gram* gram = *iter->first;
            GramCounts* gramCounts = *iter->second;
            gram->internal_termID = tmpTermID;
            terms.push_back(gram->term);
             if( resultsData.queryStems.find(gram->term) != resultsData.queryStems.end() )
                resultsData.queryStemIndex[gram->term] = tmpTermID;

            resultsData.ctfVector(tmpTermID) = environment.stemCount(gram->term);
            resultsData.dfVector(tmpTermID) =  environment.documentStemCount(gram->term);
            size_t c, r;
            for( r = 0, c = 0; r < results.size() && c < gramCounts->counts.size(); r++ ) {
                if( gramCounts->counts[c].first == r ) {
                    resultsData.tfMatrix(r, tmpTermID) = gramCounts->counts[c].second;
                    c++;
                }
            }
        }
        _gramTable.clear();
    }
Beispiel #3
0
    /**
     * Runs a query and returns the ranked hits as a TREC-style data frame.
     *
     * The frame has one row per returned document with the columns
     * topic, q0, docID, rank, score and runID. The number of rows is the
     * number of documents actually returned, which may be smaller than
     * `_documentLimit` (including zero).
     *
     * @param _qno            topic identifier written to the "topic" column
     * @param _query          query text to execute
     * @param _documentLimit  maximum number of documents to retrieve
     * @param _runid          run identifier written to the "runID" column
     * @return an Rcpp data frame describing the ranked list
     */
    SEXP runQuery(string _qno, string _query, int _documentLimit, string _runid="default"){
        indri::api::QueryAnnotation* qa =
                environment.runAnnotatedQuery(_query, _documentLimit);

        // Copy the results so the annotation can be released straight away;
        // the caller of runAnnotatedQuery() owns the returned object.
        std::vector<indri::api::ScoredExtentResult> results = qa->getResults();
        delete qa;
        //_logtoposterior(results);

        // Every column is sized by the hits that came back, not by the
        // requested limit, so all columns always have the same length.
        const size_t resultCount = results.size();

        // Extract Documents
        std::vector<lemur::api::DOCID_T> documentIDs;
        std::vector<double> scores;
        std::vector<int> ranks;
        documentIDs.reserve(resultCount);
        scores.reserve(resultCount);
        ranks.reserve(resultCount);
        for (size_t i = 0; i < resultCount; i++){
            documentIDs.push_back(results[i].document);
            scores.push_back(results[i].score);
            ranks.push_back(static_cast<int>(i) + 1);
        }

        vector<string> res_qno(resultCount, _qno);
        vector<string> res_q0(resultCount, "Q0");
        vector<string> res_runid(resultCount, _runid);

        std::vector<string> extDocIDs = environment.documentMetadata(documentIDs, "docno");
        return Rcpp::DataFrame::create( Named("topic")= res_qno,
                Named("q0")= res_q0, Named("docID")=  wrap(extDocIDs),
                Named("rank")= wrap(ranks),
                Named("score")= wrap(scores),
                Named("runID")= res_runid);
    }
Beispiel #4
0
 /**
  * Attaches the query environment to a data source.
  *
  * @param _indexPath  value passed to addServer() when `_server` is true,
  *                    otherwise to addIndex()
  * @param _server     selects between addServer() and addIndex()
  *
  * Failures are reported to R: standard exceptions are forwarded as they
  * are, Lemur exceptions and anything else become a fixed R error message.
  */
 Index(string _indexPath, bool _server) {
     try {
         if (!_server) {
             environment.addIndex(_indexPath);
         } else {
             environment.addServer(_indexPath);
         }
     } catch (std::exception &failure) {
         forward_exception_to_r(failure);
     } catch (lemur::api::Exception&) {
         ::Rf_error("Unable to open index");
     } catch (...) {
         ::Rf_error("Caught unhandled exception");
     }
 }
Beispiel #5
0
/**
 * Scores one candidate concept against a working set of top-ranked documents.
 *
 * The concept is registered in oneResourceConceptsParams, the query
 * reformulator produces the query text for adding that concept to query
 * `qId`, and the first produced query is scored over `topDocIds`. Results
 * are looked up through the Utility result store first and stored there
 * after a fresh run.
 *
 * @return the sum of the scores of all returned documents
 * @throws runtime_error when the number of results differs from the number
 *         of documents in `topDocIds`
 */
double indri::query::ConceptSelectorFuns::findConceptScorePrf(string conceptSty, string conceptStr, string qId, std::vector<lemur::api::DOCID_T> topDocIds,
        indri::api::QueryEnvironment & env,
        indri::query::QueryReformulator * queryReformulator,
        vector<string> resourceNames_)
{
    // Register the single (type, string) concept under this query id.
    const vector<pair<string, string> > conceptPairs(1, make_pair(conceptSty, conceptStr));
    vector<pair<string, vector<pair<string, string> > > > candConcepts_;
    candConcepts_.push_back(make_pair(qId, conceptPairs));
    oneResourceConceptsParams.oneResourceConcepts = candConcepts_;

    vector<pair<string, string> > queriesText = queryReformulator->testOneConceptAddition2OneQuery(conceptSty, conceptStr, qId, resourceNames_);

    // Reuse stored results when present; otherwise run the query over the
    // working set and store what came back.
    std::vector< indri::api::ScoredExtentResult > results_;
    const bool alreadyStored = wsuIr::expander::Utility::runQuery_results_isExist(queriesText, topDocIds);
    if(!alreadyStored)
    {
        results_ = env.runQuery(queriesText.front().second, topDocIds, topDocIds.size());
        wsuIr::expander::Utility::runQuery_results_store(queriesText, topDocIds, results_);
    }
    else
    {
        results_ = wsuIr::expander::Utility::runQuery_results_get(queriesText, topDocIds);
    }

    if(results_.size() != topDocIds.size())
        throw runtime_error("RunQUery.cpp: some of top-ranked documents are not scored");

    double total = 0;
    for(size_t pos = 0; pos < results_.size(); ++pos)
        total += results_[pos].score;

    return total;
}
Beispiel #6
0
    /**
     * Returns the document-by-term matrix of the current results as an R
     * numeric matrix whose dimnames are the external document IDs and terms.
     *
     * @param termWeighting  one of
     *        "tf"            - the stored term-frequency matrix,
     *        "tf_normalized" - term frequencies divided by a per-column sum,
     *        "tfidf"         - term frequencies multiplied by
     *                          log((documentCount + 1) / (df + 0.5)).
     * @return the weighted matrix
     * @throws an R error (via Rcpp::stop) for any other weighting, including
     *         "idf", which is not implemented. Previously these cases fell
     *         off the end of the function without returning a value.
     */
    SEXP getDocTermMatrix(string termWeighting){
        Rcpp::List dimnms = Rcpp::List::create(extDocIDs, terms);

        arma::mat weighted = resultsData.tfMatrix;
        if(termWeighting == "tf"){
            // raw term frequencies, nothing to do
        }else if(termWeighting == "tf_normalized"){
            // NOTE(review): sum(X, 0) sums each column (one total per term),
            // although the variable name suggests per-document lengths --
            // confirm which normalisation is intended.
            arma::rowvec docLen = arma::sum(weighted, 0);
            weighted.each_row() /= docLen;
        }else if(termWeighting == "tfidf"){
            arma::vec idf = arma::log((environment.documentCount() + 1) /
                    (resultsData.dfVector + 0.5));
            weighted.each_row() %= idf.t();
        }else if(termWeighting == "idf"){
            Rcpp::stop("Term weighting 'idf' is not implemented");
        }else{
            Rcpp::stop("Unsupported term weighting: " + termWeighting);
        }

        NumericMatrix d = Rcpp::wrap(weighted);
        d.attr("dimnames") = dimnms;
        return d;
    }
Beispiel #7
0
  // Runs the query, expanding it if necessary.  Will print output as well if verbose is on.
  //
  // output     - stream that receives the "# query:" / "# expanded:" lines
  //              when _printQuery is set
  // query      - query text to run
  // queryType  - passed through to the environment's query calls
  // workingSet - optional "docno" values; when non-empty, scoring is
  //              restricted to the matching internal document IDs
  // relFBDocs  - optional "docno" values of feedback documents; when
  //              non-empty, the initial retrieval is skipped and these
  //              documents are handed to the expander instead
  //
  // The outcome is stored in _results (and _annotation when snippets are
  // requested). On a Lemur exception _results is cleared and the exception is
  // rethrown.
  void _runQuery( std::stringstream& output, const std::string& query,
                  const std::string &queryType, const std::vector<std::string> &workingSet, std::vector<std::string> relFBDocs ) {
    try {
      if( _printQuery ) output << "# query: " << query << std::endl;
      std::vector<lemur::api::DOCID_T> docids;;
      // Translate the working set into internal document IDs.
      if (workingSet.size() > 0) 
        docids = _environment.documentIDsFromMetadata("docno", workingSet);

      // Initial retrieval, only done when no explicit feedback documents
      // were supplied.
      if (relFBDocs.size() == 0) {
          if( _printSnippets ) {
            // Snippets need the query annotation, so run the annotated variant.
            // Note that the call without a working set does not pass queryType.
            if (workingSet.size() > 0) 
              _annotation = _environment.runAnnotatedQuery( query, docids, _initialRequested, queryType ); 
            else
              _annotation = _environment.runAnnotatedQuery( query, _initialRequested );
            _results = _annotation->getResults();
          } else {
            if (workingSet.size() > 0)
              _results = _environment.runQuery( query, docids, _initialRequested, queryType );
            else
              _results = _environment.runQuery( query, _initialRequested, queryType );
          }
      }
      
      // NOTE(review): when relFBDocs is non-empty and no expander is set,
      // _results is not assigned anywhere in this function.
      if( _expander ) {
        std::vector<indri::api::ScoredExtentResult> fbDocs;
        if (relFBDocs.size() > 0) {
          // Wrap every supplied feedback document in a result with score 0.0.
          // docids is reused here and recomputed for the working set below.
          docids = _environment.documentIDsFromMetadata("docno", relFBDocs);
          for (size_t i = 0; i < docids.size(); i++) {
            indri::api::ScoredExtentResult r(0.0, docids[i]);
            fbDocs.push_back(r);
          }
        }
        // Expand from the supplied feedback documents if there are any,
        // otherwise from the initial retrieval.
        std::string expandedQuery;
        if (relFBDocs.size() != 0)
          expandedQuery = _expander->expand( query, fbDocs );
        else
          expandedQuery = _expander->expand( query, _results );
        if( _printQuery ) output << "# expanded: " << expandedQuery << std::endl;
        // Final retrieval with the expanded query, again honouring the working set.
        if (workingSet.size() > 0) {
          docids = _environment.documentIDsFromMetadata("docno", workingSet);
          _results = _environment.runQuery( expandedQuery, docids, _requested, queryType );
        } else {
          _results = _environment.runQuery( expandedQuery, _requested, queryType );
        }
      }
    }
    catch( lemur::api::Exception& e )
    {
      // Do not leave partial results behind before propagating the error.
      _results.clear();
      LEMUR_RETHROW(e, "QueryThread::_runQuery Exception");
    }
  }
Beispiel #8
0
// Configures a query environment from a parameter set: adds every "index"
// entry as an index, every "server" entry as a server, and installs the
// "rule" entries as scoring rules when copy_parameters_to_string_vector()
// reports success for them.
static void open_indexes( indri::api::QueryEnvironment& environment, 
                          indri::api::Parameters& param ) {
  // index entries
  if( param.exists( "index" ) ) {
    indri::api::Parameters repositories = param["index"];
    unsigned int which = 0;
    while( which < repositories.size() ) {
      environment.addIndex( std::string(repositories[which]) );
      which++;
    }
  }

  // server entries
  if( param.exists( "server" ) ) {
    indri::api::Parameters daemons = param["server"];
    unsigned int which = 0;
    while( which < daemons.size() ) {
      environment.addServer( std::string(daemons[which]) );
      which++;
    }
  }

  // scoring rules
  std::vector<std::string> smoothingRules;
  const bool haveRules = copy_parameters_to_string_vector( smoothingRules, param, "rule" );
  if( haveRules )
    environment.setScoringRules( smoothingRules );
}
Beispiel #9
0
// Computes the clarity score of a query without printing out the terms.
//
//   query-clarity = SUM_w{ P(w|Q) * log2( P(w|Q) / P(w) ) }
//
// where P(w) = cf(w)/|C| is taken from the collection (stem counts, because
// the relevance model uses stemmed terms) and P(w|Q) is the weight of the
// gram. The sum is divided by the total weight of the grams used, unless
// that total is zero.
//
// query    - unused by the computation
// env      - source of the stem counts and the collection term count
// grams    - relevance-model grams; only the first term of each is used
// numTerms - maximum number of grams to use. It is compared as an unsigned
//            value, so a negative number selects all grams.
//
// A gram with weight exactly 0 contributes nothing (0 * log 0 is taken as 0)
// instead of turning the whole score into NaN.
static double clarity( const std::string& query, 
                       indri::api::QueryEnvironment & env, 
                       const std::vector<indri::query::RelevanceModel::Gram*>& grams, int numTerms ) {

  // |C| does not change between grams, so ask the environment only once.
  const double collectionSize = (double)env.termCount();
  const size_t limit = (size_t)numTerms;

  double sum=0, ln_Pr=0;
  for( size_t j=0; j < limit && j < grams.size(); j++ ) {
    // P(w|Q) is a prob computed by any model, e.g. relevance models
    const double pwq = grams[j]->weight;
    if( pwq == 0.0 )
      continue;

    const std::string t = grams[j]->terms[0];
    // P(w)=cf(w)/|C|
    // the relevance model uses stemmed terms, so use stemCount
    const double pw = ((double)env.stemCount(t)/collectionSize);
    sum += pwq;    
    ln_Pr += (pwq)*log(pwq/pw);
  }
  return (ln_Pr/(sum ? sum : 1.0)/log(2.0));
}
Beispiel #10
0
    /**
     * Counts, for every query result, how often each valid stem occurs
     * inside the result's extent and records the counts in `_gramTable`.
     *
     * Each gram table entry holds a list of (result index, count) pairs; a
     * pair is appended the first time a stem is seen in a result and
     * incremented on every further occurrence in that same result.
     * Positions whose stem index is 0 or whose stem fails isValid() are
     * skipped. The document vectors fetched from the environment are deleted
     * before returning.
     */
    void countGrams() {
        std::vector<indri::api::DocumentVector*> vectors =
                environment.documentVectors( documentIDs );
        // for each query result
        for( size_t i=0; i< results.size(); i++ ) {
            indri::api::ScoredExtentResult& result = results[i];
            indri::api::DocumentVector* v = vectors[i];
            std::vector<int>& positions = v->positions();
            std::vector<std::string>& stems = v->stems();
            // An end of 0 is replaced by the number of positions.
            if (result.end == 0) result.end = positions.size();
            // for each word position in the text
            for( int j = result.begin; j < result.end; j++ ) {
                // Reject out-of-vocabulary or invalid stems before allocating
                // anything, so a skipped position cannot leak a GramCounts.
                if( positions[ j ] == 0 || (! isValid(stems[ positions[ j ] ])) )
                    continue;

                GramCounts* newCounts = new GramCounts;
                newCounts->gram.term =  stems[ positions[ j ] ] ;

                GramCounts** gramCounts = _gramTable.find( &newCounts->gram );
                if( gramCounts == 0 ) {
                    // first occurrence anywhere: the table keeps the new entry
                    _gramTable.insert( &newCounts->gram, newCounts );
                    gramCounts = &newCounts;
                } else {
                    // already known: the probe object is no longer needed
                    delete newCounts;
                }
                if( (*gramCounts)->counts.size() && (*gramCounts)->counts.back().first == i ) {
                    // we already have some counts going for this query result, so just add this one
                    (*gramCounts)->counts.back().second++;
                } else {
                    // no counts yet in this document, so add an entry
                    (*gramCounts)->counts.push_back( std::make_pair( i, 1 ) );
                }
            }
        }
        for (unsigned int i = 0; i < vectors.size(); i++)
            delete vectors[i];
    }
Beispiel #11
0
    /**
     * Builds one snippet per cached result document.
     *
     * @param html  passed to the SnippetBuilder constructor
     * @return an R character vector of snippets named by the external
     *         document IDs
     *
     * The parsed documents fetched from the environment are deleted once all
     * snippets have been built; the caller of documents() owns them.
     */
    SEXP generateSnippets(bool html){
        vector<string> snippetString;
        vector< indri::api::ParsedDocument* > pdocuments = environment.documents(documentIDs);
        indri::api::SnippetBuilder sp(html);

        snippetString.reserve(documentIDs.size());
        for( size_t row=0; row < documentIDs.size(); row++ )
            snippetString.push_back(sp.build(documentIDs[row], pdocuments[row], qa));

        // The snippets are plain strings, so the documents can go now.
        for( size_t row=0; row < pdocuments.size(); row++ )
            delete pdocuments[row];

        CharacterVector c = wrap(snippetString);
        c.attr("names") = extDocIDs;
        return c;
    }
Beispiel #12
0
 /**
  * Adds a server to the query environment.
  *
  * @param _server  value passed to the environment's addServer()
  * @return R NULL on success
  *
  * Failures are reported to R: standard exceptions are forwarded as they
  * are, Lemur exceptions and anything else become a fixed R error message.
  */
 SEXP addServer(string _server){
     try {
         environment.addServer(_server);
     } catch (std::exception &ex) {
         forward_exception_to_r(ex);
     } catch (lemur::api::Exception& e) {
         ::Rf_error("Unable to add server");
     } catch (...) {
         ::Rf_error("Caught unhandled exception");
     }
     // A SEXP function must return a value on the success path.
     return R_NilValue;
 }
Beispiel #13
0
void matIR::QueryStats::init(const std::string& query, indri::api::QueryEnvironment& environment)
{

    // Extract only the terms from the query and add to the vector
    indri::api::QueryParserWrapper *parser = indri::api::QueryParserFactory::get(query, "indri");
    indri::lang::ScoredExtentNode* rootNode = parser->query();
    indri::lang::RawScorerNodeExtractor extractor;
    rootNode->walk(extractor);
    std::vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes();

    for (int i = 0; i < scorerNodes.size(); i++){
        std::string qterm = environment.stemTerm(scorerNodes[i]->queryText());
        queryString.push_back(qterm);
        if(environment.stemCount(qterm) == 0)
            continue;
        if( _queryTokens.find(qterm) == _queryTokens.end() )
            _queryTokens.insert(make_pair( qterm, 1));
        else
            _queryTokens[qterm] += 1;
    }

    // Initialize vectors


    _query_collectionFrequency.set_size(_queryTokens.size());
    _query_documentFrequency.set_size(_queryTokens.size());



    // Now obtain the statistics
    int i = 0;
    map<std::string, int>::const_iterator iter;
    for (iter=_queryTokens.begin(); iter != _queryTokens.end(); ++iter) {
        std::string stem = environment.stemTerm(iter->first);
        _query_collectionFrequency(i) = (double) environment.stemCount(stem);
        _query_documentFrequency(i) = (double) environment.documentStemCount(stem);
        ++i;

    }
}
Beispiel #14
0
    /**
     * Records the stems of a query in a Results object.
     *
     * The query is parsed as an "indri" query and every raw scorer node is
     * stemmed. Stems with a collection count of zero are ignored. The first
     * time a stem is seen it is stored in `queryStems` with a count of 1 and
     * appended to `queryStemOrder`; repeated stems only increase the count.
     *
     * @param environment  source of stemming and stem counts
     * @param resultData   object whose queryStems / queryStemOrder are updated
     * @param query        query text to analyse
     */
    void updateQueryDetails(indri::api::QueryEnvironment& environment,
                            Results& resultData,
                            string query){

        // NOTE(review): the parser returned by the factory is not released in
        // this function -- confirm ownership.
        indri::api::QueryParserWrapper *parser = indri::api::QueryParserFactory::get(query, "indri");
        indri::lang::ScoredExtentNode* rootNode = parser->query();

        // Walk the parse tree to find the nodes that carry query text.
        indri::lang::RawScorerNodeExtractor extractor;
        rootNode->walk(extractor);
        vector<indri::lang::RawScorerNode*>& scorerNodes = extractor.getScorerNodes();

        for (int node = 0; node < scorerNodes.size(); node++){
            string qterm = environment.stemTerm(scorerNodes[node]->queryText());
            if(environment.stemCount(qterm) != 0){
                if( resultData.queryStems.find(qterm) != resultData.queryStems.end() ){
                    // seen before: just count it
                    resultData.queryStems[qterm] += 1;
                }
                else{
                    // first sighting: start the count and remember the order
                    resultData.queryStems.insert(make_pair( qterm, 1));
                    resultData.queryStemOrder.push_back(qterm);
                }
            }
        }
    }
Beispiel #15
0
    /**
     * Returns the statistics of the cached terms as an R data frame.
     *
     * Columns:
     *   DocFreq - the stored document-frequency vector
     *   IDF     - log((documentCount + 1) / (DocFreq + 0.5))
     *   cTF     - the stored collection term-frequency vector
     * The row names are the cached terms.
     */
    SEXP getTermStats(){
        arma::vec idf = arma::log((environment.documentCount() + 1) /
                (resultsData.dfVector + 0.5));
        DataFrame d = DataFrame::create(Named("DocFreq")=resultsData.dfVector,
                                        Named("IDF")=idf,
                                        Named("cTF")=resultsData.ctfVector);
        d.attr("row.names") = terms;
        return d;
    }
Beispiel #16
0
  // Executes the query and leaves the outcome in _results. With _printQuery
  // set, the original and the expanded query text are written to `output`.
  // With _printSnippets set, the annotated query variant is used and the
  // annotation is kept in _annotation. If an expander is configured, the
  // query is expanded from the initial results and run a second time with
  // the _requested limit. A Lemur exception clears _results and is rethrown.
  void _runQuery( std::stringstream& output, const std::string& query,
                  const std::string &queryType ) {
    try {
      if( _printQuery ) output << "# query: " << query << std::endl;

      // Initial retrieval.
      if( !_printSnippets ) {
        _results = _environment.runQuery( query, _initialRequested, queryType );
      } else {
        _annotation = _environment.runAnnotatedQuery( query, _initialRequested );
        _results = _annotation->getResults();
      }

      // Nothing more to do without an expander.
      if( !_expander )
        return;

      // Expansion pass: rewrite the query, then retrieve again.
      std::string rewritten = _expander->expand( query, _results );
      if( _printQuery ) output << "# expanded: " << rewritten << std::endl;
      _results = _environment.runQuery( rewritten, _requested, queryType );
    }
    catch( lemur::api::Exception& e )
    {
      _results.clear();
      LEMUR_RETHROW(e, "QueryThread::_runQuery Exception");
    }
  }
Beispiel #17
0
    /**
     * Installs a single scoring rule in the query environment.
     *
     * The rule text is "method:" followed by `method`, a comma and
     * `parameters`.
     *
     * @param method      name placed after "method:"
     * @param parameters  text appended after the comma
     * @return R NULL
     *
     * An earlier variant, kept in the history as commented-out code, routed
     * "tfidf", "Okapi" and "BM25" through setBaseline(); every method now
     * takes the scoring-rule path.
     */
    SEXP setScoringRules(string method, string parameters){
        const string rule = "method:" + method + "," + parameters;
        vector<string> scoringRules(1, rule);
        environment.setScoringRules(scoringRules);
        return R_NilValue;
    }
Beispiel #18
0
/**
 * Scores every candidate concept against the top-ranked documents and
 * min-max normalises the scores.
 *
 * @param concatenatedGoodConcepts  (type, string) pairs of the candidates
 * @param qId                       id of the query being expanded
 * @param topDocsNames              "docno" values of the top-ranked documents
 * @param env                       query environment used for scoring
 * @param queryReformulator         builds the query text for each concept
 * @param resourceNames_            forwarded to findConceptScorePrf()
 * @return concepts keyed by their normalised score,
 *         (score - min) / (max - min)
 *
 * The maximum starts at -infinity rather than 0, so sets whose scores are
 * all negative are normalised against their real maximum. When all scores
 * are equal (including a single concept) the range is zero and every
 * concept receives 0.0 instead of a NaN key.
 */
multimap<double, pair<string, string> > indri::query::ConceptSelectorFuns::normConceptScorePrf(
                                                            vector<pair<string, string> > concatenatedGoodConcepts,
                                                            string qId,
                                                            vector<string> topDocsNames,
                                                            indri::api::QueryEnvironment & env,
                                                            indri::query::QueryReformulator * queryReformulator,
                                                            vector<string> resourceNames_)
{
    std::vector<lemur::api::DOCID_T> topDocIds = env.documentIDsFromMetadata("docno", topDocsNames);
    multimap<double, pair<string, string>, std::greater<double> > scoredConcepts_;
    for(const auto& concStyStrPair: concatenatedGoodConcepts) // for each extracted concept
    {
        const string& conceptSty = concStyStrPair.first;
        const string& conceptStr = concStyStrPair.second;
        double conceptScore = indri::query::ConceptSelectorFuns::findConceptScorePrf(conceptSty,
                                                                                        conceptStr,
                                                                                        qId,
                                                                                        topDocIds,
                                                                                        env,
                                                                                        queryReformulator,
                                                                                        resourceNames_);

        scoredConcepts_.insert(make_pair(conceptScore, make_pair(conceptSty, conceptStr)));
        cout << "indri::query::ConceptSelectorFuns::normConceptScorePrf: conceptScore = " << conceptStr << " -> " << conceptScore << endl;
    }

    // Both bounds start at the opposite infinity so the first score always
    // replaces them, whatever its sign.
    double max_sc = -std::numeric_limits<double>::infinity();
    double min_sc = std::numeric_limits<double>::infinity();
    for (const auto& sc: scoredConcepts_)
    {
        max_sc = max(max_sc, sc.first);
        min_sc = min(min_sc, sc.first);
    }
    cout << "indri::query::ConceptSelectorFuns::normConceptScorePrf: min_sc, max_sc: " << min_sc << ", " << max_sc << endl;

    // min-max normalize scores in scoredConcepts_
    const double range = max_sc - min_sc;
    multimap<double, pair<string, string> > scoredConcepts_norm;
    for (auto itSc = scoredConcepts_.begin(); itSc != scoredConcepts_.end(); itSc++)
    {
        // A zero (or undefined) range cannot be divided by; use 0.0 then.
        double conceptScore = (range > 0) ? (itSc->first- min_sc)/range : 0.0;
        scoredConcepts_norm.insert(make_pair(conceptScore, make_pair((itSc->second).first, (itSc->second).second)));
        cout << "indri::query::ConceptSelectorFuns::normConceptScorePrf: scoredConcepts_norm: scoredConcepts_ = " << itSc->first  << endl;
        cout << "indri::query::ConceptSelectorFuns::normConceptScorePrf: scoredConcepts_norm: conceptScore = " << conceptScore << " =  (" << itSc->first << " - " << min_sc << " )/( " << max_sc << " - " << min_sc << " )" << endl;
    }

    return scoredConcepts_norm;
}
Beispiel #19
0
// Converts a text file of "<docno> <score>" records into binary records.
//
// outfile - destination file; each record is written as sizeof(UINT32) bytes
//           of the internal document id followed by the score as a double
// infile  - path of the whitespace-separated text input
// env     - used to translate each docno into an internal document id
//
// Records whose docno is unknown to the environment are skipped silently.
// Reading stops at the end of the file or at the first record that cannot be
// parsed, so an unreadable file or a malformed line can no longer loop
// forever, and a final record without a trailing newline is still converted.
void convert_docnoscore_to_binary( indri::file::File& outfile, const std::string& infile, indri::api::QueryEnvironment& env ) {
  const std::string docnoName = "docno";

  // Stack-allocated so the buffer is destroyed even if a lookup throws.
  indri::file::SequentialWriteBuffer outb( outfile, 1024*1024 );

  std::ifstream in;
  in.open( infile.c_str(), std::ifstream::in );
  if( !in.is_open() )
    std::cerr << "unable to open: " << infile << std::endl;

  std::string docno;
  double score;
  // The extraction itself is the loop condition: it fails on end of file,
  // on a parse error and on a stream that never opened.
  while( in >> docno >> score ) {
    std::cout << "looking up: " << docno << " " << score << std::endl;
       
    std::vector<std::string> docnoValues;
    docnoValues.push_back( docno );
       
    std::vector<lemur::api::DOCID_T> result = env.documentIDsFromMetadata( docnoName, docnoValues );
    
    if( result.size() == 0 ) {
      //      LEMUR_THROW( LEMUR_IO_ERROR, "No document exists with docno: " + docno );
      continue; // allow entries that don't exist and ignore silently.
    }
    
    int document = result[0];
    std::cout << document << std::endl;
      
    outb.write( (const void*) &document, sizeof(UINT32) );
    outb.write( (const void*) &score, sizeof(double) );
  }
  
  outb.flush();
  in.close();
}
Beispiel #20
0
 /** Closes the query environment and returns an R logical TRUE. */
 SEXP closeIndex() {
     environment.close();
     const bool closed = true;
     return Rcpp::wrap(closed);
 }
Beispiel #21
0
 /**
  * Returns the value of the environment's termCount() for `_term`,
  * converted to `long` and wrapped for R.
  */
 SEXP collFreq(string _term) {
     return Rcpp::wrap(static_cast<long>(environment.termCount(_term)));
 }
Beispiel #22
0
  // Configures the query environment and the thread's output options from
  // _parameters. Always returns 0.
  //
  // Parameters read here: singleBackgroundModel, stopper.word, rule, index,
  // server, maxWildcardTerms, count, fbDocs, runID, trecFormat, inex,
  // printQuery, printDocuments, printPassages, printSnippets, baseline.
  UINT64 initialize() {
    _environment.setSingleBackgroundModel( _parameters.get("singleBackgroundModel", false) );

    // Stopwords and scoring rules are only installed when the helper reports
    // that it copied something.
    std::vector<std::string> stopwords;
    if( copy_parameters_to_string_vector( stopwords, _parameters, "stopper.word" ) )
      _environment.setStopwords(stopwords);

    std::vector<std::string> smoothingRules;
    if( copy_parameters_to_string_vector( smoothingRules, _parameters, "rule" ) )
      _environment.setScoringRules( smoothingRules );

   // Attach every configured index ...
   if( _parameters.exists( "index" ) ) {
      indri::api::Parameters indexes = _parameters["index"];

      for( size_t i=0; i < indexes.size(); i++ ) {
        _environment.addIndex( std::string(indexes[i]) );
      }
    }

    // ... and every configured server.
    if( _parameters.exists( "server" ) ) {
      indri::api::Parameters servers = _parameters["server"];

      for( size_t i=0; i < servers.size(); i++ ) {
        _environment.addServer( std::string(servers[i]) );
      }
    }

    if( _parameters.exists("maxWildcardTerms") )
        _environment.setMaxWildcardTerms(_parameters.get("maxWildcardTerms", 100));

    // Result counts: fbDocs defaults to the value of count.
    _requested = _parameters.get( "count", 1000 );
    _initialRequested = _parameters.get( "fbDocs", _requested );
    _runID = _parameters.get( "runID", "indri" );
    _trecFormat = _parameters.get( "trecFormat" , false );
    // The inex format is enabled by the mere presence of the "inex" key.
    _inexFormat = _parameters.exists( "inex" );

    _printQuery = _parameters.get( "printQuery", false );
    _printDocuments = _parameters.get( "printDocuments", false );
    _printPassages = _parameters.get( "printPassages", false );
    _printSnippets = _parameters.get( "printSnippets", false );

    // An expander is only created when fbDocs is non-zero; which kind depends
    // on whether a baseline is configured.
    // NOTE(review): _expander is not assigned on the other paths -- it is
    // presumably initialised elsewhere; confirm.
    if (_parameters.exists("baseline")) {
      // doing a baseline
      std::string baseline = _parameters["baseline"];
      _environment.setBaseline(baseline);
      // need a factory for this...
      if( _parameters.get( "fbDocs", 0 ) != 0 ) {
        // have to push the method in...
        std::string rule = "method:" + baseline;
        _parameters.set("rule", rule);
        _expander = new indri::query::TFIDFExpander( &_environment, _parameters );
      }
    } else {
      if( _parameters.get( "fbDocs", 0 ) != 0 ) {
        _expander = new indri::query::RMExpander( &_environment, _parameters );
      }
    }

    // NOTE(review): maxWildcardTerms was already applied above; this second
    // call repeats it.
    if (_parameters.exists("maxWildcardTerms")) {
      _environment.setMaxWildcardTerms((int)_parameters.get("maxWildcardTerms"));
    }

    return 0;
  }
Beispiel #23
0
 // Releases the query expander and closes the query environment.
 // The pointer is reset after deletion so that calling this again does not
 // delete the same expander twice.
 void deinitialize() {
   delete _expander;
   _expander = 0;
   _environment.close();
 }
Beispiel #24
0
 /**
  * Returns the value of the environment's documentCount(), converted to
  * `long` and wrapped for R.
  */
 SEXP getDocCount() {
     return Rcpp::wrap(static_cast<long>(environment.documentCount()));
 }
Beispiel #25
0
  // Writes the results in the half-open range [start, end) of _results to
  // `output`.
  //
  // output     - stream that receives the formatted results
  // queryIndex - query identifier used in the TREC output format
  // start, end - offsets into _results; they are used without bounds checks
  //
  // One of three line formats is used per result: TREC (_trecFormat), INEX
  // XML (_inexFormat), or a tab-separated default. Depending on the print
  // flags, the document text, the passage text and/or a snippet follow.
  void _printResultRegion( std::stringstream& output, std::string queryIndex, int start, int end  ) {
    std::vector<std::string> documentNames;
    std::vector<indri::api::ParsedDocument*> documents;

    std::vector<indri::api::ScoredExtentResult> resultSubset;

    // Work on a copy of the requested slice of the results.
    resultSubset.assign( _results.begin() + start, _results.begin() + end );


    // Fetch document data for printing
    if( _printDocuments || _printPassages || _printSnippets ) {
      // Need document text, so we'll fetch the whole document
      documents = _environment.documents( resultSubset );
      documentNames.clear();

      for( size_t i=0; i<resultSubset.size(); i++ ) {
        // NOTE(review): `doc` is not used below; documents[i] is accessed directly.
        indri::api::ParsedDocument* doc = documents[i];
        std::string documentName;

        // Look up the "docno" metadata entry; the name stays empty if there is none.
        indri::utility::greedy_vector<indri::parse::MetadataPair>::iterator iter = std::find_if( documents[i]->metadata.begin(),
          documents[i]->metadata.end(),
          indri::parse::MetadataPair::key_equal( "docno" ) );

        if( iter != documents[i]->metadata.end() )
          documentName = (char*) iter->value;

        // store the document name in a separate vector so later code can find it
        documentNames.push_back( documentName );
      }
    } else {
      // We only want document names, so the documentMetadata call may be faster
      documentNames = _environment.documentMetadata( resultSubset, "docno" );
    }

    std::vector<std::string> pathNames;
    if ( _inexFormat ) {
      // retrieve path names
      pathNames = _environment.pathNames( resultSubset );
    }

    // Print results
    for( size_t i=0; i < resultSubset.size(); i++ ) {
      // Ranks are 1-based and continue from the start offset of this region.
      int rank = start+i+1;
      std::string queryNumber = queryIndex;

      if( _trecFormat ) {
        // TREC formatted output: queryNumber, Q0, documentName, rank, score, runID
        output << queryNumber << " "
                << "Q0 "
                << documentNames[i] << " "
                << rank << " "
                << resultSubset[ i ].score << " "
                << _runID << std::endl;
      } else if( _inexFormat ) {

  // INEX formatted output: file, path and retrieval status value per result
  output << "    <result>" << std::endl
         << "      <file>" << documentNames[i] << "</file>" << std::endl
         << "      <path>" << pathNames[i] << "</path>" << std::endl
         << "      <rsv>" << resultSubset[i].score << "</rsv>"  << std::endl
         << "    </result>" << std::endl;
      }
      else {
        // score, documentName, firstWord, lastWord
        output << resultSubset[i].score << "\t"
                << documentNames[i] << "\t"
                << resultSubset[i].begin << "\t"
                << resultSubset[i].end << std::endl;
      }

      if( _printDocuments ) {
        output << documents[i]->text << std::endl;
      }

      if( _printPassages ) {
        // Map the word extent [begin, end) to byte offsets in the document
        // text and write exactly that span.
        int byteBegin = documents[i]->positions[ resultSubset[i].begin ].begin;
        int byteEnd = documents[i]->positions[ resultSubset[i].end-1 ].end;
        output.write( documents[i]->text + byteBegin, byteEnd - byteBegin );
        output << std::endl;
      }

      if( _printSnippets ) {
        indri::api::SnippetBuilder builder(false);
        output << builder.build( resultSubset[i].document, documents[i], _annotation ) << std::endl;
      }

      // The fetched documents are deleted here, one per result, once printed.
      if( documents.size() )
        delete documents[i];
    }
  }
Beispiel #26
0
 /**
  * Returns the value of the environment's termCount(), converted to `long`
  * and wrapped for R.
  */
 SEXP getTermCount() {
     return Rcpp::wrap(static_cast<long>(environment.termCount()));
 }
Beispiel #27
0
 /** Passes `_term` to the environment's stemTerm() and wraps the result for R. */
 SEXP stemTerm(string _term) {
     const string stemmed = environment.stemTerm(_term);
     return Rcpp::wrap(stemmed);
 }
Beispiel #28
0
 /**
  * Returns the value of the environment's documentCount() for `_term`,
  * converted to `long` and wrapped for R.
  */
 SEXP docFreq(string _term) {
     return Rcpp::wrap(static_cast<long>(environment.documentCount(_term)));
 }
Beispiel #29
0
 /**
  * Returns the length of every cached result document, in the order of
  * `documentIDs`, as reported by the environment's documentLength().
  */
 SEXP getDocumentLengths(){
     vector<int> docLength(documentIDs.size());
     for (size_t slot = 0; slot != documentIDs.size(); ++slot)
         docLength[slot] = environment.documentLength(documentIDs[slot]);
     return wrap(docLength);
 }
Beispiel #30
0
 /**
  * Fetches one metadata field for every cached result document.
  *
  * @param metaDataKey  name of the metadata field to read
  * @return an R character vector of the values, named by the external
  *         document IDs
  */
 SEXP getMetaData(string metaDataKey){
     CharacterVector values = wrap(environment.documentMetadata(documentIDs, metaDataKey));
     values.attr("names") = extDocIDs;
     return values;
 }