/// Build one snippet per entry of documentIDs and return them as an R
/// character vector whose "names" attribute is set to extDocIDs.
///
/// @param html  forwarded unchanged to the indri::api::SnippetBuilder
///              constructor (presumably selects HTML-marked-up snippets --
///              confirm against the SnippetBuilder documentation).
/// @return      character vector of snippets, in documentIDs order.
SEXP generateSnippets(bool html){
  // environment.documents() hands back raw ParsedDocument pointers. The
  // sibling _printResultRegion deletes every pointer it receives from the
  // same call, so ownership lies with the caller; previously this function
  // never released them and leaked every fetched document.
  vector< indri::api::ParsedDocument* > pdocuments = environment.documents(documentIDs);

  // Scope guard: frees the parsed documents on every exit path, including
  // an exception thrown while building a snippet or wrapping the result.
  struct ParsedDocumentReleaser {
    vector< indri::api::ParsedDocument* >& docs;
    explicit ParsedDocumentReleaser( vector< indri::api::ParsedDocument* >& d ) : docs(d) {}
    ~ParsedDocumentReleaser() {
      for( size_t i = 0; i < docs.size(); i++ )
        delete docs[i];
    }
  };
  ParsedDocumentReleaser releaser( pdocuments );

  indri::api::SnippetBuilder sp(html);

  vector<string> snippetString;
  snippetString.reserve( documentIDs.size() );  // one allocation instead of repeated growth
  for( size_t row = 0; row < documentIDs.size(); row++ )
    snippetString.push_back( sp.build( documentIDs[row], pdocuments[row], qa ) );

  CharacterVector c = wrap(snippetString);
  c.attr("names") = extDocIDs;
  return c;
}
void _printResultRegion( std::stringstream& output, std::string queryIndex, int start, int end ) { std::vector<std::string> documentNames; std::vector<indri::api::ParsedDocument*> documents; std::vector<indri::api::ScoredExtentResult> resultSubset; resultSubset.assign( _results.begin() + start, _results.begin() + end ); // Fetch document data for printing if( _printDocuments || _printPassages || _printSnippets ) { // Need document text, so we'll fetch the whole document documents = _environment.documents( resultSubset ); documentNames.clear(); for( size_t i=0; i<resultSubset.size(); i++ ) { indri::api::ParsedDocument* doc = documents[i]; std::string documentName; indri::utility::greedy_vector<indri::parse::MetadataPair>::iterator iter = std::find_if( documents[i]->metadata.begin(), documents[i]->metadata.end(), indri::parse::MetadataPair::key_equal( "docno" ) ); if( iter != documents[i]->metadata.end() ) documentName = (char*) iter->value; // store the document name in a separate vector so later code can find it documentNames.push_back( documentName ); } } else { // We only want document names, so the documentMetadata call may be faster documentNames = _environment.documentMetadata( resultSubset, "docno" ); } std::vector<std::string> pathNames; if ( _inexFormat ) { // retrieve path names pathNames = _environment.pathNames( resultSubset ); } // Print results for( size_t i=0; i < resultSubset.size(); i++ ) { int rank = start+i+1; std::string queryNumber = queryIndex; if( _trecFormat ) { // TREC formatted output: queryNumber, Q0, documentName, rank, score, runID output << queryNumber << " " << "Q0 " << documentNames[i] << " " << rank << " " << resultSubset[ i ].score << " " << _runID << std::endl; } else if( _inexFormat ) { output << " <result>" << std::endl << " <file>" << documentNames[i] << "</file>" << std::endl << " <path>" << pathNames[i] << "</path>" << std::endl << " <rsv>" << resultSubset[i].score << "</rsv>" << std::endl << " </result>" << std::endl; } else 
{ // score, documentName, firstWord, lastWord output << resultSubset[i].score << "\t" << documentNames[i] << "\t" << resultSubset[i].begin << "\t" << resultSubset[i].end << std::endl; } if( _printDocuments ) { output << documents[i]->text << std::endl; } if( _printPassages ) { int byteBegin = documents[i]->positions[ resultSubset[i].begin ].begin; int byteEnd = documents[i]->positions[ resultSubset[i].end-1 ].end; output.write( documents[i]->text + byteBegin, byteEnd - byteBegin ); output << std::endl; } if( _printSnippets ) { indri::api::SnippetBuilder builder(false); output << builder.build( resultSubset[i].document, documents[i], _annotation ) << std::endl; } if( documents.size() ) delete documents[i]; } }