void push_queue( std::queue< query_t* >& q, indri::api::Parameters& queries, int queryOffset ) { for( size_t i=0; i<queries.size(); i++ ) { std::string queryNumber; std::string queryText; std::string queryType = "indri"; if( queries[i].exists( "type" ) ) queryType = (std::string) queries[i]["type"]; if (queries[i].exists("text")) queryText = (std::string) queries[i]["text"]; if( queries[i].exists( "number" ) ) { queryNumber = (std::string) queries[i]["number"]; } else { int thisQuery=queryOffset + int(i); std::stringstream s; s << thisQuery; queryNumber = s.str(); } if (queryText.size() == 0) queryText = (std::string) queries[i]; // working set and RELFB docs go here. // working set to restrict retrieval std::vector<std::string> workingSet; // Rel fb docs std::vector<std::string> relFBDocs; copy_parameters_to_string_vector( workingSet, queries[i], "workingSetDocno" ); copy_parameters_to_string_vector( relFBDocs, queries[i], "feedbackDocno" ); q.push( new query_t( i, queryNumber, queryText, queryType, workingSet, relFBDocs ) ); } }
// Attaches the repositories and servers named in `param` to `environment`,
// then applies any scoring rules.
//
//   - every value under "index"  is passed to environment.addIndex()
//   - every value under "server" is passed to environment.addServer()
//   - values under "rule" are collected with copy_parameters_to_string_vector
//     and, if that call reports success, passed to setScoringRules()
static void open_indexes( indri::api::QueryEnvironment& environment, indri::api::Parameters& param ) {
  if( param.exists( "index" ) ) {
    indri::api::Parameters indexList = param["index"];

    for( size_t n = 0; n < indexList.size(); ++n ) {
      const std::string indexPath = std::string( indexList[n] );
      environment.addIndex( indexPath );
    }
  }

  if( param.exists( "server" ) ) {
    indri::api::Parameters serverList = param["server"];

    for( size_t n = 0; n < serverList.size(); ++n ) {
      const std::string serverName = std::string( serverList[n] );
      environment.addServer( serverName );
    }
  }

  std::vector<std::string> rules;
  const bool haveRules = copy_parameters_to_string_vector( rules, param, "rule" );
  if( haveRules ) {
    environment.setScoringRules( rules );
  }
}
// Configures _environment and this object's output options from _parameters.
//
// Steps, in order:
//   1. single-background-model flag, stopwords ("stopper.word") and scoring
//      rules ("rule") are pushed into the environment;
//   2. every "index" entry is added with addIndex(), every "server" entry
//      with addServer();
//   3. "maxWildcardTerms", when present, is forwarded to the environment;
//   4. result count, run id and the print/format flags are cached in members;
//   5. an expander is created when "fbDocs" is non-zero: a TFIDFExpander if
//      "baseline" is present, otherwise an RMExpander.
//
// Always returns 0.
UINT64 initialize() {
  _environment.setSingleBackgroundModel( _parameters.get("singleBackgroundModel", false) );

  std::vector<std::string> stopwords;
  if( copy_parameters_to_string_vector( stopwords, _parameters, "stopper.word" ) )
    _environment.setStopwords(stopwords);

  std::vector<std::string> smoothingRules;
  if( copy_parameters_to_string_vector( smoothingRules, _parameters, "rule" ) )
    _environment.setScoringRules( smoothingRules );

  if( _parameters.exists( "index" ) ) {
    indri::api::Parameters indexes = _parameters["index"];

    for( size_t i=0; i < indexes.size(); i++ ) {
      _environment.addIndex( std::string(indexes[i]) );
    }
  }

  if( _parameters.exists( "server" ) ) {
    indri::api::Parameters servers = _parameters["server"];

    for( size_t i=0; i < servers.size(); i++ ) {
      _environment.addServer( std::string(servers[i]) );
    }
  }

  // Applied once, after all indexes and servers have been added. The
  // original repeated this call with the same parameter just before
  // returning; that duplicate has been dropped.
  if( _parameters.exists("maxWildcardTerms") )
    _environment.setMaxWildcardTerms(_parameters.get("maxWildcardTerms", 100));

  _requested = _parameters.get( "count", 1000 );
  // "fbDocs" defaults to the requested result count when absent.
  _initialRequested = _parameters.get( "fbDocs", _requested );
  _runID = _parameters.get( "runID", "indri" );
  _trecFormat = _parameters.get( "trecFormat" , false );
  // Presence of the key is enough; its value is not read here.
  _inexFormat = _parameters.exists( "inex" );

  _printQuery = _parameters.get( "printQuery", false );
  _printDocuments = _parameters.get( "printDocuments", false );
  _printPassages = _parameters.get( "printPassages", false );
  _printSnippets = _parameters.get( "printSnippets", false );

  if (_parameters.exists("baseline")) {
    // doing a baseline
    std::string baseline = _parameters["baseline"];
    _environment.setBaseline(baseline);
    // need a factory for this...
    if( _parameters.get( "fbDocs", 0 ) != 0 ) {
      // have to push the method in: the expander is constructed from
      // _parameters, so the baseline is stored there as a "rule" first.
      std::string rule = "method:" + baseline;
      _parameters.set("rule", rule);
      _expander = new indri::query::TFIDFExpander( &_environment, _parameters );
    }
  } else {
    if( _parameters.get( "fbDocs", 0 ) != 0 ) {
      _expander = new indri::query::RMExpander( &_environment, _parameters );
    }
  }

  return 0;
}
int main(int argc, char * argv[]) { try { indri::api::Parameters& parameters = indri::api::Parameters::instance(); parameters.loadCommandLine( argc, argv ); require_parameter( "corpus", parameters ); require_parameter( "index", parameters ); StatusMonitor monitor; indri::api::IndexEnvironment env; std::string repositoryPath = parameters["index"]; buildindex_start_time(); if( parameters.get( "version", 0 ) ) { std::cout << INDRI_DISTRIBUTION << std::endl; } env.setMemory( parameters.get("memory", INT64(1024*1024*1024)) ); env.setNormalization( parameters.get("normalize", true)); env.setInjectURL( parameters.get("injectURL", true)); env.setStoreDocs( parameters.get("storeDocs", true)); std::string blackList = parameters.get("blacklist", ""); if( blackList.length() ) { int count = env.setBlackList(blackList); std::cout << "Added to blacklist: "<< count << std::endl; std::cout.flush(); } std::string offsetAnnotationHint=parameters.get("offsetannotationhint", "default"); if (offsetAnnotationHint=="ordered") { env.setOffsetAnnotationIndexHint(indri::parse::OAHintOrderedAnnotations); } if (offsetAnnotationHint=="unordered") { env.setOffsetAnnotationIndexHint(indri::parse::OAHintSizeBuffers); } else { env.setOffsetAnnotationIndexHint(indri::parse::OAHintDefault); } std::string stemmerName = parameters.get("stemmer.name", ""); if( stemmerName.length() ) env.setStemmer(stemmerName); std::vector<std::string> stopwords; if( copy_parameters_to_string_vector( stopwords, parameters, "stopper.word" ) ) env.setStopwords(stopwords); // fields to include as metadata (unindexed) std::vector<std::string> metadata; // metadata fields that should have a forward lookup table. std::vector<std::string> metadataForward; // metadata fields that should have a backward lookup table. 
std::vector<std::string> metadataBackward; copy_parameters_to_string_vector( metadata, parameters, "metadata.field" ); downcase_string_vector(metadata); copy_parameters_to_string_vector( metadataForward, parameters, "metadata.forward" ); downcase_string_vector(metadataForward); copy_parameters_to_string_vector( metadataBackward, parameters, "metadata.backward" ); downcase_string_vector(metadataBackward); // docno is a special field, automagically add it as forward and backward. std::string docno = "docno"; if( std::find( metadataForward.begin(), metadataForward.end(), docno ) == metadataForward.end() ) metadataForward.push_back(docno); if( std::find( metadataBackward.begin(), metadataBackward.end(), docno ) == metadataBackward.end() ) metadataBackward.push_back(docno); env.setMetadataIndexedFields( metadataForward, metadataBackward ); #if 0 // "document" is a special field. // automagically add it as an indexed field. indri::api::Parameters field = parameters.append("field"); field.set( "name", "document" ); field.set( "ordinal", true ); field.set("parental", true); #endif std::vector<std::string> fields; std::string subName = "name"; if( copy_parameters_to_string_vector( fields, parameters, "field", &subName ) ) { downcase_string_vector(fields); env.setIndexedFields(fields); process_numeric_fields( parameters, env ); process_ordinal_fields( parameters, env ); process_parental_fields( parameters, env ); //pto } if( indri::collection::Repository::exists( repositoryPath ) ) { // check if the repository was corrupted by an indexing crash // if so, recover it and continue. if (_recoverRepository(repositoryPath)) { env.open( repositoryPath, &monitor ); buildindex_print_event( std::string() + "Opened repository " + repositoryPath ); } else { // failed to open it, needs to be created from scratch. // create will remove any cruft. 
env.create( repositoryPath, &monitor ); buildindex_print_event( std::string() + "Created repository " + repositoryPath ); } } else { env.create( repositoryPath, &monitor ); buildindex_print_event( std::string() + "Created repository " + repositoryPath ); } indri::api::Parameters corpus = parameters["corpus"]; for( unsigned int i=0; i<corpus.size(); i++ ) { indri::api::Parameters thisCorpus = corpus[i]; require_parameter( "path", thisCorpus ); std::string corpusPath = thisCorpus["path"]; std::string fileClass = thisCorpus.get("class", ""); // augment field/metadata tags in the environment if needed. if( fileClass.length() ) { indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(fileClass); if( spec ) { // add fields if necessary, only update if changed. if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) env.addFileClass(*spec); delete(spec); } } bool isDirectory = indri::file::Path::isDirectory( corpusPath ); // First record the document root, and then the paths to any annotator inputs env.setDocumentRoot( corpusPath ); // Support for anchor text std::string anchorText = thisCorpus.get("inlink", ""); env.setAnchorTextPath( anchorText ); // Support for offset annotations std::string offsetAnnotationsPath = thisCorpus.get( "annotations", "" ); env.setOffsetAnnotationsPath( offsetAnnotationsPath ); // Support for offset metadata file std::string offsetMetadataPath = thisCorpus.get( "metadata", "" ); env.setOffsetMetadataPath( offsetMetadataPath ); if( isDirectory ) { indri::file::FileTreeIterator files( corpusPath ); for( ; files != indri::file::FileTreeIterator::end(); files++ ) { if( fileClass.length() ) env.addFile( *files, fileClass ); else { std::string extension = indri::file::Path::extension( *files ); indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(extension); if( spec ) { // add fields if necessary, only update if changed. 
if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) env.addFileClass(*spec); delete(spec); } env.addFile( *files ); } } } else { if( fileClass.length() ) env.addFile( corpusPath, fileClass ); else { std::string extension = indri::file::Path::extension( corpusPath ); indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(extension); if( spec ) { // add fields if necessary, only update if changed. if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) env.addFileClass(*spec); delete(spec); } env.addFile( corpusPath ); } } } buildindex_print_event( "Closing index" ); env.close(); buildindex_print_event( "Finished" ); } catch( lemur::api::Exception& e ) { LEMUR_ABORT(e); } return 0; }
// Configures _environment and this object's output options from _parameters.
//
// Steps, in order:
//   1. single-background-model flag, stopwords ("stopper.word") and scoring
//      rules ("rule") are pushed into the environment;
//   2. every "index" entry is added with addIndex(), every "server" entry
//      with addServer();
//   3. "maxWildcardTerms", when present, is forwarded to the environment;
//   4. result count, run id and the print/format flags are cached in members;
//   5. an expander is created when "fbDocs" is non-zero: a TFIDFExpander if
//      "baseline" is present, otherwise an RMExpander.
//
// If a lemur::api::Exception is caught, a pending query is taken from
// _queries, an error entry for it is pushed onto _output, waiters on
// _queueEvent are notified, and LEMUR_RETHROW is invoked.
//
// Returns 0 when control reaches the end of the function.
UINT64 initialize() {
  try {
    _environment.setSingleBackgroundModel( _parameters.get("singleBackgroundModel", false) );

    std::vector<std::string> stopwords;
    if( copy_parameters_to_string_vector( stopwords, _parameters, "stopper.word" ) )
      _environment.setStopwords(stopwords);

    std::vector<std::string> smoothingRules;
    if( copy_parameters_to_string_vector( smoothingRules, _parameters, "rule" ) )
      _environment.setScoringRules( smoothingRules );

    if( _parameters.exists( "index" ) ) {
      indri::api::Parameters indexes = _parameters["index"];

      for( size_t i=0; i < indexes.size(); i++ ) {
        _environment.addIndex( std::string(indexes[i]) );
      }
    }

    if( _parameters.exists( "server" ) ) {
      indri::api::Parameters servers = _parameters["server"];

      for( size_t i=0; i < servers.size(); i++ ) {
        _environment.addServer( std::string(servers[i]) );
      }
    }

    // Applied once, after all indexes and servers have been added. The
    // original repeated this call with the same parameter at the end of the
    // try block; that duplicate has been dropped.
    if( _parameters.exists("maxWildcardTerms") )
      _environment.setMaxWildcardTerms(_parameters.get("maxWildcardTerms", 100));

    _requested = _parameters.get( "count", 1000 );
    // "fbDocs" defaults to the requested result count when absent.
    _initialRequested = _parameters.get( "fbDocs", _requested );
    _runID = _parameters.get( "runID", "indri" );
    _trecFormat = _parameters.get( "trecFormat" , false );
    // Presence of the key is enough; its value is not read here.
    _inexFormat = _parameters.exists( "inex" );

    _printQuery = _parameters.get( "printQuery", false );
    _printDocuments = _parameters.get( "printDocuments", false );
    _printPassages = _parameters.get( "printPassages", false );
    _printSnippets = _parameters.get( "printSnippets", false );

    if (_parameters.exists("baseline")) {
      // doing a baseline
      std::string baseline = _parameters["baseline"];
      _environment.setBaseline(baseline);
      // need a factory for this...
      if( _parameters.get( "fbDocs", 0 ) != 0 ) {
        // have to push the method in: the expander is constructed from
        // _parameters, so the baseline is stored there as a "rule" first.
        std::string rule = "method:" + baseline;
        _parameters.set("rule", rule);
        _expander = new indri::query::TFIDFExpander( &_environment, _parameters );
      }
    } else {
      if( _parameters.get( "fbDocs", 0 ) != 0 ) {
        _expander = new indri::query::RMExpander( &_environment, _parameters );
      }
    }
  } catch ( lemur::api::Exception& e ) {
    // NOTE(review): LEMUR_RETHROW sits inside the loop body. If it throws
    // (as its name suggests), at most one query is drained, and when
    // _queries is empty the exception is swallowed and 0 is returned.
    // Confirm whether the rethrow was meant to follow the loop.
    // NOTE(review): the popped query_t* is not released here - verify who
    // owns it.
    while( _queries.size() ) {
      query_t *query = _queries.front();
      _queries.pop();
      _output.push( new query_t( query->index, query->number, "query: " + query->number + " QueryThread::_initialize exception\n" ) );
      _queueEvent.notifyAll();
      LEMUR_RETHROW(e, "QueryThread::_initialize");
    }
  }

  return 0;
}