int main( int argc, char** argv ) { try { indri::api::Parameters& param = indri::api::Parameters::instance(); param.loadCommandLine( argc, argv ); usage( param ); indri::api::QueryEnvironment environment; open_indexes( environment, param ); indri::api::Parameters parameterQueries = param[ "query" ]; std::string rmSmoothing = param.get("smoothing", "method:jm,lambda,0.5"); int documents = (int) param.get( "documents", 5 ); int terms = (int) param.get( "terms", 10 ); int maxGrams = 1; for( size_t i=0; i<parameterQueries.size(); i++ ) { std::string query = parameterQueries[i]; indri::query::RelevanceModel model( environment, rmSmoothing, maxGrams, documents ); model.generate( query ); const std::vector<indri::query::RelevanceModel::Gram*>& grams = model.getGrams(); printClarity(query, environment, grams, terms); } } catch( lemur::api::Exception& e ) { LEMUR_ABORT(e); } catch( ... ) { std::cout << "Caught an unhandled exception" << std::endl; } return 0; }
int main( int argc, char** argv ) { try { indri::api::Parameters& param = indri::api::Parameters::instance(); param.loadCommandLine( argc, argv ); usage( param ); indri::api::QueryEnvironment environment; open_indexes( environment, param ); indri::api::Parameters parameterQueries = param[ "query" ]; std::string rmSmoothing = ""; // eventually, we should offer relevance model smoothing int documents = (int) param[ "documents" ]; int maxGrams = (int) param.get( "maxGrams", 1 ); // unigram is default for( size_t i=0; i<parameterQueries.size(); i++ ) { std::string query = parameterQueries[i]; indri::query::RelevanceModel model( environment, rmSmoothing, maxGrams, documents ); model.generate( query ); const std::vector<indri::query::RelevanceModel::Gram*>& grams = model.getGrams(); printGrams( query, grams ); } } catch( lemur::api::Exception& e ) { LEMUR_ABORT(e); } catch( ... ) { std::cout << "Caught an unhandled exception" << std::endl; } return 0; }
// Recover a repository that crashed during build to be consistent with // its latest checkpoint. If it can't be recovered, create an empty one. static bool _recoverRepository(const std::string &path) { indri::collection::Repository repo; try { repo.open(path); } catch (lemur::api::Exception &ex) { // failed to open, can't fix it, recreate. return false; } // count up the documents that made it to disk indri::collection::Repository::index_state indexes = repo.indexes(); INT64 total = 0; for( size_t i = 0; i < indexes->size(); i++ ) { indri::thread::ScopedLock lock( (*indexes)[i]->statisticsLock() ); total += (*indexes)[i]->documentCount(); } total -= repo.deletedList().deletedCount(); // identify the docids that are in the collection but not in a disk index indri::collection::CompressedCollection *col = repo.collection(); indri::index::DeletedDocumentList del; bool marked = false; int numMarked = 0; for (int i = (int)total + 1; col->exists(i); i++) { del.markDeleted(i); marked = true; numMarked++; } // compact to delete the data associated with the unindexed docids. if (marked) { try { std::cerr << "Reovering Repository: " << path << "\nDeleting " << numMarked << " uncommitted documents." << std::endl; col->compact(del); // check for any partial disk indexes (crash during write) // and remove them std::string indexPath = indri::file::Path::combine( path, "index" ); indri::file::DirectoryIterator idirs( indexPath ); while (! (idirs == indri::file::DirectoryIterator::end())) { // iterate over the subdirectories, removing any that don't have a // manifest file. std::string current = *idirs; std::string manifest = indri::file::Path::combine(current, "manifest"); if (!indri::file::Path::exists(manifest)) { std::cerr << "Removing corrupted index directory: " << current << std::endl; indri::file::Path::remove(current); } idirs++; } } catch (lemur::api::Exception &e) { // no recovery possible here... LEMUR_ABORT(e); } } repo.close(); // successfully opened and closed return true; }
int main( int argc, char** argv ) { try { indri::api::Parameters& parameters = indri::api::Parameters::instance(); parameters.loadCommandLine( argc, argv ); if( parameters.get( "version", false ) ) { std::cout << INDRI_DISTRIBUTION << std::endl; } require_parameter( "corpus", parameters ); require_parameter( "links", parameters ); require_parameter( "output", parameters ); std::string corpusPath = parameters[ "corpus" ]; std::string linkPath = parameters[ "links" ]; std::string outputFile = parameters[ "output" ]; std::string indexPath = parameters.get("index", ""); UINT64 colLen = 0; indri::parse::PageRank *pr = 0; if (indexPath.size() > 0) { int maxIters = parameters.get( "iters", 100 ); double c = parameters.get( "c", 0.85 ); pr = new indri::parse::PageRank ( corpusPath, linkPath, indexPath ); pr->indexPageRank(outputFile, maxIters, c); } else { int docsPerIter = parameters.get( "docs", 1000 ); int maxIters = parameters.get( "iters", 10 ); double c = parameters.get( "c", 0.5 ); pr = new indri::parse::PageRank( corpusPath, linkPath, colLen ); pr->computePageRank( outputFile, maxIters, docsPerIter, c ); } if( parameters.get( "writeRaw", false ) ) { std::string rawFile = outputFile + ".raw"; pr->writeRaw( outputFile, rawFile ); } // default is to produce a prior file for makeprior. if( parameters.get( "writePriors", true ) ) { std::string priorFile = outputFile + ".prior"; pr->writePriors( outputFile, priorFile ); } if( parameters.get( "writeRanks", false ) ) { std::string ranksFile = outputFile + ".ranks"; pr->writeRanks( outputFile, ranksFile ); } // don't really need the outputFile ::remove(outputFile.c_str()); delete pr; } catch( lemur::api::Exception& e ) { LEMUR_ABORT(e); } }
int main( int argc, char* argv[] ) { try { indri::api::Parameters& parameters = indri::api::Parameters::instance(); parameters.loadCommandLine( argc, argv ); indri::net::NetworkListener listener; int port = parameters.get( "port", INDRID_PORT ); verbose = parameters.get( "verbose", false ); std::string repositoryPath = parameters["index"]; // wrap the index in a local server that the stub can talk to indri::collection::Repository* repository = new indri::collection::Repository(); // pass in parameters, in case anyone wants to do query side stopping. repository->openRead( repositoryPath, ¶meters ); indri::server::LocalQueryServer server( *repository ); // open for business listener.listen( port ); indri::net::NetworkStream* connection; std::list<connection_info*> connections; // this handles the threading issue by only allowing one // connection at a time; for our current uses this is fine while( connection = listener.accept() ) { connection_info* info = build_connection( connection, &server ); connections.push_back( info ); clean_connections( connections ); } wait_connections( connections ); repository->close(); delete repository; return 0; } catch( lemur::api::Exception& e ) { LEMUR_ABORT(e); } }
int main(int argc, char * argv[]) { try { indri::api::Parameters& param = indri::api::Parameters::instance(); param.loadCommandLine( argc, argv ); if( param.get( "version", 0 ) ) { std::cout << INDRI_DISTRIBUTION << std::endl; } if( !param.exists( "query" ) ) LEMUR_THROW( LEMUR_MISSING_PARAMETER_ERROR, "Must specify at least one query." ); if( !param.exists("index") && !param.exists("server") ) LEMUR_THROW( LEMUR_MISSING_PARAMETER_ERROR, "Must specify a server or index to query against." ); if (param.exists("baseline") && param.exists("rule")) LEMUR_THROW( LEMUR_BAD_PARAMETER_ERROR, "Smoothing rules may not be specified when running a baseline." ); int threadCount = param.get( "threads", 1 ); std::queue< query_t* > queries; std::priority_queue< query_t*, std::vector< query_t* >, query_t::greater > output; std::vector< QueryThread* > threads; indri::thread::Mutex queueLock; indri::thread::ConditionVariable queueEvent; // push all queries onto a queue indri::api::Parameters parameterQueries = param[ "query" ]; int queryOffset = param.get( "queryOffset", 0 ); push_queue( queries, parameterQueries, queryOffset ); int queryCount = (int)queries.size(); // launch threads for( int i=0; i<threadCount; i++ ) { threads.push_back( new QueryThread( queries, output, queueLock, queueEvent, param ) ); threads.back()->start(); } int query = 0; bool inexFormat = param.exists( "inex" ); if( inexFormat ) { std::string participantID = param.get( "inex.participantID", "1"); std::string runID = param.get( "runID", "indri" ); std::string inexTask = param.get( "inex.task", "CO.Thorough" ); std::string inexTopicPart = param.get( "inex.topicPart", "T" ); std::string description = param.get( "inex.description", "" ); std::string queryType = param.get("inex.query", "automatic"); std::cout << "<inex-submission participant-id=\"" << participantID << "\" run-id=\"" << runID << "\" task=\"" << inexTask << "\" query=\"" << queryType << "\" topic-part=\"" << inexTopicPart << "\">" << std::endl << " <description>" << std::endl << description << std::endl << " </description>" << std::endl; } // acquire the lock. queueLock.lock(); // process output as it appears on the queue while( query < queryCount ) { query_t* result = NULL; // wait for something to happen queueEvent.wait( queueLock ); while( output.size() && output.top()->index == query ) { result = output.top(); output.pop(); queueLock.unlock(); std::cout << result->text; delete result; query++; queueLock.lock(); } } queueLock.unlock(); if( inexFormat ) { std::cout << "</inex-submission>" << std::endl; } // join all the threads for( size_t i=0; i<threads.size(); i++ ) threads[i]->join(); // we've seen all the query output now, so we can quit indri::utility::delete_vector_contents( threads ); } catch( lemur::api::Exception& e ) { LEMUR_ABORT(e); } catch( ... ) { std::cout << "Caught unhandled exception" << std::endl; return -1; } return 0; }
int main(int argc, char * argv[]) { try { indri::api::Parameters& parameters = indri::api::Parameters::instance(); parameters.loadCommandLine( argc, argv ); require_parameter( "corpus", parameters ); require_parameter( "index", parameters ); StatusMonitor monitor; indri::api::IndexEnvironment env; std::string repositoryPath = parameters["index"]; buildindex_start_time(); if( parameters.get( "version", 0 ) ) { std::cout << INDRI_DISTRIBUTION << std::endl; } env.setMemory( parameters.get("memory", INT64(1024*1024*1024)) ); env.setNormalization( parameters.get("normalize", true)); env.setInjectURL( parameters.get("injectURL", true)); env.setStoreDocs( parameters.get("storeDocs", true)); std::string blackList = parameters.get("blacklist", ""); if( blackList.length() ) { int count = env.setBlackList(blackList); std::cout << "Added to blacklist: "<< count << std::endl; std::cout.flush(); } std::string offsetAnnotationHint=parameters.get("offsetannotationhint", "default"); if (offsetAnnotationHint=="ordered") { env.setOffsetAnnotationIndexHint(indri::parse::OAHintOrderedAnnotations); } if (offsetAnnotationHint=="unordered") { env.setOffsetAnnotationIndexHint(indri::parse::OAHintSizeBuffers); } else { env.setOffsetAnnotationIndexHint(indri::parse::OAHintDefault); } std::string stemmerName = parameters.get("stemmer.name", ""); if( stemmerName.length() ) env.setStemmer(stemmerName); std::vector<std::string> stopwords; if( copy_parameters_to_string_vector( stopwords, parameters, "stopper.word" ) ) env.setStopwords(stopwords); // fields to include as metadata (unindexed) std::vector<std::string> metadata; // metadata fields that should have a forward lookup table. std::vector<std::string> metadataForward; // metadata fields that should have a backward lookup table. std::vector<std::string> metadataBackward; copy_parameters_to_string_vector( metadata, parameters, "metadata.field" ); downcase_string_vector(metadata); copy_parameters_to_string_vector( metadataForward, parameters, "metadata.forward" ); downcase_string_vector(metadataForward); copy_parameters_to_string_vector( metadataBackward, parameters, "metadata.backward" ); downcase_string_vector(metadataBackward); // docno is a special field, automagically add it as forward and backward. std::string docno = "docno"; if( std::find( metadataForward.begin(), metadataForward.end(), docno ) == metadataForward.end() ) metadataForward.push_back(docno); if( std::find( metadataBackward.begin(), metadataBackward.end(), docno ) == metadataBackward.end() ) metadataBackward.push_back(docno); env.setMetadataIndexedFields( metadataForward, metadataBackward ); #if 0 // "document" is a special field. // automagically add it as an indexed field. indri::api::Parameters field = parameters.append("field"); field.set( "name", "document" ); field.set( "ordinal", true ); field.set("parental", true); #endif std::vector<std::string> fields; std::string subName = "name"; if( copy_parameters_to_string_vector( fields, parameters, "field", &subName ) ) { downcase_string_vector(fields); env.setIndexedFields(fields); process_numeric_fields( parameters, env ); process_ordinal_fields( parameters, env ); process_parental_fields( parameters, env ); //pto } if( indri::collection::Repository::exists( repositoryPath ) ) { // check if the repository was corrupted by an indexing crash // if so, recover it and continue. if (_recoverRepository(repositoryPath)) { env.open( repositoryPath, &monitor ); buildindex_print_event( std::string() + "Opened repository " + repositoryPath ); } else { // failed to open it, needs to be created from scratch. // create will remove any cruft. env.create( repositoryPath, &monitor ); buildindex_print_event( std::string() + "Created repository " + repositoryPath ); } } else { env.create( repositoryPath, &monitor ); buildindex_print_event( std::string() + "Created repository " + repositoryPath ); } indri::api::Parameters corpus = parameters["corpus"]; for( unsigned int i=0; i<corpus.size(); i++ ) { indri::api::Parameters thisCorpus = corpus[i]; require_parameter( "path", thisCorpus ); std::string corpusPath = thisCorpus["path"]; std::string fileClass = thisCorpus.get("class", ""); // augment field/metadata tags in the environment if needed. if( fileClass.length() ) { indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(fileClass); if( spec ) { // add fields if necessary, only update if changed. if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) env.addFileClass(*spec); delete(spec); } } bool isDirectory = indri::file::Path::isDirectory( corpusPath ); // First record the document root, and then the paths to any annotator inputs env.setDocumentRoot( corpusPath ); // Support for anchor text std::string anchorText = thisCorpus.get("inlink", ""); env.setAnchorTextPath( anchorText ); // Support for offset annotations std::string offsetAnnotationsPath = thisCorpus.get( "annotations", "" ); env.setOffsetAnnotationsPath( offsetAnnotationsPath ); // Support for offset metadata file std::string offsetMetadataPath = thisCorpus.get( "metadata", "" ); env.setOffsetMetadataPath( offsetMetadataPath ); if( isDirectory ) { indri::file::FileTreeIterator files( corpusPath ); for( ; files != indri::file::FileTreeIterator::end(); files++ ) { if( fileClass.length() ) env.addFile( *files, fileClass ); else { std::string extension = indri::file::Path::extension( *files ); indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(extension); if( spec ) { // add fields if necessary, only update if changed. if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) env.addFileClass(*spec); delete(spec); } env.addFile( *files ); } } } else { if( fileClass.length() ) env.addFile( corpusPath, fileClass ); else { std::string extension = indri::file::Path::extension( corpusPath ); indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(extension); if( spec ) { // add fields if necessary, only update if changed. if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) env.addFileClass(*spec); delete(spec); } env.addFile( corpusPath ); } } } buildindex_print_event( "Closing index" ); env.close(); buildindex_print_event( "Finished" ); } catch( lemur::api::Exception& e ) { LEMUR_ABORT(e); } return 0; }
int main( int argc, char** argv ) { try { indri::api::Parameters& param = indri::api::Parameters::instance(); param.loadCommandLine( argc, argv ); if( !param.exists("index") || !param.exists("input") || !param.exists("name") ) { std::cerr << "makeprior usage: " << std::endl << " makeprior -index=myindex -input=myinputfile -name=priorname" << std::endl << " myindex: a valid Indri index " << std::endl << " myinputfile: a two column text file, where the first column contains docno values" << std::endl << " and the second column contains log probabilities (should be between -infinity and zero)" << std::endl << " name: the name of this prior (as you will reference it in queries, using the #prior(name) syntax)" << std::endl; exit(-1); } std::string index = param["index"]; // get the total document count, including deleted documents. indri::collection::Repository* _repository = new indri::collection::Repository(); _repository->openRead(index); indri::collection::Repository::index_state indexes = _repository->indexes(); int documentCount = 0; for( size_t i=0; i<indexes->size(); i++ ) { indri::thread::ScopedLock lock( (*indexes)[i]->statisticsLock() ); documentCount += (int)(*indexes)[i]->documentCount(); } delete _repository; indri::api::QueryEnvironment env; std::cout << "opening index: " << index << std::endl; env.addIndex( index ); std::string input = param["input"]; std::string priorName = param["name"]; size_t memory = param.get( "memory", 50*1024*1024 ); // step one - convert file from docno/score format to binary format indri::file::File unsortedBinary; std::string unsortedName; unsortedBinary.openTemporary( unsortedName ); std::cout << "converting to binary..."; std::cout.flush(); convert_docnoscore_to_binary( unsortedBinary, input, env ); std::cout << "finished" << std::endl; // step two -- sort the binary version indri::file::File uncompressedPrior; std::string uncompressedPriorName; uncompressedPrior.openTemporary( uncompressedPriorName ); std::cout << "sorting..."; std::cout.flush(); sort_file( uncompressedPrior, unsortedBinary, memory, documentCount ); std::cout << "finished"; unsortedBinary.close(); lemur_compat::remove( unsortedName.c_str() ); // step three -- check to see if it's compressable, if so, compress it std::map<double, int> table; indri::file::File compressedPrior; std::string compressedPriorName; compressedPrior.openTemporary( compressedPriorName ); indri::file::File& finalPrior = uncompressedPrior; std::cout << "checking for compressability..."; std::cout.flush(); bool result = extract_compression_table( table, uncompressedPrior ); if( result ) { std::cout << "yep" << std::endl; // compress the file by using a lookup table std::cout << "compressing..."; std::cout.flush(); compress_file( compressedPrior, uncompressedPrior, table ); std::cout << std::endl; finalPrior = compressedPrior; } else { std::cout << "nope" << std::endl; } // step four -- install the prior in the index std::cout << "installing..."; std::cout.flush(); install_prior( index, priorName, finalPrior ); std::cout << "finished" << std::endl; // clean up uncompressedPrior.close(); compressedPrior.close(); lemur_compat::remove( uncompressedPriorName.c_str() ); lemur_compat::remove( compressedPriorName.c_str() ); } catch( lemur::api::Exception& e ) { LEMUR_ABORT(e); } return 0; }