Ejemplo n.º 1
0
/**
 * Entry point: for every "query" parameter, build a unigram relevance model
 * and hand its grams to printClarity().
 *
 * Parameters read here:
 *   query      - one or more query strings
 *   smoothing  - smoothing spec passed to the relevance model
 *   documents  - document count passed to the relevance model (default 5)
 *   terms      - term count passed to printClarity() (default 10)
 *
 * @return 0 on success, -1 if an unknown exception was caught.
 */
int main( int argc, char** argv ) {
  try {
    indri::api::Parameters& param = indri::api::Parameters::instance();
    param.loadCommandLine( argc, argv );
    // presumably validates the parameters and prints usage on error -- defined elsewhere
    usage( param );

    indri::api::QueryEnvironment environment;
    open_indexes( environment, param );

    indri::api::Parameters parameterQueries = param[ "query" ];
    // NOTE(review): the default spec reads "lambda,0.5" rather than "lambda:0.5";
    // confirm against the smoothing rule parser before changing it.
    std::string rmSmoothing = param.get("smoothing", "method:jm,lambda,0.5");
    int documents = (int) param.get( "documents", 5 );
    int terms = (int) param.get( "terms", 10 );
    int maxGrams = 1; // unigrams only

    for( size_t i=0; i<parameterQueries.size(); i++ ) {
      std::string query = parameterQueries[i];
      indri::query::RelevanceModel model( environment, rmSmoothing, maxGrams, documents );
      model.generate( query );

      const std::vector<indri::query::RelevanceModel::Gram*>& grams = model.getGrams();
      printClarity(query, environment, grams, terms);
    }
  } catch( lemur::api::Exception& e ) {
    LEMUR_ABORT(e);
  } catch( ... ) {
    std::cout << "Caught an unhandled exception" << std::endl;
    // report the failure to the caller instead of exiting with success
    return -1;
  }
  return 0;
}
Ejemplo n.º 2
0
/**
 * Entry point: for every "query" parameter, build a relevance model and
 * print its grams with printGrams().
 *
 * Parameters read here:
 *   query      - one or more query strings
 *   documents  - document count passed to the relevance model (no default)
 *   maxGrams   - maximum gram length (default 1, i.e. unigrams)
 *
 * @return 0 on success, -1 if an unknown exception was caught.
 */
int main( int argc, char** argv ) {
  try {
    indri::api::Parameters& param = indri::api::Parameters::instance();
    param.loadCommandLine( argc, argv );
    // presumably validates the parameters and prints usage on error -- defined elsewhere
    usage( param );

    indri::api::QueryEnvironment environment;
    open_indexes( environment, param );

    indri::api::Parameters parameterQueries = param[ "query" ];
    std::string rmSmoothing = ""; // eventually, we should offer relevance model smoothing
    int documents = (int) param[ "documents" ];
    int maxGrams = (int) param.get( "maxGrams", 1 ); // unigram is default

    for( size_t i=0; i<parameterQueries.size(); i++ ) {
      std::string query = parameterQueries[i];
      indri::query::RelevanceModel model( environment, rmSmoothing, maxGrams, documents );
      model.generate( query );

      const std::vector<indri::query::RelevanceModel::Gram*>& grams = model.getGrams();
      printGrams( query, grams );
    }
  } catch( lemur::api::Exception& e ) {
    LEMUR_ABORT(e);
  } catch( ... ) {
    std::cout << "Caught an unhandled exception" << std::endl;
    // report the failure to the caller instead of exiting with success
    return -1;
  }

  return 0;
}
// Recover a repository that crashed during build to be consistent with
// its latest checkpoint.
//
// Returns false if the repository at `path` cannot be opened (nothing is
// modified in that case; the caller is expected to recreate it). Returns true
// once the repository has been opened, repaired if needed, and closed again.
static bool _recoverRepository(const std::string &path) {
  indri::collection::Repository repo;
  try {
    repo.open(path);
  } catch (lemur::api::Exception &ex) {
    // failed to open, can't fix it, recreate.
    return false;
  }
  
  // count up the documents that made it to disk
  indri::collection::Repository::index_state indexes = repo.indexes();
  INT64 total = 0;
  for( size_t i = 0; i < indexes->size(); i++ ) {
    indri::thread::ScopedLock lock( (*indexes)[i]->statisticsLock() );
    total += (*indexes)[i]->documentCount();
  }
  total -= repo.deletedList().deletedCount();

  // identify the docids that are in the collection but not in a disk index:
  // everything from total + 1 upwards that the collection still knows about.
  indri::collection::CompressedCollection *col = repo.collection();
  indri::index::DeletedDocumentList del;
  int numMarked = 0;
  for (int i = (int)total + 1; col->exists(i); i++) {
    del.markDeleted(i);
    numMarked++;
  }

  // compact to delete the data associated with the unindexed docids.
  if (numMarked > 0) {
    try {
      std::cerr << "Recovering Repository: " << path << "\nDeleting " 
                << numMarked << " uncommitted documents." << std::endl;
      col->compact(del);
      // check for any partial disk indexes (crash during write)
      // and remove them
      std::string indexPath = indri::file::Path::combine( path, "index" );
      indri::file::DirectoryIterator idirs( indexPath );
      while (! (idirs == indri::file::DirectoryIterator::end())) {
        // iterate over the subdirectories, removing any that don't have a 
        // manifest file.
        std::string current = *idirs;
        std::string manifest = indri::file::Path::combine(current, "manifest");
        if (!indri::file::Path::exists(manifest)) {
          std::cerr << "Removing corrupted index directory: " << current 
                    << std::endl;
          indri::file::Path::remove(current);
        }
        idirs++;
      }
    } catch (lemur::api::Exception &e) {
      // no recovery possible here...
      LEMUR_ABORT(e);
    }
  }
  repo.close();
  // successfully opened and closed
  return true;
}
Ejemplo n.º 4
0
int main( int argc, char** argv ) {
  try {
    indri::api::Parameters& parameters = indri::api::Parameters::instance();
    parameters.loadCommandLine( argc, argv );

    if( parameters.get( "version", false ) ) {
      std::cout << INDRI_DISTRIBUTION << std::endl;
    }
    require_parameter( "corpus", parameters );
    require_parameter( "links", parameters );
    require_parameter( "output", parameters );

    std::string corpusPath = parameters[ "corpus" ];
    std::string linkPath = parameters[ "links" ];
    std::string outputFile = parameters[ "output" ];
    std::string indexPath = parameters.get("index", "");
    UINT64 colLen = 0;
    
    indri::parse::PageRank *pr = 0;
    
    if (indexPath.size() > 0) {
      int maxIters = parameters.get( "iters", 100 );
      double c = parameters.get( "c", 0.85 );
      pr = new indri::parse::PageRank ( corpusPath, linkPath, indexPath );
      pr->indexPageRank(outputFile, maxIters, c);
    } else {
      int docsPerIter = parameters.get( "docs", 1000 );
      int maxIters = parameters.get( "iters", 10 );
      double c = parameters.get( "c", 0.5 );
      pr = new indri::parse::PageRank( corpusPath, linkPath, colLen );
      pr->computePageRank( outputFile, maxIters, docsPerIter, c );
    }
    
    if( parameters.get( "writeRaw", false ) ) {
      std::string rawFile = outputFile + ".raw";
      pr->writeRaw( outputFile, rawFile );
    }
    // default is to produce a prior file for makeprior.
    if( parameters.get( "writePriors", true ) ) {
      std::string priorFile = outputFile + ".prior";
      pr->writePriors( outputFile, priorFile );
    }
    if( parameters.get( "writeRanks", false ) ) {
      std::string ranksFile = outputFile + ".ranks";
      pr->writeRanks( outputFile, ranksFile );
    }
    // don't really need the outputFile
    ::remove(outputFile.c_str());
    delete pr;
  }
  catch( lemur::api::Exception& e ) {
    LEMUR_ABORT(e);
  }
}
Ejemplo n.º 5
0
int main( int argc, char* argv[] ) {
  try {
    indri::api::Parameters& parameters = indri::api::Parameters::instance();
    parameters.loadCommandLine( argc, argv );

    indri::net::NetworkListener listener;
    int port = parameters.get( "port", INDRID_PORT );
    verbose = parameters.get( "verbose", false );
    std::string repositoryPath = parameters["index"];

    // wrap the index in a local server that the stub can talk to
    indri::collection::Repository* repository = new indri::collection::Repository();
    // pass in parameters, in case anyone wants to do query side stopping.
    repository->openRead( repositoryPath, &parameters );
    indri::server::LocalQueryServer server( *repository );

    // open for business
    listener.listen( port );
    indri::net::NetworkStream* connection;

    std::list<connection_info*> connections;

    // this handles the threading issue by only allowing one
    // connection at a time; for our current uses this is fine
    while( connection = listener.accept() ) {
      connection_info* info = build_connection( connection, &server );
      connections.push_back( info );

      clean_connections( connections );
    }

    wait_connections( connections );
    repository->close();
    delete repository;
    return 0;
  } catch( lemur::api::Exception& e ) {
    LEMUR_ABORT(e);
  }
}
Ejemplo n.º 6
0
/**
 * Query runner entry point.
 *
 * Pushes every "query" parameter onto a work queue, starts "threads" worker
 * QueryThread objects (default 1) and prints each result's text in query
 * index order as results appear on the shared output priority queue. With
 * the "inex" parameter present the output is wrapped in an
 * <inex-submission> element.
 *
 * @return 0 on success, -1 if an unknown exception was caught.
 */
int main(int argc, char * argv[]) {
  try {
    indri::api::Parameters& param = indri::api::Parameters::instance();
    param.loadCommandLine( argc, argv );

    if( param.get( "version", 0 ) ) {
      std::cout << INDRI_DISTRIBUTION << std::endl;
    }

    if( !param.exists( "query" ) )
      LEMUR_THROW( LEMUR_MISSING_PARAMETER_ERROR, "Must specify at least one query." );

    if( !param.exists("index") && !param.exists("server") )
      LEMUR_THROW( LEMUR_MISSING_PARAMETER_ERROR, "Must specify a server or index to query against." );

    if (param.exists("baseline") && param.exists("rule"))
      LEMUR_THROW( LEMUR_BAD_PARAMETER_ERROR, "Smoothing rules may not be specified when running a baseline." );

    int threadCount = param.get( "threads", 1 );
    std::queue< query_t* > queries;
    std::priority_queue< query_t*, std::vector< query_t* >, query_t::greater > output;
    std::vector< QueryThread* > threads;
    indri::thread::Mutex queueLock;
    indri::thread::ConditionVariable queueEvent;

    // push all queries onto a queue
    indri::api::Parameters parameterQueries = param[ "query" ];
    int queryOffset = param.get( "queryOffset", 0 );
    push_queue( queries, parameterQueries, queryOffset );
    int queryCount = (int)queries.size();

    // launch threads
    for( int i=0; i<threadCount; i++ ) {
      threads.push_back( new QueryThread( queries, output, queueLock, queueEvent, param ) );
      threads.back()->start();
    }

    // index of the next query whose output should be printed
    int query = 0;

    bool inexFormat = param.exists( "inex" );
    if( inexFormat ) {
      std::string participantID = param.get( "inex.participantID", "1");
      std::string runID = param.get( "runID", "indri" );
      std::string inexTask = param.get( "inex.task", "CO.Thorough" );
      std::string inexTopicPart = param.get( "inex.topicPart", "T" );
      std::string description = param.get( "inex.description", "" );
      std::string queryType = param.get("inex.query", "automatic");
      std::cout << "<inex-submission participant-id=\"" << participantID
                << "\" run-id=\"" << runID
                << "\" task=\"" << inexTask
                << "\" query=\"" << queryType
                << "\" topic-part=\"" << inexTopicPart
                << "\">" << std::endl
                << "  <description>" << std::endl << description
                << std::endl << "  </description>" << std::endl;
    }

    // acquire the lock.
    queueLock.lock();

    // process output as it appears on the queue
    while( query < queryCount ) {
      // Drain everything that is ready *before* waiting. Results may already
      // have been queued (and presumably signalled by the workers) before
      // this thread got here; waiting unconditionally first could miss that
      // notification and block forever.
      while( output.size() && output.top()->index == query ) {
        query_t* result = output.top();
        output.pop();

        // do the printing without holding the lock so workers can proceed
        queueLock.unlock();

        std::cout << result->text;
        delete result;
        query++;

        queueLock.lock();
      }

      // the lock is held here, so no new result can slip in between the
      // check above and the wait below
      if( query < queryCount )
        queueEvent.wait( queueLock );
    }
    queueLock.unlock();

    if( inexFormat ) {
      std::cout << "</inex-submission>" << std::endl;
    }

    // join all the threads
    for( size_t i=0; i<threads.size(); i++ )
      threads[i]->join();

    // we've seen all the query output now, so we can quit
    indri::utility::delete_vector_contents( threads );
  } catch( lemur::api::Exception& e ) {
    LEMUR_ABORT(e);
  } catch( ... ) {
    std::cout << "Caught unhandled exception" << std::endl;
    return -1;
  }

  return 0;
}
/**
 * Index builder entry point.
 *
 * Requires the "corpus" and "index" parameters. Configures an
 * IndexEnvironment from the command-line parameters (memory, normalization,
 * blacklist, offset annotation hint, stemmer, stopwords, metadata and
 * indexed fields), opens or creates the repository at "index" (attempting
 * crash recovery on an existing one), adds every file from each corpus
 * entry, and closes the index.
 */
int main(int argc, char * argv[]) {
  try {
    indri::api::Parameters& parameters = indri::api::Parameters::instance();
    parameters.loadCommandLine( argc, argv );

    require_parameter( "corpus", parameters );
    require_parameter( "index", parameters );

    StatusMonitor monitor;
    indri::api::IndexEnvironment env;
    std::string repositoryPath = parameters["index"];

    buildindex_start_time();

    if( parameters.get( "version", 0 ) ) {
      std::cout << INDRI_DISTRIBUTION << std::endl;
    }

    // default memory setting is 1024*1024*1024
    env.setMemory( parameters.get("memory", INT64(1024*1024*1024)) );

    env.setNormalization( parameters.get("normalize", true));
    env.setInjectURL( parameters.get("injectURL", true));
    env.setStoreDocs( parameters.get("storeDocs", true));

    std::string blackList = parameters.get("blacklist", "");
    if( blackList.length() ) {
        int count = env.setBlackList(blackList);
        std::cout << "Added to blacklist: "<< count << std::endl;
        std::cout.flush();
    }

    // Exactly one hint is applied. The branches must be chained with
    // else-if, otherwise "ordered" would be overwritten by the default.
    std::string offsetAnnotationHint=parameters.get("offsetannotationhint", "default");
    if (offsetAnnotationHint=="ordered") {
      env.setOffsetAnnotationIndexHint(indri::parse::OAHintOrderedAnnotations);
    } else if (offsetAnnotationHint=="unordered") {
      env.setOffsetAnnotationIndexHint(indri::parse::OAHintSizeBuffers);
    } else {
      env.setOffsetAnnotationIndexHint(indri::parse::OAHintDefault);
    }

    std::string stemmerName = parameters.get("stemmer.name", "");
    if( stemmerName.length() )
      env.setStemmer(stemmerName);

    std::vector<std::string> stopwords;
    if( copy_parameters_to_string_vector( stopwords, parameters, "stopper.word" ) )
      env.setStopwords(stopwords);
    // fields to include as metadata (unindexed)
    std::vector<std::string> metadata;
    // metadata fields that should have a forward lookup table.
    std::vector<std::string> metadataForward;
    // metadata fields that should have a backward lookup table.
    std::vector<std::string> metadataBackward;
    copy_parameters_to_string_vector( metadata, parameters, "metadata.field" ); 
    downcase_string_vector(metadata);
    
    copy_parameters_to_string_vector( metadataForward, parameters, "metadata.forward" ); 
    downcase_string_vector(metadataForward);
    copy_parameters_to_string_vector( metadataBackward, parameters, "metadata.backward" );
    downcase_string_vector(metadataBackward);
    // docno is a special field, automagically add it as forward and backward.
    std::string docno = "docno";
    if( std::find( metadataForward.begin(), 
                   metadataForward.end(), 
                   docno ) == metadataForward.end() )
      metadataForward.push_back(docno);
    if( std::find( metadataBackward.begin(), 
                   metadataBackward.end(), 
                   docno ) == metadataBackward.end() )
      metadataBackward.push_back(docno);

    env.setMetadataIndexedFields( metadataForward, metadataBackward );
#if 0    
    // "document" is a special field.
    // automagically add it as an indexed field.
    indri::api::Parameters field = parameters.append("field");
    field.set( "name", "document" );
    field.set( "ordinal", true );
    field.set("parental", true);
#endif
    std::vector<std::string> fields;    
    std::string subName = "name";
    if( copy_parameters_to_string_vector( fields, parameters, "field", &subName ) ) {
      downcase_string_vector(fields);
      env.setIndexedFields(fields);
      process_numeric_fields( parameters, env );
      process_ordinal_fields( parameters, env );
      process_parental_fields( parameters, env ); //pto
    }

    if( indri::collection::Repository::exists( repositoryPath ) ) {
      // check if the repository was corrupted by an indexing crash
      // if so, recover it and continue.
      if (_recoverRepository(repositoryPath)) {
        env.open( repositoryPath, &monitor );
        buildindex_print_event( std::string() + "Opened repository " + repositoryPath ); 
      } else  {
        //  failed to open it, needs to be created from scratch.
        // create will remove any cruft.
        env.create( repositoryPath, &monitor );
        buildindex_print_event( std::string() + "Created repository " + repositoryPath );
      }
    } else {
      env.create( repositoryPath, &monitor );
      buildindex_print_event( std::string() + "Created repository " + repositoryPath );
    }

    indri::api::Parameters corpus = parameters["corpus"];

    for( unsigned int i=0; i<corpus.size(); i++ ) {
      indri::api::Parameters thisCorpus = corpus[i];
      require_parameter( "path", thisCorpus );
      std::string corpusPath = thisCorpus["path"];
      std::string fileClass = thisCorpus.get("class", "");
      
      // augment field/metadata tags in the environment if needed.
      if( fileClass.length() ) {
        indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(fileClass);
        if( spec ) {
          // add fields if necessary, only update if changed.
          if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) 
            env.addFileClass(*spec);
          delete(spec);
        }
      }
      
      bool isDirectory = indri::file::Path::isDirectory( corpusPath );
 
      // First record the document root, and then the paths to any annotator inputs
      env.setDocumentRoot( corpusPath );

      // Support for anchor text
      std::string anchorText = thisCorpus.get("inlink", "");
      env.setAnchorTextPath( anchorText );

      // Support for offset annotations
      std::string offsetAnnotationsPath = thisCorpus.get( "annotations", "" );
      env.setOffsetAnnotationsPath( offsetAnnotationsPath );

      // Support for offset metadata file
      std::string offsetMetadataPath = thisCorpus.get( "metadata", "" );
      env.setOffsetMetadataPath( offsetMetadataPath );

      if( isDirectory ) {
        indri::file::FileTreeIterator files( corpusPath );

        for( ; files != indri::file::FileTreeIterator::end(); files++ ) {
          if( fileClass.length() )
            env.addFile( *files, fileClass );
          else {
            // no explicit class: look up a file class spec by file extension
            std::string extension = indri::file::Path::extension( *files );
            indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(extension);
            if( spec ) {
              // add fields if necessary, only update if changed.
              if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) 
                env.addFileClass(*spec);
              delete(spec);
            }
            env.addFile( *files );
          }
        }
      } else {
        if( fileClass.length() )
          env.addFile( corpusPath, fileClass );
        else {
          // no explicit class: look up a file class spec by file extension
          std::string extension = indri::file::Path::extension( corpusPath );
          indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(extension);
          if( spec ) {
            // add fields if necessary, only update if changed.
            if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) 
              env.addFileClass(*spec);
            delete(spec);
          }
          env.addFile( corpusPath );
        }
      }
    }

    buildindex_print_event( "Closing index" );
    env.close();
    buildindex_print_event( "Finished" );
  } catch( lemur::api::Exception& e ) {
    LEMUR_ABORT(e);
  }

  return 0;
}
Ejemplo n.º 8
0
/**
 * makeprior entry point.
 *
 * Requires the "index", "input" and "name" parameters; prints usage and
 * exits with -1 if any is missing. Converts the docno/score input file to a
 * binary file, sorts it, compresses it through a lookup table when
 * extract_compression_table() reports that this is possible, installs the
 * resulting prior into the index and removes the temporary files.
 */
int main( int argc, char** argv ) {
  try {
    indri::api::Parameters& param = indri::api::Parameters::instance();
    param.loadCommandLine( argc, argv );

    if( !param.exists("index") || !param.exists("input") || !param.exists("name") ) {
      std::cerr << "makeprior usage: " << std::endl
                << "    makeprior -index=myindex -input=myinputfile -name=priorname" << std::endl
                << "      myindex: a valid Indri index " << std::endl
                << "      myinputfile: a two column text file, where the first column contains docno values" << std::endl
                << "         and the second column contains log probabilities (should be between -infinity and zero)" << std::endl
                << "      name: the name of this prior (as you will reference it in queries, using the #prior(name) syntax)" << std::endl;
      exit(-1);
    }
  
    std::string index = param["index"];

    // get the total document count, including deleted documents.
    indri::collection::Repository* _repository = new indri::collection::Repository();
    _repository->openRead(index);    
    indri::collection::Repository::index_state indexes = _repository->indexes();
    int documentCount = 0;
  
    for( size_t i=0; i<indexes->size(); i++ ) {
      indri::thread::ScopedLock lock( (*indexes)[i]->statisticsLock() );
      documentCount += (int)(*indexes)[i]->documentCount();
    }
    delete _repository;
    
    indri::api::QueryEnvironment env;
    std::cout << "opening index: " << index << std::endl;
    env.addIndex( index );
    
    std::string input = param["input"];
    std::string priorName = param["name"];
    size_t memory = param.get( "memory", 50*1024*1024 );
    
    // step one - convert file from docno/score format to binary format
    indri::file::File unsortedBinary;
    std::string unsortedName;
    
    unsortedBinary.openTemporary( unsortedName );
    std::cout << "converting to binary...";
    std::cout.flush();
    convert_docnoscore_to_binary( unsortedBinary, input, env );
    std::cout << "finished" << std::endl;
    
    // step two -- sort the binary version
    indri::file::File uncompressedPrior;
    std::string uncompressedPriorName;
    uncompressedPrior.openTemporary( uncompressedPriorName );
    
    std::cout << "sorting...";
    std::cout.flush();
    sort_file( uncompressedPrior, unsortedBinary, memory, documentCount );
    // terminate the progress line like every other step does
    std::cout << "finished" << std::endl;
    
    unsortedBinary.close();
    lemur_compat::remove( unsortedName.c_str() );
    
    // step three -- check to see if it's compressable, if so, compress it
    std::map<double, int> table;
    indri::file::File compressedPrior;
    
    std::string compressedPriorName;
    compressedPrior.openTemporary( compressedPriorName );
    
    // Select the file to install through a pointer. A reference cannot be
    // re-bound: assigning through it would copy compressedPrior over
    // uncompressedPrior instead of switching which file is selected.
    indri::file::File* finalPrior = &uncompressedPrior;
    std::cout << "checking for compressability...";
    std::cout.flush();
    bool result = extract_compression_table( table, uncompressedPrior );
    
    if( result ) {
      std::cout << "yep" << std::endl;
      // compress the file by using a lookup table
      std::cout << "compressing...";
      std::cout.flush();
      compress_file( compressedPrior, uncompressedPrior, table );
      std::cout << std::endl;
      finalPrior = &compressedPrior;
    } else {
      std::cout << "nope" << std::endl;
    }
    
    // step four -- install the prior in the index
    std::cout << "installing...";
    std::cout.flush();
    install_prior( index, priorName, *finalPrior );
    std::cout << "finished" << std::endl;
    
    // clean up
    uncompressedPrior.close();
    compressedPrior.close();
    lemur_compat::remove( uncompressedPriorName.c_str() );
    lemur_compat::remove( compressedPriorName.c_str() );
  } catch( lemur::api::Exception& e ) {
    LEMUR_ABORT(e);
  }
  
  return 0;  
}