Exemple #1
0
/**
 * Appends one heap-allocated query_t to the work queue for every entry of the
 * query parameter list.
 *
 * For each entry the optional "type", "text" and "number" sub-parameters are
 * read.  The type defaults to "indri"; a missing number is replaced by
 * queryOffset plus the entry's position; an empty text falls back to the
 * string value of the entry itself.  The "workingSetDocno" and
 * "feedbackDocno" lists are gathered with copy_parameters_to_string_vector
 * and handed to the query_t constructor.
 *
 * @param q            queue receiving the newly allocated query_t pointers
 * @param queries      parameter list holding one entry per query
 * @param queryOffset  base added to an entry's position to number queries
 *                     that carry no explicit "number"
 */
void push_queue( std::queue< query_t* >& q, indri::api::Parameters& queries,
                 int queryOffset ) {

  for( size_t position = 0; position < queries.size(); ++position ) {
    // Handle on the current entry; every lookup below goes through it.
    indri::api::Parameters entry = queries[position];

    std::string type = "indri";
    if( entry.exists( "type" ) )
      type = std::string( entry["type"] );

    std::string text;
    if( entry.exists( "text" ) )
      text = std::string( entry["text"] );

    std::string number;
    if( entry.exists( "number" ) ) {
      number = std::string( entry["number"] );
    } else {
      // No explicit number: derive one from the offset and the position.
      std::ostringstream generated;
      generated << ( queryOffset + int(position) );
      number = generated.str();
    }

    // An entry without (or with empty) "text" carries the query as its own value.
    if( text.size() == 0 )
      text = std::string( entry );

    // Documents the retrieval is restricted to.
    std::vector<std::string> restrictTo;
    // Documents supplied for relevance feedback.
    std::vector<std::string> feedbackDocs;
    copy_parameters_to_string_vector( restrictTo, entry, "workingSetDocno" );
    copy_parameters_to_string_vector( feedbackDocs, entry, "feedbackDocno" );

    query_t* queued = new query_t( position, number, text, type, restrictTo, feedbackDocs );
    q.push( queued );
  }
}
Exemple #2
0
/**
 * Attaches the configured data sources and scoring rules to a query
 * environment.
 *
 * Every entry under "index" is passed to addIndex and every entry under
 * "server" to addServer.  Afterwards the "rule" entries are collected and,
 * if copy_parameters_to_string_vector reports success, installed with
 * setScoringRules.
 *
 * @param environment  query environment to configure
 * @param param        parameter tree that is consulted
 */
static void open_indexes( indri::api::QueryEnvironment& environment, 
                          indri::api::Parameters& param ) {
  if( param.exists( "index" ) ) {
    indri::api::Parameters indexList = param["index"];
    unsigned int n = 0;
    while( n < indexList.size() ) {
      const std::string indexPath = std::string( indexList[n] );
      environment.addIndex( indexPath );
      ++n;
    }
  }

  if( param.exists( "server" ) ) {
    indri::api::Parameters serverList = param["server"];
    unsigned int n = 0;
    while( n < serverList.size() ) {
      const std::string serverName = std::string( serverList[n] );
      environment.addServer( serverName );
      ++n;
    }
  }

  // Scoring rules are applied only after all indexes and servers were added.
  std::vector<std::string> rules;
  const bool haveRules = copy_parameters_to_string_vector( rules, param, "rule" );
  if( haveRules )
    environment.setScoringRules( rules );
}
Exemple #3
0
  /**
   * Prepares the query environment and the output options of this object
   * from _parameters.
   *
   * In order: applies the "singleBackgroundModel" flag, the "stopper.word"
   * list and the "rule" list; adds every "index" and "server" entry; applies
   * "maxWildcardTerms" when present; reads "count", "fbDocs", "runID" and the
   * format/print flags into members; and, when "fbDocs" is non-zero, creates
   * a query expander (TFIDFExpander if "baseline" is present, RMExpander
   * otherwise).
   *
   * @return always 0
   */
  UINT64 initialize() {
    const bool singleBackground = _parameters.get( "singleBackgroundModel", false );
    _environment.setSingleBackgroundModel( singleBackground );

    std::vector<std::string> stopList;
    const bool haveStopList = copy_parameters_to_string_vector( stopList, _parameters, "stopper.word" );
    if( haveStopList )
      _environment.setStopwords( stopList );

    std::vector<std::string> rules;
    const bool haveRules = copy_parameters_to_string_vector( rules, _parameters, "rule" );
    if( haveRules )
      _environment.setScoringRules( rules );

    if( _parameters.exists( "index" ) ) {
      indri::api::Parameters indexList = _parameters["index"];
      for( size_t n = 0; n < indexList.size(); ++n )
        _environment.addIndex( std::string( indexList[n] ) );
    }

    if( _parameters.exists( "server" ) ) {
      indri::api::Parameters serverList = _parameters["server"];
      for( size_t n = 0; n < serverList.size(); ++n )
        _environment.addServer( std::string( serverList[n] ) );
    }

    if( _parameters.exists( "maxWildcardTerms" ) ) {
      _environment.setMaxWildcardTerms( _parameters.get( "maxWildcardTerms", 100 ) );
    }

    // Output switches.
    _printQuery = _parameters.get( "printQuery", false );
    _printDocuments = _parameters.get( "printDocuments", false );
    _printPassages = _parameters.get( "printPassages", false );
    _printSnippets = _parameters.get( "printSnippets", false );
    _trecFormat = _parameters.get( "trecFormat" , false );
    _inexFormat = _parameters.exists( "inex" );
    _runID = _parameters.get( "runID", "indri" );

    // "fbDocs" defaults to the requested result count, so read "count" first.
    _requested = _parameters.get( "count", 1000 );
    _initialRequested = _parameters.get( "fbDocs", _requested );

    if( !_parameters.exists( "baseline" ) ) {
      if( _parameters.get( "fbDocs", 0 ) != 0 )
        _expander = new indri::query::RMExpander( &_environment, _parameters );
    } else {
      // Baseline run: register the baseline before any expander is built.
      std::string baselineName = _parameters["baseline"];
      _environment.setBaseline( baselineName );
      if( _parameters.get( "fbDocs", 0 ) != 0 ) {
        // The expander receives the method through the "rule" parameter.
        std::string methodRule = "method:" + baselineName;
        _parameters.set( "rule", methodRule );
        _expander = new indri::query::TFIDFExpander( &_environment, _parameters );
      }
    }

    // NOTE(review): "maxWildcardTerms" was already applied above; this second
    // application looks redundant -- confirm before removing.
    if( _parameters.exists( "maxWildcardTerms" ) ) {
      _environment.setMaxWildcardTerms( static_cast<int>( _parameters.get( "maxWildcardTerms" ) ) );
    }

    return 0;
  }
/**
 * Entry point of the index builder.
 *
 * Loads the parameters from the command line (requiring "corpus" and
 * "index"), configures an IndexEnvironment from them (memory, normalization,
 * URL injection, document storage, blacklist, offset annotation hint,
 * stemmer, stopwords, metadata and indexed fields), opens or creates the
 * repository named by "index", and then adds every file of every "corpus"
 * entry before closing the index.
 *
 * @param argc  number of command line arguments
 * @param argv  command line arguments, handed to Parameters::loadCommandLine
 * @return 0; a lemur::api::Exception raised along the way is passed to
 *         LEMUR_ABORT
 */
int main(int argc, char * argv[]) {
  try {
    indri::api::Parameters& parameters = indri::api::Parameters::instance();
    parameters.loadCommandLine( argc, argv );

    require_parameter( "corpus", parameters );
    require_parameter( "index", parameters );

    StatusMonitor monitor;
    indri::api::IndexEnvironment env;
    std::string repositoryPath = parameters["index"];

    buildindex_start_time();

    if( parameters.get( "version", 0 ) ) {
      std::cout << INDRI_DISTRIBUTION << std::endl;
    }

    env.setMemory( parameters.get("memory", INT64(1024*1024*1024)) );

    env.setNormalization( parameters.get("normalize", true));
    env.setInjectURL( parameters.get("injectURL", true));
    env.setStoreDocs( parameters.get("storeDocs", true));

    std::string blackList = parameters.get("blacklist", "");
    if( blackList.length() ) {
        int count = env.setBlackList(blackList);
        std::cout << "Added to blacklist: "<< count << std::endl;
        std::cout.flush();
    }

    // Exactly one hint is applied.  The branches must be chained with
    // "else if": with two independent if statements the trailing else would
    // overwrite the "ordered" hint with the default.
    std::string offsetAnnotationHint=parameters.get("offsetannotationhint", "default");
    if (offsetAnnotationHint=="ordered") {
      env.setOffsetAnnotationIndexHint(indri::parse::OAHintOrderedAnnotations);
    } else if (offsetAnnotationHint=="unordered") {
      env.setOffsetAnnotationIndexHint(indri::parse::OAHintSizeBuffers);
    } else {
      env.setOffsetAnnotationIndexHint(indri::parse::OAHintDefault);
    }

    std::string stemmerName = parameters.get("stemmer.name", "");
    if( stemmerName.length() )
      env.setStemmer(stemmerName);

    std::vector<std::string> stopwords;
    if( copy_parameters_to_string_vector( stopwords, parameters, "stopper.word" ) )
      env.setStopwords(stopwords);
    // fields to include as metadata (unindexed)
    std::vector<std::string> metadata;
    // metadata fields that should have a forward lookup table.
    std::vector<std::string> metadataForward;
    // metadata fields that should have a backward lookup table.
    std::vector<std::string> metadataBackward;
    copy_parameters_to_string_vector( metadata, parameters, "metadata.field" ); 
    downcase_string_vector(metadata);
    
    copy_parameters_to_string_vector( metadataForward, parameters, "metadata.forward" ); 
    downcase_string_vector(metadataForward);
    copy_parameters_to_string_vector( metadataBackward, parameters, "metadata.backward" );
    downcase_string_vector(metadataBackward);
    // docno is a special field, automatically add it as forward and backward
    // unless the user already listed it.
    std::string docno = "docno";
    if( std::find( metadataForward.begin(), 
                   metadataForward.end(), 
                   docno ) == metadataForward.end() )
      metadataForward.push_back(docno);
    if( std::find( metadataBackward.begin(), 
                   metadataBackward.end(), 
                   docno ) == metadataBackward.end() )
      metadataBackward.push_back(docno);

    env.setMetadataIndexedFields( metadataForward, metadataBackward );
#if 0    
    // "document" is a special field.
    // automagically add it as an indexed field.
    indri::api::Parameters field = parameters.append("field");
    field.set( "name", "document" );
    field.set( "ordinal", true );
    field.set("parental", true);
#endif
    // Indexed fields are taken from the "name" sub-parameter of each "field".
    std::vector<std::string> fields;    
    std::string subName = "name";
    if( copy_parameters_to_string_vector( fields, parameters, "field", &subName ) ) {
      downcase_string_vector(fields);
      env.setIndexedFields(fields);
      process_numeric_fields( parameters, env );
      process_ordinal_fields( parameters, env );
      process_parental_fields( parameters, env ); //pto
    }

    if( indri::collection::Repository::exists( repositoryPath ) ) {
      // check if the repository was corrupted by an indexing crash
      // if so, recover it and continue.
      if (_recoverRepository(repositoryPath)) {
        env.open( repositoryPath, &monitor );
        buildindex_print_event( std::string() + "Opened repository " + repositoryPath ); 
      } else  {
        //  failed to open it, needs to be created from scratch.
        // create will remove any cruft.
        env.create( repositoryPath, &monitor );
        buildindex_print_event( std::string() + "Created repository " + repositoryPath );
      }
    } else {
      env.create( repositoryPath, &monitor );
      buildindex_print_event( std::string() + "Created repository " + repositoryPath );
    }

    indri::api::Parameters corpus = parameters["corpus"];

    for( unsigned int i=0; i<corpus.size(); i++ ) {
      indri::api::Parameters thisCorpus = corpus[i];
      require_parameter( "path", thisCorpus );
      std::string corpusPath = thisCorpus["path"];
      std::string fileClass = thisCorpus.get("class", "");
      
      // augment field/metadata tags in the environment if needed.
      if( fileClass.length() ) {
        indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(fileClass);
        if( spec ) {
          // add fields if necessary, only update if changed.
          if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) 
            env.addFileClass(*spec);
          // the specification returned by getFileClassSpec is released here
          delete(spec);
        }
      }
      
      bool isDirectory = indri::file::Path::isDirectory( corpusPath );
 
      // First record the document root, and then the paths to any annotator inputs
      env.setDocumentRoot( corpusPath );

      // Support for anchor text
      std::string anchorText = thisCorpus.get("inlink", "");
      env.setAnchorTextPath( anchorText );

      // Support for offset annotations
      std::string offsetAnnotationsPath = thisCorpus.get( "annotations", "" );
      env.setOffsetAnnotationsPath( offsetAnnotationsPath );

      // Support for offset metadata file
      std::string offsetMetadataPath = thisCorpus.get( "metadata", "" );
      env.setOffsetMetadataPath( offsetMetadataPath );

      if( isDirectory ) {
        // A directory corpus is walked file by file.
        indri::file::FileTreeIterator files( corpusPath );

        for( ; files != indri::file::FileTreeIterator::end(); files++ ) {
          if( fileClass.length() )
            env.addFile( *files, fileClass );
          else {
            // No explicit class: look the specification up by file extension.
            std::string extension = indri::file::Path::extension( *files );
            indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(extension);
            if( spec ) {
              // add fields if necessary, only update if changed.
              if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) 
                env.addFileClass(*spec);
              delete(spec);
            }
            env.addFile( *files );
          }
        }
      } else {
        if( fileClass.length() )
          env.addFile( corpusPath, fileClass );
        else {
          // No explicit class: look the specification up by file extension.
          std::string extension = indri::file::Path::extension( corpusPath );
          indri::parse::FileClassEnvironmentFactory::Specification *spec = env.getFileClassSpec(extension);
          if( spec ) {
            // add fields if necessary, only update if changed.
            if( augmentSpec( spec, fields, metadata, metadataForward, metadataBackward ) ) 
              env.addFileClass(*spec);
            delete(spec);
          }
          env.addFile( corpusPath );
        }
      }
    }

    buildindex_print_event( "Closing index" );
    env.close();
    buildindex_print_event( "Finished" );
  } catch( lemur::api::Exception& e ) {
    LEMUR_ABORT(e);
  }

  return 0;
}
Exemple #5
0
  /**
   * Prepares the query environment and the output options of this object
   * from _parameters.
   *
   * In order: applies the "singleBackgroundModel" flag, the "stopper.word"
   * list and the "rule" list; adds every "index" and "server" entry; applies
   * "maxWildcardTerms" when present; reads "count", "fbDocs", "runID" and the
   * format/print flags into members; and, when "fbDocs" is non-zero, creates
   * a query expander (TFIDFExpander if "baseline" is present, RMExpander
   * otherwise).
   *
   * If a lemur::api::Exception is raised and _queries is not empty, the front
   * query is removed, an error entry carrying its index and number is pushed
   * to _output, waiters on _queueEvent are notified and the exception is
   * rethrown via LEMUR_RETHROW.  If _queries is empty the exception is not
   * propagated.
   *
   * @return 0 when no exception is rethrown
   */
  UINT64 initialize() {
    try {
      _environment.setSingleBackgroundModel( _parameters.get("singleBackgroundModel", false) );

      std::vector<std::string> stopwords;
      if( copy_parameters_to_string_vector( stopwords, _parameters, "stopper.word" ) )
        _environment.setStopwords(stopwords);

      std::vector<std::string> smoothingRules;
      if( copy_parameters_to_string_vector( smoothingRules, _parameters, "rule" ) )
        _environment.setScoringRules( smoothingRules );

      if( _parameters.exists( "index" ) ) {
        indri::api::Parameters indexes = _parameters["index"];

        for( size_t i=0; i < indexes.size(); i++ ) {
          _environment.addIndex( std::string(indexes[i]) );
        }
      }

      if( _parameters.exists( "server" ) ) {
        indri::api::Parameters servers = _parameters["server"];

        for( size_t i=0; i < servers.size(); i++ ) {
          _environment.addServer( std::string(servers[i]) );
        }
      }

      if( _parameters.exists("maxWildcardTerms") )
        _environment.setMaxWildcardTerms(_parameters.get("maxWildcardTerms", 100));

      _requested = _parameters.get( "count", 1000 );
      // "fbDocs" defaults to the requested result count.
      _initialRequested = _parameters.get( "fbDocs", _requested );
      _runID = _parameters.get( "runID", "indri" );
      _trecFormat = _parameters.get( "trecFormat" , false );
      _inexFormat = _parameters.exists( "inex" );

      _printQuery = _parameters.get( "printQuery", false );
      _printDocuments = _parameters.get( "printDocuments", false );
      _printPassages = _parameters.get( "printPassages", false );
      _printSnippets = _parameters.get( "printSnippets", false );

      if (_parameters.exists("baseline")) {
        // doing a baseline
        std::string baseline = _parameters["baseline"];
        _environment.setBaseline(baseline);
        // need a factory for this...
        if( _parameters.get( "fbDocs", 0 ) != 0 ) {
          // have to push the method in...
          std::string rule = "method:" + baseline;
          _parameters.set("rule", rule);
          _expander = new indri::query::TFIDFExpander( &_environment, _parameters );
        }
      } else {
        if( _parameters.get( "fbDocs", 0 ) != 0 ) {
          _expander = new indri::query::RMExpander( &_environment, _parameters );
        }
      }

      // NOTE(review): "maxWildcardTerms" was already applied above; this
      // second application looks redundant -- confirm before removing.
      if (_parameters.exists("maxWildcardTerms")) {
        _environment.setMaxWildcardTerms((int)_parameters.get("maxWildcardTerms"));
      }
    } catch ( lemur::api::Exception& e ) {
      // NOTE(review): the rethrow sits inside the loop, so at most one query
      // is reported, and the exception is swallowed when the queue is empty.
      // Confirm whether draining the whole queue was intended.
      while( _queries.size() ) {
        query_t *query = _queries.front();
        _queries.pop();
        _output.push( new query_t( query->index, query->number, "query: " + query->number + " QueryThread::_initialize exception\n" ) );
        // The popped query is no longer referenced by the queue and its
        // fields were copied into the error entry above, so release it here
        // instead of leaking it.
        delete query;
        _queueEvent.notifyAll();
        LEMUR_RETHROW(e, "QueryThread::_initialize");
      }
    }
    return 0;
  }