//
// DiskTermListFileIterator constructor
//
// Sets up an iterator over the given term list file: keeps a handle to
// the file, wraps it in a read buffer, records the file's size, and starts
// in the "not finished" state with the current document set to 0.
//
// termListFile - the file to iterate over
//
indri::index::DiskTermListFileIterator::DiskTermListFileIterator( indri::file::File& termListFile ) :
  _termListFile(termListFile),
  // NOTE(review): this reads the _termListFile member rather than the
  // constructor argument, so it relies on _termListFile being declared
  // before _buffer in the class -- confirm against the header.
  // The second argument is presumably the buffer size in bytes (1024*1024).
  _buffer(_termListFile, 1024*1024),
  _fileSize(termListFile.size()),
  _finished(false),
  _currentDocument(0)
{
}
Ejemplo n.º 2
0
//
// DiskDocListFileIterator constructor
//
// Prepares iteration over a doc list file: allocates a sequential read
// buffer over the file, records the file length and the field count, and
// allocates raw storage for one TermData record sized by
// termdata_size( fieldCount ).
//
// docListFile - the file to iterate over
// fieldCount  - number of fields; determines the size of the TermData storage
//
// NOTE(review): _file and _termData are raw allocations made here (new and
// malloc respectively); their release is not visible in this block --
// presumably handled by the destructor, confirm there.
//
indri::index::DiskDocListFileIterator::DiskDocListFileIterator( indri::file::File& docListFile, int fieldCount )
  :
  _file( new indri::file::SequentialReadBuffer( docListFile ) ),
  _fileLength( docListFile.size() ),
  _fieldCount( fieldCount ),
  // NOTE(review): reads the _file member, so _file must be declared before
  // _iterator in the class -- confirm against the header.
  _iterator( _file, 0, 0 ),
  _finished( false )
{
  // the storage is obtained with malloc, not new, so no constructor runs on it
  void* termDataStorage = malloc( ::termdata_size( fieldCount ) );
  _termData = static_cast<indri::index::TermData*>( termDataStorage );
}
Ejemplo n.º 3
0
//
// sort_run
//
// Sorts a single run of fixed-size records entirely in memory: reads the
// whole of <in>, closes it, sorts the records with sort_comparator, and
// writes the result to <out> starting at offset 0.
//
// out    - destination file; it is not closed here
// in     - source file; it is closed by this function after being read
// memory - not used by this function; the whole input is loaded regardless
//
void sort_run( indri::file::File& out, indri::file::File& in, size_t memory ) {
  // size in bytes of one record handed to qsort
  const size_t recordSize = 12;

  UINT64 length = in.size();

  // nothing to sort or write; still honor the contract of closing the input
  if( length == 0 ) {
    in.close();
    return;
  }

  // the vector owns the buffer, so it is released even if one of the
  // file operations below throws
  std::vector<char> data( length );
  char* bytes = &data[0];

  in.read( bytes, 0, length );
  in.close();

  // only whole records are sorted; any trailing partial record stays in
  // place at the end of the buffer and is still written out below
  qsort( bytes, length / recordSize, recordSize, sort_comparator );

  out.write( bytes, 0, length );
}
Ejemplo n.º 4
0
void install_prior( const std::string& indexPath, const std::string& priorName, indri::file::File& priorFile ) {
  std::string priorDirectory = indri::file::Path::combine( indexPath, "prior" );
  std::string priorPath = indri::file::Path::combine( priorDirectory, priorName );

  // make sure there's a prior directory in the index
  if( indri::file::Path::exists( priorDirectory ) == false ) {
    indri::file::Path::make( priorDirectory );
  }

  // if there's a old prior there with this name, remove it
  if( indri::file::Path::exists( priorPath ) ) { 
    lemur_compat::remove( priorPath.c_str() );
  }

  // copy the file
  indri::file::File output;
  output.create( priorPath );
  size_t length = priorFile.size();
  
  copy_region( output, priorFile, 0, length );
  output.close(); 
}
Ejemplo n.º 5
0
//
// extract_compression_table
//
// Scans the scores stored in <in> and assigns each distinct score a small
// integer code, in order of first appearance (0, 1, 2, ...). Scanning stops
// at the end of the file or as soon as the table grows past the limit.
//
// values - table to fill; maps score -> code. Existing entries are kept and
//          count toward the limit.
// in     - open file to read scores from; it is not closed here
//
// Returns true if the table holds at most 256 distinct scores, and false if
// there were too many distinct values.
//
bool extract_compression_table( std::map<double, int>& values, indri::file::File& in ) {
  // largest table size that is still considered usable
  const size_t maxTableEntries = 256;

  // the read buffer lives on the stack so that it is released on every exit
  // path, including an exception thrown while reading
  indri::file::SequentialReadBuffer inb( in, 512*1024 );

  MergeFile mf;
  mf.file = &in;
  mf.buffer = &inb;
  mf.length = in.size();
  mf.score = 0;

  // start reading after the first two UINT32 values in the file
  // (presumably a header -- confirm against the writer of this format)
  inb.seek( sizeof(UINT32)*2 );

  while( !mf.finished() && values.size() <= maxTableEntries ) {
    mf.readScore();

    // insert leaves the map untouched when the score is already present,
    // so a score seen before keeps the code it was first given
    values.insert( std::make_pair( mf.score, static_cast<int>( values.size() ) ) );
  }

  return values.size() <= maxTableEntries;
}
Ejemplo n.º 6
0
//
// sort_file
//
// Sorts the records of <in> into <out> using temporary files: the input is
// split into chunks that fit the memory budget, each chunk is sorted into
// its own temporary file by sort_run, and the sorted runs are then combined
// by merge_sorted_runs. The temporary files are removed afterwards.
//
// out            - destination passed to merge_sorted_runs
// in             - source file; it is closed by this function
// memory         - approximate number of bytes to use per chunk
// totalDocuments - passed through unchanged to merge_sorted_runs
//
void sort_file( indri::file::File& out, indri::file::File& in, size_t memory, int totalDocuments ) {
  // size in bytes of one record; chunks are cut on record boundaries
  const size_t recordSize = 12;

  UINT64 length = in.size();
  size_t rounded = (memory / recordSize) * recordSize;

  // a budget smaller than one record would give zero-byte chunks, so the
  // loop below would never advance; always process at least one record
  if( rounded == 0 ) {
    rounded = recordSize;
  }

  UINT64 total = 0;
  std::vector<std::string> temporaries;

  while( length > total ) {
    UINT64 chunk = lemur_compat::min<UINT64>( rounded, length - total );
    indri::file::File tempIn;
    indri::file::File tempOut;
    std::string nameIn;
    std::string nameOut;

    tempIn.openTemporary( nameIn );
    tempOut.openTemporary( nameOut );

    // make a sorted run
    copy_region( tempIn, in, total, chunk );
    sort_run( tempOut, tempIn, memory );

    tempIn.close();
    tempOut.close();

    // the unsorted copy is no longer needed; the sorted run is kept
    // until the merge has finished
    lemur_compat::remove( nameIn.c_str() );
    temporaries.push_back( nameOut );

    total += chunk;
  }

  in.close();
  merge_sorted_runs( out, temporaries, totalDocuments );

  // clean up the sorted runs
  for( size_t i=0; i<temporaries.size(); i++ ) {
    lemur_compat::remove( temporaries[i].c_str() );
  }
}