Пример #1
0
void sort_run( indri::file::File& out, indri::file::File& in, size_t memory ) {
  // read the data in
  UINT64 length = in.size();
  char* data = new char[length];
  in.read( data, 0, length );
  in.close();

  qsort( data, length / 12, 12, sort_comparator );

  out.write( data, 0, length );
  delete[] data;
}
Пример #2
0
void copy_region( indri::file::File& out, indri::file::File& in, UINT64 position, UINT64 length ) {
  char* buffer = new char[1024*1024];
  UINT64 bufLength = 1024*1024;
  UINT64 total = 0;
  
  while( length > total ) {
    UINT64 chunk = lemur_compat::min<UINT64>( bufLength, length - total );

    in.read( buffer, position + total, chunk );
    out.write( buffer, total, chunk );
    
    total += chunk;
  }
  
  delete[] buffer;
}
indri::index::DiskTermListFileIterator::DiskTermListFileIterator( indri::file::File& termListFile ) :
  _termListFile(termListFile),
  _buffer(_termListFile, 1024*1024),
  _fileSize(termListFile.size()),
  _finished(false),
  _currentDocument(0)
{
}
Пример #4
0
indri::index::DiskDocListFileIterator::DiskDocListFileIterator( indri::file::File& docListFile, int fieldCount ) : 
  _file( new indri::file::SequentialReadBuffer( docListFile ) ),
  _fileLength( docListFile.size() ),
  _fieldCount( fieldCount ),
  _iterator( _file, 0, 0 ),
  _finished( false )
{
  _termData = (indri::index::TermData*) malloc( ::termdata_size( fieldCount ) );
}
Пример #5
0
void sort_file( indri::file::File& out, indri::file::File& in, size_t memory, int totalDocuments ) {
  UINT64 length = in.size();
  size_t rounded = (memory / 12) * 12;
    
  UINT64 total = 0;
  std::vector<std::string> temporaries;

  while( length > total ) {
    UINT64 chunk = lemur_compat::min<UINT64>( rounded, length - total );
    indri::file::File tempIn;
    indri::file::File tempOut;
    std::string nameIn;
    std::string nameOut;
  
    tempIn.openTemporary( nameIn );
    tempOut.openTemporary( nameOut );

    // make a sorted run
    copy_region( tempIn, in, total, chunk );
    sort_run( tempOut, tempIn, memory );
    
    tempIn.close();
    tempOut.close();
    lemur_compat::remove( nameIn.c_str() );
    temporaries.push_back( nameOut );
    
    total += chunk;
  }

  in.close();
  merge_sorted_runs( out, temporaries, totalDocuments );
  
  for( size_t i=0; i<temporaries.size(); i++ ) {
    lemur_compat::remove( temporaries[i].c_str() );
  }
}
Пример #6
0
static void zlib_read_document( z_stream_s& stream, indri::file::File& infile, UINT64 offset, indri::utility::Buffer& outputBuffer ) {
  // read in data from the file until the stream ends
  // split up the data as necessary
  // decompress positional info

  // read some data
  char inputBuffer[INPUT_BUFFER_SIZE];
  outputBuffer.grow( INPUT_BUFFER_SIZE );
  outputBuffer.write( sizeof(indri::api::ParsedDocument) );

  stream.avail_in = 0;
  stream.avail_out = 0;
  
  while(true) {
    if( !stream.avail_in ) {
      UINT64 readSize = infile.read( inputBuffer, offset, sizeof inputBuffer );
      offset += readSize; 
      
      stream.avail_in = readSize;
      stream.next_in = (Bytef*) inputBuffer;
    }

    stream.avail_out = outputBuffer.size() - outputBuffer.position();
    stream.next_out = (Bytef*) outputBuffer.write( outputBuffer.size() - outputBuffer.position() );

    int result = inflate( &stream, Z_NO_FLUSH );
    outputBuffer.unwrite( stream.avail_out );

    if( result == Z_STREAM_END ) {
      result = inflate( &stream, Z_FINISH );
      
      if( result < 0 )
        LEMUR_THROW( result, "Something bad happened while trying to finish decompressing a document." );

      inflateEnd( &stream );
      break;
    }

    if( result < 0 ) {
      LEMUR_THROW( result, "Something bad happened while trying to decompress a document." );
    }

    if( stream.avail_out == 0 ) {
      outputBuffer.grow();
    }
  }
}
Пример #7
0
void install_prior( const std::string& indexPath, const std::string& priorName, indri::file::File& priorFile ) {
  std::string priorDirectory = indri::file::Path::combine( indexPath, "prior" );
  std::string priorPath = indri::file::Path::combine( priorDirectory, priorName );

  // make sure there's a prior directory in the index
  if( indri::file::Path::exists( priorDirectory ) == false ) {
    indri::file::Path::make( priorDirectory );
  }

  // if there's a old prior there with this name, remove it
  if( indri::file::Path::exists( priorPath ) ) { 
    lemur_compat::remove( priorPath.c_str() );
  }

  // copy the file
  indri::file::File output;
  output.create( priorPath );
  size_t length = priorFile.size();
  
  copy_region( output, priorFile, 0, length );
  output.close(); 
}
Пример #8
0
bool extract_compression_table( std::map<double, int>& values, indri::file::File& in ) {
  indri::file::SequentialReadBuffer* inb = new indri::file::SequentialReadBuffer( in, 512*1024 );
    
  MergeFile mf;
  mf.file = &in;
  mf.buffer = inb;
  mf.length = in.size();
  mf.score = 0;
  inb->seek( sizeof(UINT32)*2 );

  while( !mf.finished() && values.size() <= 256 ) {
    mf.readScore();
    std::map<double,int>::iterator iter = values.find( mf.score );
    
    if( iter == values.end() ) {
      values.insert( std::make_pair( mf.score, values.size() ) );
    }
  }
  
  delete mf.buffer;

  return values.size() <= 256;
}
Пример #9
0
void convert_intscore_to_long_binary( indri::file::File& outfile, const char* infile ) {
  std::ifstream in;
  in.open( infile );
  
  indri::file::SequentialWriteBuffer* outb = new indri::file::SequentialWriteBuffer( outfile, 1024*1024 );

  while( !in.eof() ) {
    int document;
    double score;

    in >> document
       >> score;
  
    outb->write( (const void*) &document, sizeof(UINT32) );
    outb->write( (const void*) &score, sizeof(double) );
  }
  
  outb->flush();
  delete outb;
  
  outfile.close();
  in.close();
}