//
// sort_run
//
// Loads the entire contents of <in> into memory, sorts it as an array of
// fixed-size 12-byte records using sort_comparator, and writes the sorted
// bytes to <out> starting at offset 0.  <in> is closed once it has been read.
//
// out    - file that receives the sorted run
// in     - file holding the unsorted run; read in full, then closed
// memory - never referenced in this function; the whole input is buffered
//          regardless of its value
//

void sort_run( indri::file::File& out, indri::file::File& in, size_t memory ) {
  const size_t recordSize = 12;

  // read the data in; the vector owns the buffer, so it is released even
  // if one of the file operations below throws
  UINT64 length = in.size();
  std::vector<char> data( length );
  char* bytes = data.empty() ? 0 : &data[0];

  in.read( bytes, 0, length );
  in.close();

  // length / recordSize truncates, so trailing bytes that do not make up a
  // whole record are written back unsorted; skip the sort entirely when
  // there is nothing to sort so qsort never sees a null base pointer
  if( !data.empty() ) {
    qsort( bytes, length / recordSize, recordSize, sort_comparator );
  }

  out.write( bytes, 0, length );
}
//
// copy_region
//
// Copies <length> bytes from <in>, starting at byte offset <position>, to
// <out>, starting at byte offset 0.  The copy is done through a 1MB
// intermediate buffer, one chunk at a time.
//
// out      - destination file; bytes land at offsets [0, length)
// in       - source file; bytes are taken from [position, position + length)
// position - offset in <in> of the first byte to copy
// length   - number of bytes to copy
//
// NOTE(review): the return value of in.read is not checked, so a short read
// still results in a full chunk being written -- confirm callers never ask
// for more bytes than <in> holds.
//

void copy_region( indri::file::File& out, indri::file::File& in, UINT64 position, UINT64 length ) {
  const size_t bufLength = 1024*1024;

  // vector-owned buffer: freed automatically, including when read or write throws
  std::vector<char> buffer( bufLength );
  UINT64 total = 0;

  while( length > total ) {
    UINT64 chunk = lemur_compat::min<UINT64>( bufLength, length - total );

    in.read( &buffer[0], position + total, chunk );
    out.write( &buffer[0], total, chunk );
    total += chunk;
  }
}
//
// DiskTermListFileIterator constructor
//
// Binds the iterator to <termListFile>.  A read buffer is constructed over
// the stored file reference with a 1024*1024 size argument (presumably the
// buffer size in bytes -- confirm against SequentialReadBuffer), the file's
// size is captured at construction time, and the iterator starts out not
// finished with a current document of 0.
//
// The _buffer initializer reads the _termListFile member rather than the
// constructor argument, so it relies on _termListFile being initialized
// first.
//

indri::index::DiskTermListFileIterator::DiskTermListFileIterator( indri::file::File& termListFile )
  :
  _termListFile(termListFile),
  _buffer(_termListFile, 1024*1024),
  _fileSize(termListFile.size()),
  _finished(false),
  _currentDocument(0)
{
}
//
// DiskDocListFileIterator constructor
//
// Allocates a SequentialReadBuffer over <docListFile>, records the file
// length and <fieldCount>, and constructs the inner iterator over that
// buffer with two zero arguments.  The constructor body then mallocs a
// block of ::termdata_size( fieldCount ) bytes for _termData.
//
// The _iterator initializer reads the _file member, so it relies on _file
// being initialized before it.
//
// NOTE(review): the malloc result is not checked for null here.
//

indri::index::DiskDocListFileIterator::DiskDocListFileIterator( indri::file::File& docListFile, int fieldCount )
  :
  _file( new indri::file::SequentialReadBuffer( docListFile ) ),
  _fileLength( docListFile.size() ),
  _fieldCount( fieldCount ),
  _iterator( _file, 0, 0 ),
  _finished( false )
{
  // the allocation size depends on the number of fields
  const size_t termDataBytes = ::termdata_size( fieldCount );
  _termData = static_cast<indri::index::TermData*>( malloc( termDataBytes ) );
}
//
// sort_file
//
// External sort of the 12-byte records in <in>.  The input is cut into
// chunks of at most <memory> bytes (rounded down to a whole number of
// records); each chunk is copied to a temporary file, sorted into a second
// temporary file by sort_run, and the sorted runs are finally combined into
// <out> by merge_sorted_runs.  All temporary files are removed before
// returning, and <in> is closed.
//
// out            - file that receives the merged, sorted result
// in             - unsorted input file; closed once all runs are made
// memory         - upper bound, in bytes, on the size of a single run
// totalDocuments - passed through unchanged to merge_sorted_runs
//

void sort_file( indri::file::File& out, indri::file::File& in, size_t memory, int totalDocuments ) {
  // record size; matches the element size sort_run hands to qsort
  const size_t recordSize = 12;

  UINT64 length = in.size();

  // a run must hold at least one record: with memory < recordSize the
  // rounded size would be zero, every chunk would be empty, and the loop
  // below would never make progress
  size_t rounded = (memory / recordSize) * recordSize;
  if( rounded == 0 ) {
    rounded = recordSize;
  }

  UINT64 total = 0;
  std::vector<std::string> temporaries;

  while( length > total ) {
    UINT64 chunk = lemur_compat::min<UINT64>( rounded, length - total );

    indri::file::File tempIn;
    indri::file::File tempOut;
    std::string nameIn;
    std::string nameOut;

    tempIn.openTemporary( nameIn );
    tempOut.openTemporary( nameOut );

    // make a sorted run
    copy_region( tempIn, in, total, chunk );
    sort_run( tempOut, tempIn, memory );

    tempIn.close();
    tempOut.close();

    // the unsorted copy is no longer needed; the sorted run is kept
    // until the merge is finished
    lemur_compat::remove( nameIn.c_str() );
    temporaries.push_back( nameOut );

    total += chunk;
  }

  in.close();
  merge_sorted_runs( out, temporaries, totalDocuments );

  // clean up the sorted runs
  for( size_t i=0; i<temporaries.size(); i++ ) {
    lemur_compat::remove( temporaries[i].c_str() );
  }
}
static void zlib_read_document( z_stream_s& stream, indri::file::File& infile, UINT64 offset, indri::utility::Buffer& outputBuffer ) { // read in data from the file until the stream ends // split up the data as necessary // decompress positional info // read some data char inputBuffer[INPUT_BUFFER_SIZE]; outputBuffer.grow( INPUT_BUFFER_SIZE ); outputBuffer.write( sizeof(indri::api::ParsedDocument) ); stream.avail_in = 0; stream.avail_out = 0; while(true) { if( !stream.avail_in ) { UINT64 readSize = infile.read( inputBuffer, offset, sizeof inputBuffer ); offset += readSize; stream.avail_in = readSize; stream.next_in = (Bytef*) inputBuffer; } stream.avail_out = outputBuffer.size() - outputBuffer.position(); stream.next_out = (Bytef*) outputBuffer.write( outputBuffer.size() - outputBuffer.position() ); int result = inflate( &stream, Z_NO_FLUSH ); outputBuffer.unwrite( stream.avail_out ); if( result == Z_STREAM_END ) { result = inflate( &stream, Z_FINISH ); if( result < 0 ) LEMUR_THROW( result, "Something bad happened while trying to finish decompressing a document." ); inflateEnd( &stream ); break; } if( result < 0 ) { LEMUR_THROW( result, "Something bad happened while trying to decompress a document." ); } if( stream.avail_out == 0 ) { outputBuffer.grow(); } } }
void install_prior( const std::string& indexPath, const std::string& priorName, indri::file::File& priorFile ) { std::string priorDirectory = indri::file::Path::combine( indexPath, "prior" ); std::string priorPath = indri::file::Path::combine( priorDirectory, priorName ); // make sure there's a prior directory in the index if( indri::file::Path::exists( priorDirectory ) == false ) { indri::file::Path::make( priorDirectory ); } // if there's a old prior there with this name, remove it if( indri::file::Path::exists( priorPath ) ) { lemur_compat::remove( priorPath.c_str() ); } // copy the file indri::file::File output; output.create( priorPath ); size_t length = priorFile.size(); copy_region( output, priorFile, 0, length ); output.close(); }
bool extract_compression_table( std::map<double, int>& values, indri::file::File& in ) { indri::file::SequentialReadBuffer* inb = new indri::file::SequentialReadBuffer( in, 512*1024 ); MergeFile mf; mf.file = ∈ mf.buffer = inb; mf.length = in.size(); mf.score = 0; inb->seek( sizeof(UINT32)*2 ); while( !mf.finished() && values.size() <= 256 ) { mf.readScore(); std::map<double,int>::iterator iter = values.find( mf.score ); if( iter == values.end() ) { values.insert( std::make_pair( mf.score, values.size() ) ); } } delete mf.buffer; return values.size() <= 256; }
//
// convert_intscore_to_long_binary
//
// Reads whitespace-separated "<document> <score>" pairs from the text file
// named <infile> and appends each pair to <outfile> as a binary record:
// sizeof(UINT32) bytes of the document number followed by sizeof(double)
// bytes of the score.  <outfile> is closed before returning.
//
// Reading stops at the first pair that cannot be extracted in full, which
// covers end of file, malformed input, and a file that failed to open (in
// that case no records are written).
//
// outfile - destination file; closed by this function
// infile  - path of the text file to convert
//

void convert_intscore_to_long_binary( indri::file::File& outfile, const char* infile ) {
  std::ifstream in;
  in.open( infile );

  {
    // scoped so the write buffer is gone before outfile is closed
    indri::file::SequentialWriteBuffer outb( outfile, 1024*1024 );

    int document;
    double score;

    // test the extraction itself rather than eof(): an eof()-driven loop
    // writes one bogus record after the last good pair, and never ends
    // when the stream is in a failed state without having hit end of file
    while( in >> document >> score ) {
      outb.write( (const void*) &document, sizeof(UINT32) );
      outb.write( (const void*) &score, sizeof(double) );
    }

    outb.flush();
  }

  outfile.close();
  in.close();
}