//
// DiskTermListFileIterator constructor
//
// Binds the iterator to termListFile, records the file's size, and starts
// in the "not finished" state with the current document set to 0.
//
// NOTE(review): _buffer is constructed from the _termListFile member, not
// from the constructor argument, so _termListFile has to be declared before
// _buffer in the class for this to be well-defined -- confirm in the header.
//

indri::index::DiskTermListFileIterator::DiskTermListFileIterator( indri::file::File& termListFile )
  :
  _termListFile( termListFile ),
  _buffer( _termListFile, 1024*1024 ),   // second argument is presumably the buffer size in bytes (1 MB)
  _fileSize( termListFile.size() ),
  _finished( false ),
  _currentDocument( 0 )
{
}
//
// DiskDocListFileIterator constructor
//
// Wraps docListFile in a newly allocated SequentialReadBuffer, remembers the
// file length and the field count, and allocates raw storage for one
// TermData record sized by ::termdata_size( fieldCount ).
//
// The _termData block comes from malloc(), so it must be released with
// free() (not delete); the result of malloc() is not checked here.
//

indri::index::DiskDocListFileIterator::DiskDocListFileIterator( indri::file::File& docListFile, int fieldCount )
  :
  _file( new indri::file::SequentialReadBuffer( docListFile ) ),
  _fileLength( docListFile.size() ),
  _fieldCount( fieldCount ),
  _iterator( _file, 0, 0 ),
  _finished( false )
{
  // storage size depends on the number of fields, so it is computed at runtime
  const size_t termDataBytes = ::termdata_size( fieldCount );
  _termData = static_cast<indri::index::TermData*>( malloc( termDataBytes ) );
}
//
// sort_run
//
// Reads the entire contents of <in> into memory, sorts it as an array of
// fixed-size 12-byte records using sort_comparator, and writes the result
// to <out> starting at offset 0.
//
//   out    - destination file; receives exactly in.size() bytes
//   in     - source file; it is CLOSED by this function after being read
//   memory - not used by this function; the whole input is loaded regardless,
//            so the caller is responsible for keeping <in> small enough
//
// Any trailing bytes that do not form a complete 12-byte record are not
// sorted, but are still written back unchanged at the end of the output.
//

void sort_run( indri::file::File& out, indri::file::File& in, size_t memory ) {
  const size_t recordSize = 12;

  // read the data in
  UINT64 length = in.size();

  // the vector owns the buffer, so it is released on every exit path,
  // including when one of the file operations below exits via an exception
  std::vector<char> buffer( static_cast<size_t>( length ) );
  char* data = buffer.empty() ? 0 : &buffer[0];

  in.read( data, 0, length );
  in.close();

  // nothing to sort unless there is at least one complete record
  if( length >= recordSize ) {
    qsort( data, length / recordSize, recordSize, sort_comparator );
  }

  out.write( data, 0, length );
}
void install_prior( const std::string& indexPath, const std::string& priorName, indri::file::File& priorFile ) { std::string priorDirectory = indri::file::Path::combine( indexPath, "prior" ); std::string priorPath = indri::file::Path::combine( priorDirectory, priorName ); // make sure there's a prior directory in the index if( indri::file::Path::exists( priorDirectory ) == false ) { indri::file::Path::make( priorDirectory ); } // if there's a old prior there with this name, remove it if( indri::file::Path::exists( priorPath ) ) { lemur_compat::remove( priorPath.c_str() ); } // copy the file indri::file::File output; output.create( priorPath ); size_t length = priorFile.size(); copy_region( output, priorFile, 0, length ); output.close(); }
bool extract_compression_table( std::map<double, int>& values, indri::file::File& in ) { indri::file::SequentialReadBuffer* inb = new indri::file::SequentialReadBuffer( in, 512*1024 ); MergeFile mf; mf.file = ∈ mf.buffer = inb; mf.length = in.size(); mf.score = 0; inb->seek( sizeof(UINT32)*2 ); while( !mf.finished() && values.size() <= 256 ) { mf.readScore(); std::map<double,int>::iterator iter = values.find( mf.score ); if( iter == values.end() ) { values.insert( std::make_pair( mf.score, values.size() ) ); } } delete mf.buffer; return values.size() <= 256; }
//
// sort_file
//
// External sort of <in> into <out>: the input is cut into runs of at most
// <memory> bytes, each run is sorted into its own temporary file by
// sort_run, and the sorted runs are then combined by merge_sorted_runs.
//
//   out            - destination file, passed to merge_sorted_runs
//   in             - source file; it is CLOSED by this function once all
//                    runs have been copied out of it
//   memory         - upper bound, in bytes, on the size of a single run
//   totalDocuments - passed through unchanged to merge_sorted_runs
//
// Run sizes are rounded down to a multiple of 12 bytes, the same element
// size sort_run hands to qsort, so that no record is split across two runs.
// All temporary files created here are removed before returning normally.
//

void sort_file( indri::file::File& out, indri::file::File& in, size_t memory, int totalDocuments ) {
  const size_t recordSize = 12;

  UINT64 length = in.size();
  size_t rounded = (memory / recordSize) * recordSize;

  // a budget smaller than one record would give zero-byte runs, and the loop
  // below would then never advance; always take at least one record per run
  if( rounded == 0 ) {
    rounded = recordSize;
  }

  UINT64 total = 0;
  std::vector<std::string> temporaries;

  while( length > total ) {
    UINT64 chunk = lemur_compat::min<UINT64>( rounded, length - total );

    indri::file::File tempIn;
    indri::file::File tempOut;
    std::string nameIn;
    std::string nameOut;

    tempIn.openTemporary( nameIn );
    tempOut.openTemporary( nameOut );

    // make a sorted run
    copy_region( tempIn, in, total, chunk );
    sort_run( tempOut, tempIn, memory );

    tempIn.close();
    tempOut.close();

    // the unsorted copy is no longer needed; only the sorted run is kept
    lemur_compat::remove( nameIn.c_str() );
    temporaries.push_back( nameOut );

    total += chunk;
  }

  in.close();
  merge_sorted_runs( out, temporaries, totalDocuments );

  // clean up the sorted runs now that they have been merged
  for( size_t i=0; i<temporaries.size(); i++ ) {
    lemur_compat::remove( temporaries[i].c_str() );
  }
}