void IndexWriter::_writeBatch( indri::file::SequentialWriteBuffer* buffer, int document, int length, indri::utility::Buffer& data ) {
  assert( length < 100*1000*1000 );

  // write the skip record for this batch, then flush the accumulated batch data
  _writeSkip( buffer, document, length );

  if( data.position() != 0 ) {
    buffer->write( data.front(), data.position() );
    data.clear();
  }
}
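// Reader-side sketch (illustration only, not part of the writer). It assumes
// two things that the real code keeps inside _writeSkip: that a skip record
// is simply an int document followed by an int byte length, and that the
// document stored is the first document of the *following* batch (-1 marks
// the final batch, as written by _addInvertedListData below). Assumes
// <cstring> for memcpy.
struct SkipSketch {
  int document;   // first document of the next batch, or -1 for the last batch
  int length;     // size of the current batch, in bytes
};

static const char* skipToDocumentSketch( const char* in, int target ) {
  while( true ) {
    SkipSketch skip;
    memcpy( &skip, in, sizeof(SkipSketch) );
    in += sizeof(SkipSketch);

    // if the next batch starts at or before the target, the whole current
    // batch can be skipped without decompressing it
    if( skip.document != -1 && target >= skip.document ) {
      in += skip.length;
      continue;
    }

    return in;  // the target, if present, lies in this batch
  }
}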
static void zlib_read_document( z_stream_s& stream, indri::file::File& infile, UINT64 offset, indri::utility::Buffer& outputBuffer ) {
  // read compressed data from the file until the zlib stream ends,
  // splitting the input into INPUT_BUFFER_SIZE chunks as necessary

  // reserve space for the ParsedDocument header ahead of the decompressed text
  char inputBuffer[INPUT_BUFFER_SIZE];
  outputBuffer.grow( INPUT_BUFFER_SIZE );
  outputBuffer.write( sizeof(indri::api::ParsedDocument) );

  stream.avail_in = 0;
  stream.avail_out = 0;

  while( true ) {
    // refill the input buffer whenever zlib has consumed it all
    if( !stream.avail_in ) {
      UINT64 readSize = infile.read( inputBuffer, offset, sizeof inputBuffer );
      offset += readSize;

      stream.avail_in = readSize;
      stream.next_in = (Bytef*) inputBuffer;
    }

    // point zlib at the remaining free space in the output buffer
    stream.avail_out = outputBuffer.size() - outputBuffer.position();
    stream.next_out = (Bytef*) outputBuffer.write( outputBuffer.size() - outputBuffer.position() );

    int result = inflate( &stream, Z_NO_FLUSH );

    // give back whatever space inflate did not fill
    outputBuffer.unwrite( stream.avail_out );

    if( result == Z_STREAM_END ) {
      result = inflate( &stream, Z_FINISH );

      if( result < 0 )
        LEMUR_THROW( result, "zlib failed while finishing decompression of a document." );

      inflateEnd( &stream );
      break;
    }

    if( result < 0 ) {
      LEMUR_THROW( result, "zlib failed while decompressing a document." );
    }

    if( stream.avail_out == 0 ) {
      outputBuffer.grow();
    }
  }
}
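// Companion sketch (illustration only): the inverse of zlib_read_document.
// This is not the writer Indri actually uses; it is a minimal, hedged example
// of producing a zlib stream that the inflate loop above can consume.
// Assumes <zlib.h>, <cstring>, and <string>.
static void zlibCompressSketch( const char* text, size_t textLength,
                                std::string& compressed ) {
  z_stream stream;
  memset( &stream, 0, sizeof(stream) );  // zalloc/zfree/opaque must be null
  deflateInit( &stream, Z_DEFAULT_COMPRESSION );

  stream.next_in = (Bytef*) text;
  stream.avail_in = (uInt) textLength;

  char chunk[16384];
  int result;

  do {
    stream.next_out = (Bytef*) chunk;
    stream.avail_out = sizeof chunk;

    // Z_FINISH: all input is already present, so flush everything
    result = deflate( &stream, Z_FINISH );
    compressed.append( chunk, sizeof chunk - stream.avail_out );
  } while( result != Z_STREAM_END );

  deflateEnd( &stream );
}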
bool lemur::file::SortMergeTextFiles::_readLine( FILE *_in, char*& beginLine, size_t& lineLength, indri::utility::Buffer &_buffer ) {
  lineLength = 0;
  size_t actual;

  // make a buffer of a reasonable size so we're not always allocating
  if( _buffer.size() < 1024*1024 )
    _buffer.grow( 1024*1024 );

  // if we're running out of room, add 25MB
  if( (_buffer.size() - _buffer.position()) < 512*1024 ) {
    _buffer.grow( _buffer.size() + 1024*1024*25 );
  }

  if( _buffer.position() ) {
    // get rid of the null terminator from the previous call
    _buffer.unwrite(1);
  }

  size_t readAmount = _buffer.size() - _buffer.position() - 2;

  // fetch the next document line
  char* buffer = _buffer.write( readAmount );
  char* result = fgets( buffer, (int)readAmount, _in );

  if( !result ) {
    return false;
  }

  actual = strlen( buffer );
  lineLength += actual;

  // return the unused space to the buffer
  _buffer.unwrite( readAmount - actual );

  // all finished reading; terminate the line
  *_buffer.write(1) = 0;
  beginLine = _buffer.front() + _buffer.position() - lineLength - 1;

  // strip the \n off the end
  if( beginLine[lineLength-1] == '\n' ) {
    beginLine[lineLength-1] = 0;
  }

  return true;
}
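// Usage sketch (hypothetical): _readLine is a private helper, so picture this
// inside another SortMergeTextFiles member. Lines accumulate back to back in
// one Buffer: each call removes the previous call's NUL terminator before
// appending, and beginLine always points at the newest line, NUL-terminated
// with its trailing \n stripped. Assumes <cstdio>.
void dumpLinesSketch( FILE* in, indri::utility::Buffer& buffer ) {
  char* line = 0;
  size_t lineLength = 0;

  while( _readLine( in, line, lineLength, buffer ) ) {
    // line aliases storage inside `buffer`, so copy it if it must outlive
    // later calls that grow or clear the buffer
    printf( "%s\n", line );
  }
}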
void IndexWriter::_addInvertedListData( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, indri::utility::Buffer& listBuffer, UINT64& endOffset ) {
  indri::utility::greedy_vector<WriterIndexContext*>::iterator iter;
  const int minimumSkip = 1<<12; // 4k
  int documentsWritten = 0;

  const float topdocsFraction = 0.01f;
  bool hasTopdocs = termData->corpus.documentCount > TOPDOCS_DOCUMENT_COUNT;
  bool isFrequent = termData->corpus.totalCount > FREQUENT_TERM_COUNT;
  int topdocsCount = hasTopdocs ? int(termData->corpus.documentCount * topdocsFraction) : 0;
  int topdocsSpace = hasTopdocs ? ((topdocsCount*3*sizeof(UINT32)) + sizeof(int)) : 0;

  // write a control byte
  char control = (hasTopdocs ? 0x01 : 0) | (isFrequent ? 0x02 : 0);
  _invertedOutput->write( &control, 1 );

  UINT64 initialPosition = _invertedOutput->tell();

  // leave some room for the topdocs list
  if( hasTopdocs ) {
    _invertedOutput->seek( topdocsSpace + initialPosition );
  }

  // maintain a list of top documents
  std::priority_queue<DocListIterator::TopDocument,
                      std::vector<DocListIterator::TopDocument>,
                      DocListIterator::TopDocument::greater> topdocs;

  double threshold = 0;
  int lastDocument = 0;
  int positions = 0;
  int docs = 0;

  // for each matching list:
  for( iter = lists.begin(); iter != lists.end(); ++iter ) {
    indri::index::DocListFileIterator::DocListData* listData = (*iter)->iterator->currentEntry();
    DocListIterator* iterator = listData->iterator;
    Index* index = (*iter)->index;
    indri::utility::RVLCompressStream stream( listBuffer );

    int listDocs = 0;
    int listPositions = 0;

    while( !iterator->finished() ) {
      // get the latest entry from the list
      DocListIterator::DocumentData* documentData = iterator->currentEntry();

      // add to document counter
      docs++;
      listDocs++;

      // update the topdocs list
      if( hasTopdocs ) {
        int length = index->documentLength( documentData->document );
        int count = documentData->positions.size();

        // compute DocListIterator::TopDocument::greater (current, top());
        // if false, there is no reason to insert this entry.
        // note that the test is inverted:
        // int(length * threshold) < count is equivalent to
        // count/length > topdocs.top().count/topdocs.top().length,
        // and using < (rather than <=) breaks ties in favor of keeping
        // the first document seen.
        if( int(length * threshold) < count || (int)topdocs.size() < topdocsCount ) {
          // form a topdocs entry for this document
          DocListIterator::TopDocument topDocument( documentData->document, count, length );

          topdocs.push( topDocument );
          while( (int)topdocs.size() > topdocsCount )
            topdocs.pop();

          threshold = topdocs.top().count / double(topdocs.top().length);
        }
      }

      if( listBuffer.position() > minimumSkip ) {
        // time to write in a skip
        _writeBatch( _invertedOutput, documentData->document, listBuffer.position(), listBuffer );

        // documents are delta encoded by batch, so reset the baseline
        lastDocument = 0;
      }

      assert( documentData->document > lastDocument );

      // write this entry out to the list
      stream << documentData->document - lastDocument;
      stream << (int) documentData->positions.size();
      lastDocument = documentData->document;

      int lastPosition = 0;
      for( size_t i=0; i<documentData->positions.size(); i++ ) {
        stream << (documentData->positions[i] - lastPosition);
        lastPosition = documentData->positions[i];
        positions++;
        listPositions++;
      }

      iterator->nextEntry();
    }

    indri::index::TermData* td = iterator->termData();
    assert( listPositions == td->corpus.totalCount );
    assert( listDocs == td->corpus.documentCount );
  }

  assert( docs == termData->corpus.documentCount );
  assert( positions == termData->corpus.totalCount );

  // write in the final skip info
  _writeBatch( _invertedOutput, -1, listBuffer.position(), listBuffer );
  UINT64 finalPosition = _invertedOutput->tell();

  if( hasTopdocs ) {
    _invertedOutput->seek( initialPosition );
    _invertedOutput->write( &topdocsCount, sizeof(int) );
    assert( (int)topdocs.size() == topdocsCount );

    // write the topdocs entries in order from smallest fraction to
    // largest fraction, where fraction = c(w;D)/|D|
    while( topdocs.size() ) {
      DocListIterator::TopDocument topDocument = topdocs.top();

      _invertedOutput->write( &topDocument.document, sizeof(int) );
      _invertedOutput->write( &topDocument.count, sizeof(int) );
      _invertedOutput->write( &topDocument.length, sizeof(int) );
      topdocs.pop();
    }

    assert( (_invertedOutput->tell() - initialPosition) == topdocsSpace );
    _invertedOutput->seek( finalPosition );
  }

  endOffset = finalPosition;
}
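// Reader-side sketch (illustration only): decoding one batch produced above.
// Documents are delta encoded per batch (lastDocument resets to 0 at each
// skip) and positions are delta encoded within each document. The varint
// decoder assumes the common RVL layout in which values are stored low-order
// 7 bits first with the high bit marking the final byte; the authoritative
// format is indri's RVLCompress, so treat this layout as an assumption.
static const char* decodeIntSketch( const char* in, int& value ) {
  value = 0;
  int shift = 0;

  while( true ) {
    unsigned char b = (unsigned char) *in++;
    value |= int(b & 0x7f) << shift;
    shift += 7;

    if( b & 0x80 )
      break;  // high bit terminates the value (assumed layout)
  }

  return in;
}

static void decodeBatchSketch( const char* in, const char* end ) {
  int document = 0;  // per-batch baseline, matching the lastDocument reset

  while( in < end ) {
    int delta, count, position = 0;

    in = decodeIntSketch( in, delta );
    document += delta;                  // undo the document delta
    in = decodeIntSketch( in, count );

    for( int i = 0; i < count; i++ ) {
      int positionDelta;
      in = decodeIntSketch( in, positionDelta );
      position += positionDelta;        // undo the within-document delta
      // (document, position) is one posting
    }
  }
}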
void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
  // now, fetch the additional terms
  char line[65536];
  _buffer.clear();

  for( int i=0; i<_count; i++ ) {
    // LINK
    _in.getline( line, sizeof line-1 );

    // LINKDOCNO
    _in.getline( line, sizeof line-1 );

    // TEXT= (line+6 skips the TEXT=" prefix)
    _in.getline( line, sizeof line-1 );

    int textLen = strlen( line+6 );
    strcpy( _buffer.write(textLen+1), line+6 );
    _buffer.unwrite(1);

    assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
  }
  *(_buffer.write(1)) = 0;

  // now there's a bunch of text in _buffer, space separated, with each
  // link separated by a " symbol
  char* beginWord = 0;
  int beginIndex = 0;
  char* buffer = _buffer.front();

  for( unsigned int i=0; i<_buffer.position(); i++ ) {
    if( isalnum(buffer[i]) && !beginWord ) {
      beginWord = buffer+i;

      if( !beginIndex )
        beginIndex = terms.size();
    } else if( isspace(buffer[i]) ) {
      buffer[i] = 0;

      if( beginWord )
        terms.push_back( beginWord );
      beginWord = 0;
    } else if( buffer[i] == '\"' ) {
      buffer[i] = 0;

      if( beginWord )
        terms.push_back( beginWord );
      beginWord = 0;

      // only build an extent when this link actually contained terms;
      // allocating unconditionally here would leak the extent whenever
      // beginIndex is zero
      if( beginIndex ) {
        TagExtent * extent = new TagExtent;
        extent->name = "inlink";
        extent->begin = beginIndex;
        extent->end = terms.size();
        extent->number = 0;
        extent->parent = 0;
        assert( extent->begin <= extent->end );

        tags.push_back( extent );

        // cap the amount of anchor text we keep for a single document
        if( terms.size() > 125000 )
          break;
      }

      beginIndex = 0;
    }
  }
}
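// Input-format note (inferred from the parsing above, so treat it as an
// assumption rather than a specification): each of the _count records is
// three lines, and the anchor text sits between quotes on the TEXT= line.
// line+6 skips the six characters of `TEXT="`, and the trailing quote is the
// '\"' the tokenizer loop treats as an inlink delimiter. A record might look
// like this (field values are hypothetical):
//
//   LINK=http://example.com/
//   LINKDOCNO=clueweb09-en0000-00-00000
//   TEXT="anchor text for the link"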