//
// _writeBatch
//
// Calls _writeSkip( buffer, document, length ) and then, if <data> holds
// any bytes, copies them to <buffer> and empties <data>.
//
// buffer   - output buffer that receives the skip entry and the batch bytes
// document - forwarded unchanged to _writeSkip
// length   - forwarded unchanged to _writeSkip; asserted to be below 100,000,000
// data     - pending batch bytes; cleared after they have been written
//

void IndexWriter::_writeBatch( indri::file::SequentialWriteBuffer* buffer, int document, int length, indri::utility::Buffer& data ) {
  // debug-only sanity bound on the length handed to _writeSkip
  assert( length < 100*1000*1000 );
  _writeSkip( buffer, document, length );

  // nothing accumulated: the skip entry is all there is to emit
  if( data.position() == 0 )
    return;

  buffer->write( data.front(), data.position() );
  data.clear();
}
bool lemur::file::SortMergeTextFiles::_readLine(FILE *_in, char*& beginLine, size_t& lineLength, indri::utility::Buffer &_buffer ) { lineLength = 0; size_t actual; // make a buffer of a reasonable size so we're not always allocating if( _buffer.size() < 1024*1024 ) _buffer.grow( 1024*1024 ); // if we're running out of room, add 25MB if( (_buffer.size() - _buffer.position()) < 512*1024 ) { _buffer.grow( _buffer.size() + 1024*1024*25 ); } if( _buffer.position() ) { // get rid of null terminator from previous call _buffer.unwrite(1); } size_t readAmount = _buffer.size() - _buffer.position() - 2; // fetch next document line char* buffer = _buffer.write( readAmount ); char* result = fgets( buffer, (int)readAmount, _in ); if(!result) { return false; } actual = strlen(buffer); lineLength += actual; _buffer.unwrite( readAmount - actual ); // all finished reading *_buffer.write(1) = 0; beginLine = _buffer.front() + _buffer.position() - lineLength - 1; // strip the \n off the end if (beginLine[lineLength-1]=='\n') { beginLine[lineLength-1]=0; } return true; }
void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) { // now, fetch the additional terms char line[65536]; _buffer.clear(); for( int i=0; i<_count; i++ ) { // LINK _in.getline( line, sizeof line-1 ); // LINKDOCNO _in.getline( line, sizeof line-1 ); // TEXT= _in.getline( line, sizeof line-1 ); int textLen = strlen(line+6); strcpy( _buffer.write(textLen+1), line+6 ); _buffer.unwrite(1); assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" ); } *(_buffer.write(1)) = 0; // now there's a bunch of text in _buffer, space separated, with each // link separated by a " symbol char* beginWord = 0; int beginIndex = 0; char* buffer = _buffer.front(); for( unsigned int i=0; i<_buffer.position(); i++ ) { if( isalnum(buffer[i]) && !beginWord ) { beginWord = buffer+i; if(!beginIndex) beginIndex = terms.size(); } else if( isspace(buffer[i]) ) { buffer[i] = 0; if( beginWord ) terms.push_back( beginWord ); beginWord = 0; } else if( buffer[i] == '\"' ) { buffer[i] = 0; if( beginWord ) terms.push_back( beginWord ); beginWord = 0; TagExtent * extent = new TagExtent; extent->name = "inlink"; extent->begin = beginIndex; extent->end = terms.size(); extent->number = 0; extent->parent = 0; assert( extent->begin <= extent->end ); if( beginIndex ) { tags.push_back(extent); if( terms.size() > 125000 ) break; } beginIndex = 0; } } }