示例#1
0
//
// _writeBatch
//
// Calls _writeSkip for (document, length) on the output buffer, then
// copies whatever bytes have accumulated in <data> to <buffer> and
// empties <data>.  When <data> holds no bytes only the _writeSkip call
// is made.
//
// length is asserted to be below 100,000,000.
//
void IndexWriter::_writeBatch( indri::file::SequentialWriteBuffer* buffer, int document, int length, indri::utility::Buffer& data ) {
  assert( length < 100*1000*1000 );

  // _writeSkip is always called, even for an empty batch
  _writeSkip( buffer, document, length );

  // nothing accumulated: leave <data> untouched
  if( data.position() == 0 )
    return;

  // flush the accumulated bytes, then reset <data> for the next batch
  buffer->write( data.front(), data.position() );
  data.clear();
}
示例#2
0
//
// _readLine
//
// Reads the next line from <_in> with fgets and appends it, followed by
// a null terminator, to <_buffer>.
//
//   _in        - stream to read from
//   beginLine  - set to the first character of the line inside <_buffer>
//   lineLength - set to the number of characters fgets stored; a trailing
//                '\n' is counted here even though it is overwritten
//                with a null terminator in the buffer
//   _buffer    - storage for the line; grown as needed.  If it already
//                holds data, its last byte is taken to be the terminator
//                written by a previous call and is overwritten.
//
// Returns true if a line was read.  Returns false if fgets returned
// NULL (end of file or read error); in that case lineLength is 0,
// beginLine is not modified, and the buffer position is restored to
// its value on entry.
//
bool lemur::file::SortMergeTextFiles::_readLine(FILE *_in, char*& beginLine, size_t& lineLength, indri::utility::Buffer &_buffer ) {
  lineLength = 0;

  // make a buffer of a reasonable size so we're not always allocating
  if( _buffer.size() < 1024*1024 )
    _buffer.grow( 1024*1024 );
  // if we're running out of room, add 25MB
  if( (_buffer.size() -  _buffer.position()) < 512*1024 ) {
    _buffer.grow( _buffer.size() + 1024*1024*25 );
  }

  // get rid of null terminator from previous call
  const bool hadTerminator = ( _buffer.position() != 0 );
  if( hadTerminator ) {
    _buffer.unwrite(1);
  }

  // leave room for the terminator that is appended below
  size_t readAmount = _buffer.size() - _buffer.position() - 2;

  // fetch next document line
  char* buffer = _buffer.write( readAmount );
  char* result = fgets( buffer, (int)readAmount, _in );

  if(!result) {
    // nothing was read: release the space reserved above and put the
    // previous terminator back, so the buffer does not claim to hold
    // bytes that were never filled in
    _buffer.unwrite( readAmount );
    if( hadTerminator ) {
      *_buffer.write(1) = 0;
    }
    return false;
  }

  // keep only the characters fgets actually stored
  const size_t actual = strlen(buffer);
  lineLength += actual;
  _buffer.unwrite( readAmount - actual );

  // all finished reading
  *_buffer.write(1) = 0;
  beginLine = _buffer.front() + _buffer.position() - lineLength - 1;

  // strip the \n off the end; lineLength can be 0 if the line began
  // with an embedded null character, so guard the index
  if( lineLength > 0 && beginLine[lineLength-1]=='\n' ) {
    beginLine[lineLength-1]=0;
  }

  return true;
}
      //
      // _fetchText
      //
      // Reads _count three-line records from _in (LINK, LINKDOCNO and
      // TEXT= lines).  The first two lines of each record are discarded;
      // from the third, everything after the first 6 characters is
      // appended to _buffer (presumably the 6 characters are the
      // TEXT=" prefix, leaving the link text and its closing quote --
      // confirm against the file format).
      //
      // The accumulated text is then split in place: whitespace and '"'
      // are overwritten with null characters, and a pointer to the start
      // of each word is appended to <terms>.  These pointers refer to
      // _buffer's storage.  For every '"'-terminated link that
      // contributed at least one word, a newly allocated TagExtent named
      // "inlink" covering [first word index, terms.size()) is appended
      // to <tags>.  Links that contribute no words produce no extent.
      //
      // Scanning stops after an extent is added if <terms> holds more
      // than 125000 entries.
      //
      void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
        // now, fetch the additional terms
        char line[65536];
        _buffer.clear();

        for( int i=0; i<_count; i++ ) {
          // LINK
          _in.getline( line, sizeof line-1 );

          // LINKDOCNO 
          _in.getline( line, sizeof line-1 );
          
          // TEXT=
          // clear the line first so a failed read cannot leave the
          // previous line's contents to be mistaken for link text
          line[0] = 0;
          _in.getline( line, sizeof line-1 );

          // a line of 6 characters or fewer carries no link text;
          // skipping it avoids reading past the end of the string
          const size_t lineLen = strlen(line);
          if( lineLen <= 6 )
            continue;

          int textLen = static_cast<int>( lineLen - 6 );
          strcpy( _buffer.write(textLen+1), line+6 );
          // drop the terminator so the next link's text follows directly
          _buffer.unwrite(1);
          
          assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
        }
        *(_buffer.write(1)) = 0;

        // now there's a bunch of text in _buffer, space separated, with each
        // link separated by a " symbol

        char* beginWord = 0;
        int beginIndex = 0;
        // tracked separately from beginIndex because 0 is a valid index
        // when <terms> is empty on entry
        bool haveBegin = false;
        char* buffer = _buffer.front();

        for( unsigned int i=0; i<_buffer.position(); i++ ) {
          // the <cctype> functions require a value representable as
          // unsigned char; plain char may be negative
          const unsigned char c = static_cast<unsigned char>( buffer[i] );

          if( isalnum(c) && !beginWord ) {
            beginWord = buffer+i;

            // remember where the first word of this link lands in <terms>
            if( !haveBegin ) {
              beginIndex = terms.size();
              haveBegin = true;
            }
          } else if( isspace(c) ) {
            // end of a word
            buffer[i] = 0;
            if( beginWord )
              terms.push_back( beginWord );
            beginWord = 0;
          } else if( buffer[i] == '\"' ) {
            // end of a word and of the current link
            buffer[i] = 0;
            if( beginWord )
              terms.push_back( beginWord );
            beginWord = 0;

            if( haveBegin ) {
              // allocate only when the extent is actually handed to
              // <tags>, so an empty link never leaks a TagExtent
              TagExtent * extent = new TagExtent;
              extent->name = "inlink";
              extent->begin = beginIndex;
              extent->end = terms.size();
              extent->number = 0;
              extent->parent = 0;

              assert( extent->begin <= extent->end );

              tags.push_back(extent);
              if( terms.size() > 125000 )
                break;
            }

            haveBegin = false;
            beginIndex = 0;
          }

        }

      }