Exemple #1
0
  void _mergeData() {
    // fill in tags from the document vector fields
    // unnecessary to filter out any fields to remove, as they will
    // be ignored when indexing.
    indri::utility::greedy_vector<indri::index::FieldExtent>& fields = _docIter->currentEntry()->fields() ;
    tl.clear();
    for (size_t i = 0; i < fields.size(); i++) {
      const indri::index::FieldExtent& field = fields[i];
      std::string fieldName = _index->field(fields[i].id);;
    
      tl.addTag(fieldName.c_str(), fieldName.c_str(), field.begin);
      tl.endTag(fieldName.c_str(), fieldName.c_str(), field.end);
    }
    //stuff it into the parsed doc
    tl.writeTagList(parsed->tags);

    // fill in terms from the document text so that they will stop/stem
    // correctly when added to the new repository.
    // Potentially issues with url injection here...
    // probably best not to do this with trecweb/html docs....
    // TODO: test this
    termBuffer.clear();
    termBuffer.grow( parsed->textLength * 2 );

    for (size_t i = 0; i < parsed->positions.size(); i++ ) {
      int start = parsed->positions[i].begin;
      int end = parsed->positions[i].end;
      int token_len = end - start;
      const char *token = parsed->text + start;
      char* write_loc = termBuffer.write( token_len + 1 );
      strncpy( write_loc, token, token_len );
      write_loc[token_len] = '\0';
      parsed->terms.push_back( write_loc );
    }
  }
Exemple #2
0
/// Emits one batch: a skip record (via _writeSkip) for `document`/`length`,
/// followed by whatever bytes are pending in `data`. `data` is cleared after
/// its contents have been written; an empty `data` produces the skip only.
void IndexWriter::_writeBatch( indri::file::SequentialWriteBuffer* buffer, int document, int length, indri::utility::Buffer& data ) {
  assert( length < 100*1000*1000 );
  _writeSkip( buffer, document, length );

  // nothing buffered: the skip record is all there is to write
  if( data.position() == 0 )
    return;

  buffer->write( data.front(), data.position() );
  data.clear();
}
 /// (Re)opens the reader on `anchorFile`, discards buffered data and reads
 /// the first document header.
 void open( const std::string& anchorFile ) {
   // drop any previously opened file and reset the stream state
   // (presumably _in is a std::ifstream -- confirm against the class)
   _in.close();
   _in.clear();

   const char* path = anchorFile.c_str();
   _in.open( path );

   // start from an empty buffer before reading the first header
   _buffer.clear();
   _readDocumentHeader();
 }
/// Inflates one compressed document from `infile`, starting at `offset`,
/// appending the decompressed bytes to `outputBuffer`.
///
/// Before any decompressed data, the write position of `outputBuffer` is
/// advanced by sizeof(indri::api::ParsedDocument) bytes (presumably filled in
/// by the caller afterwards -- confirm). `stream` must already be initialised
/// for inflation; inflateEnd() is called on it once the end of the stream is
/// reached. A negative zlib result is reported through LEMUR_THROW.
static void zlib_read_document( z_stream_s& stream, indri::file::File& infile, UINT64 offset, indri::utility::Buffer& outputBuffer ) {
  char inputBuffer[INPUT_BUFFER_SIZE];

  outputBuffer.grow( INPUT_BUFFER_SIZE );
  outputBuffer.write( sizeof(indri::api::ParsedDocument) );

  stream.avail_in = 0;
  stream.avail_out = 0;

  for(;;) {
    // refill the input side only after zlib has consumed the previous chunk
    if( stream.avail_in == 0 ) {
      UINT64 bytesRead = infile.read( inputBuffer, offset, sizeof inputBuffer );
      offset += bytesRead;

      stream.next_in = (Bytef*) inputBuffer;
      stream.avail_in = bytesRead;
    }

    // offer zlib all of the free space left in the output buffer ...
    const size_t room = outputBuffer.size() - outputBuffer.position();
    stream.avail_out = room;
    stream.next_out = (Bytef*) outputBuffer.write( room );

    int status = inflate( &stream, Z_NO_FLUSH );

    // ... and give back the part it did not fill
    outputBuffer.unwrite( stream.avail_out );

    if( status == Z_STREAM_END ) {
      status = inflate( &stream, Z_FINISH );

      if( status < 0 )
        LEMUR_THROW( status, "Something bad happened while trying to finish decompressing a document." );

      inflateEnd( &stream );
      return;
    }

    if( status < 0 ) {
      LEMUR_THROW( status, "Something bad happened while trying to decompress a document." );
    }

    // output completely filled: enlarge the buffer before the next round
    if( stream.avail_out == 0 ) {
      outputBuffer.grow();
    }
  }
}
Exemple #5
0
      /// Tokenizes the document's "url" metadata value and appends the pieces
      /// to the document as extra terms, wrapped in "url", "urldomain" and
      /// (when the URL has a path) "urlpath" tags.
      ///
      /// The URL is copied into _buffer and split in place: every run of
      /// ASCII letters/digits becomes one term, every other character is
      /// overwritten with NUL. The terms pushed into document->terms point
      /// into _buffer, so they are only valid until _buffer is next modified.
      /// One synthetic one-unit-wide position entry is appended per new term.
      ///
      /// @param document  parsed document to annotate; modified in place
      /// @return          the same `document` pointer (unchanged if the
      ///                  document carries no "url" metadata)
      indri::api::ParsedDocument* transform( indri::api::ParsedDocument* document ) {
        // find the url metadata pair
        indri::utility::greedy_vector<indri::parse::MetadataPair>::iterator iter;

        iter = std::find_if( document->metadata.begin(),
                             document->metadata.end(),
                             indri::parse::MetadataPair::key_equal( "url" ) );

        // no need to stick around if there is no url
        if( iter == document->metadata.end() )
          return document;

        // copy the value into _buffer and NUL-terminate it so it can be
        // chopped up in place. The destination pointer is taken once;
        // calling write() again would advance the buffer position.
        _buffer.clear();
        _buffer.grow( iter->valueLength + 1 );
        char* urlText = _buffer.write( iter->valueLength );
        memcpy( urlText, iter->value, iter->valueLength );
        *_buffer.write(1) = '\0';

        int urlStart = (int)document->terms.size();
        bool lastSkipped = true;   // true while the previous character was a separator
        int remainingStart = -1;   // term index where the path starts; -1 = no '/' seen yet
        int cnt = 0;               // number of terms added by this call

        // skip the beginning stuff (http://). Only the FIRST "//" counts:
        // without the break a later "//" inside the path would win and the
        // host part of the URL would be thrown away.
        for( char* c = urlText; *c; c++ ) {
          if( c[0] == '/' && c[1] == '/' ) {
            urlText = c + 2;
            break;
          }
        }

        // split what is left into alphanumeric terms
        for( char* c = urlText; *c; c++ ) {
          const bool alphanumeric = (*c >= 'A' && *c <= 'Z') ||
                                    (*c >= 'a' && *c <= 'z') ||
                                    (*c >= '0' && *c <= '9');

          if( alphanumeric ) {
            // first character of a new term
            if( lastSkipped ) {
              lastSkipped = false;
              document->terms.push_back( c );
              cnt++;
            }
          } else {
            // the first '/' after the scheme separates domain from path
            if( *c == '/' && remainingStart < 0 )
              remainingStart = (int)document->terms.size();

            *c = 0;
            lastSkipped = true;
          }
        }

        // put in phony positions entries: one unit per new term, continuing
        // after the end of the last real position
        int tokEnd = document->positions.size() ? document->positions[document->positions.size()-1].end : 0;
        for (size_t n = document->terms.size()-cnt; n < document->terms.size(); n++) {
          TermExtent extent;
          extent.begin = tokEnd++; // hope this doesn't run off the end
          extent.end = tokEnd;
          document->positions.push_back( extent );
        }

        // the URL text is now parsed and stored in the document
        // all we need to do now is put some tags around the text.
        // The tags are heap-allocated and handed over to document->tags.
        TagExtent *url = new TagExtent;
        url->begin = urlStart;
        url->end = document->terms.size();
        url->name = "url";
        url->number = 0;
        url->parent = 0;
        document->tags.push_back(url);

        TagExtent *domain = new TagExtent;
        domain->begin = urlStart;
        domain->end = (remainingStart >= 0) ? remainingStart : document->terms.size();
        domain->name = "urldomain";
        domain->number = 0;
        domain->parent = 0;
        document->tags.push_back(domain);

        // -1 is the "no path" sentinel, so 0 is a legitimate start index
        if( remainingStart >= 0 ) {
          indri::parse::TagExtent *urlpath = new TagExtent;
          urlpath->begin = remainingStart;
          urlpath->end = document->terms.size();
          urlpath->name = "urlpath";
          urlpath->number = 0;
          urlpath->parent = 0;
          document->tags.push_back(urlpath);
        }

        return document;
      }
/// Reads the next line from `_in` and appends it, NUL-terminated, to `_buffer`.
///
/// @param _in         stream to read from
/// @param beginLine   out: start of the line inside `_buffer`
/// @param lineLength  out: number of characters fgets() delivered; this still
///                    counts the trailing '\n' even though that character is
///                    overwritten with NUL in the buffer
/// @param _buffer     accumulates the lines; it may be grown here
/// @return            false when fgets() delivers nothing (end of file or
///                    read error); `_buffer` is then left as it was on entry,
///                    apart from any growth
bool lemur::file::SortMergeTextFiles::_readLine(FILE *_in, char*& beginLine, size_t& lineLength, indri::utility::Buffer &_buffer ) {
  lineLength = 0;

  // make a buffer of a reasonable size so we're not always allocating
  if( _buffer.size() < 1024*1024 )
    _buffer.grow( 1024*1024 );
  // if we're running out of room, add 25MB
  if( (_buffer.size() -  _buffer.position()) < 512*1024 ) {
    _buffer.grow( _buffer.size() + 1024*1024*25 );
  }

  // get rid of null terminator from previous call
  const bool droppedTerminator = ( _buffer.position() != 0 );
  if( droppedTerminator ) {
    _buffer.unwrite(1);
  }

  size_t readAmount = _buffer.size() - _buffer.position() - 2;

  // fetch next document line; the whole remaining space is claimed up front
  // and the unused part is returned below
  char* buffer = _buffer.write( readAmount );
  char* result = fgets( buffer, (int)readAmount, _in );

  if(!result) {
    // Nothing was read. Give the claimed space back and restore the
    // terminator; otherwise the buffer looks full afterwards and every
    // further call would grow it by another 25MB.
    _buffer.unwrite( readAmount );
    if( droppedTerminator )
      *_buffer.write(1) = 0;
    return false;
  }

  const size_t actual = strlen(buffer);
  lineLength += actual;
  _buffer.unwrite( readAmount - actual );

  // all finished reading
  *_buffer.write(1) = 0;
  beginLine = _buffer.front() + _buffer.position() - lineLength - 1;

  // strip the \n off the end; the length check covers a line that starts
  // with a NUL byte, for which strlen() reports zero
  if( lineLength > 0 && beginLine[lineLength-1]=='\n' ) {
    beginLine[lineLength-1]=0;
  }

  return true;
}
Exemple #7
0
// Writes the merged inverted list for one term to _invertedOutput.
//
// Layout produced by this function, in file order:
//   1. one control byte: bit 0x01 = a topdocs table follows, bit 0x02 = frequent term
//   2. if topdocs are present: topdocsSpace bytes, holding topdocsCount (int)
//      followed by one (document, count, length) int triple per top document;
//      the space is skipped first and filled in at the end
//   3. the postings, emitted in batches through _writeBatch
//
// lists      - index contexts whose current doc-list entries are merged, in order
// termData   - corpus statistics of the term; decides topdocs/frequent and is
//              checked against the merged counts in the assertions
// listBuffer - buffer the compressed stream writes into; flushed by _writeBatch
// endOffset  - out: position of _invertedOutput after the final batch
void IndexWriter::_addInvertedListData( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, indri::utility::Buffer& listBuffer, UINT64& endOffset ) {
  indri::utility::greedy_vector<WriterIndexContext*>::iterator iter;
  const int minimumSkip = 1<<12; // 4k
  int documentsWritten = 0;

  // NOTE(review): topdocsFraction is not referenced below; topdocsCount is
  // computed with the literal 0.01 instead. documentsWritten is unused too.
  const float topdocsFraction = 0.01f;
  bool hasTopdocs = termData->corpus.documentCount > TOPDOCS_DOCUMENT_COUNT;
  bool isFrequent = termData->corpus.totalCount > FREQUENT_TERM_COUNT;
  int topdocsCount = hasTopdocs ? int(termData->corpus.documentCount * 0.01) : 0;
  // one int for the count plus three 32-bit values per top document
  int topdocsSpace = hasTopdocs ? ((topdocsCount*3*sizeof(UINT32)) + sizeof(int)) : 0;

  // write a control byte
  char control = (hasTopdocs ? 0x01 : 0) | (isFrequent ? 0x02 : 0);
  _invertedOutput->write( &control, 1 );

  UINT64 initialPosition = _invertedOutput->tell();

  // leave some room for the topdocs list
  if( hasTopdocs ) {
    _invertedOutput->seek( topdocsSpace + initialPosition );
  }

  // maintain a list of top documents
  // (top() is the entry that is popped first once the queue exceeds topdocsCount)
  std::priority_queue<DocListIterator::TopDocument,
    std::vector<DocListIterator::TopDocument>,
    DocListIterator::TopDocument::greater> topdocs;

  // count/length ratio of topdocs.top(); 0 until the first entry is pushed
  double threshold = 0;

  int lastDocument = 0;
  int positions = 0;
  int docs = 0;

  // for each matching list:
  for( iter = lists.begin(); iter != lists.end(); ++iter ) {
    indri::index::DocListFileIterator::DocListData* listData = (*iter)->iterator->currentEntry();
    DocListIterator* iterator = listData->iterator;
    Index* index = (*iter)->index;
    indri::utility::RVLCompressStream stream( listBuffer );

    // per-list counters, checked against this list's own term data below
    int listDocs = 0;
    int listPositions = 0;

    while( !iterator->finished() ) {
      // get the latest entry from the list
      DocListIterator::DocumentData* documentData = iterator->currentEntry();

      // add to document counter
      docs++; listDocs++;

      // update the topdocs list
      if( hasTopdocs ) {
        int length = index->documentLength( documentData->document );
        int count = documentData->positions.size();

        // compute DocListIterator::TopDocument::greater (current, top())
        // if false, no reason to insert this entry.
        // note that the test is inverted. 
        //  int(length * threshold) <= count is equivalent to
        // count/length > topdocs.top().count/topdocs.top().length
        // but we use < to force breaking a tie in favor of keeping
        // the first seen document.
        if( int(length * threshold) < count || topdocs.size() < topdocsCount ) {
          // form a topdocs entry for this document
          DocListIterator::TopDocument topDocument( documentData->document,
                                                    count,
                                                    length );
            topdocs.push( topDocument );
            // keep at most topdocsCount entries
            while( topdocs.size() > topdocsCount )
              topdocs.pop();

          threshold = topdocs.top().count / double(topdocs.top().length);
        }
      }
      
      if( listBuffer.position() > minimumSkip ) {
        // time to write in a skip
        _writeBatch( _invertedOutput, documentData->document, listBuffer.position(), listBuffer );

        // delta encode documents by batch
        lastDocument = 0;
      }

      assert( documentData->document > lastDocument );

      // write this entry out to the list:
      // document delta, number of positions, then the position deltas
      stream << documentData->document - lastDocument;
      stream << (int) documentData->positions.size();
      lastDocument = documentData->document;

      int lastPosition = 0;

      for( int i=0; i<documentData->positions.size(); i++ ) {
        stream << (documentData->positions[i] - lastPosition);
        lastPosition = documentData->positions[i];
        positions++; listPositions++;
      }

      iterator->nextEntry();
    }

    // only used by the assertions below
    indri::index::TermData* td = iterator->termData();

    assert( listPositions == td->corpus.totalCount );
    assert( listDocs == td->corpus.documentCount );
  }

  assert( docs == termData->corpus.documentCount );
  assert( positions == termData->corpus.totalCount );

  // write in the final skip info
  _writeBatch( _invertedOutput, -1, listBuffer.position(), listBuffer );
  UINT64 finalPosition = _invertedOutput->tell();

  if( hasTopdocs ) {
    // go back and fill in the space reserved right after the control byte
    _invertedOutput->seek( initialPosition );
    _invertedOutput->write( &topdocsCount, sizeof(int) );
    assert( topdocs.size() == topdocsCount );

    // write these into the topdocs list in order from smallest fraction to largest fraction,
    // where fraction = c(w;D)/|D|
    while( topdocs.size() ) {
      DocListIterator::TopDocument topDocument = topdocs.top();
      _invertedOutput->write( &topDocument.document, sizeof(int) );
      _invertedOutput->write( &topDocument.count, sizeof(int) );
      _invertedOutput->write( &topDocument.length, sizeof(int) );
      topdocs.pop();
    }
    
    assert( (_invertedOutput->tell() - initialPosition) == topdocsSpace );
    // return to the end of the postings so later writes continue from there
    _invertedOutput->seek( finalPosition );
  }

  endOffset = finalPosition;
}
      /// Reads the anchor text of the next `_count` inlinks from `_in`,
      /// appends its words to `terms` and adds one "inlink" tag to `tags`
      /// for every link that contributed at least one term.
      ///
      /// Each link is three input lines (LINK, LINKDOCNO, TEXT); only the
      /// TEXT line is used, starting after its first six characters. The
      /// text of all links is concatenated in _buffer and split in place, so
      /// the pointers pushed into `terms` point into _buffer. Scanning stops
      /// early once `terms` holds more than 125000 entries.
      ///
      /// @param tags   receives heap-allocated TagExtent objects, one per link
      /// @param terms  receives the words of the anchor text
      void _fetchText( indri::utility::greedy_vector<TagExtent *>& tags, indri::utility::greedy_vector<char*>& terms ) {
        // now, fetch the additional terms
        char line[65536];
        _buffer.clear();

        for( int i=0; i<_count; i++ ) {
          // LINK
          _in.getline( line, sizeof line-1 );

          // LINKDOCNO 
          _in.getline( line, sizeof line-1 );

          // TEXT=
          // start from an empty line so that a failed read cannot leave the
          // previous line's content behind
          line[0] = 0;
          _in.getline( line, sizeof line-1 );

          // a line shorter than the six-character prefix has no text; without
          // this check line+6 would point at stale bytes
          const char* text = ( strlen(line) >= 6 ) ? line+6 : "";
          int textLen = strlen(text);
          strcpy( _buffer.write(textLen+1), text );
          // drop the terminator again so the next link's text follows directly
          _buffer.unwrite(1);

          assert( _buffer.position() > 0 && *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
        }
        *(_buffer.write(1)) = 0;

        // now there's a bunch of text in _buffer, space separated, with each
        // link separated by a " symbol

        char* beginWord = 0;     // start of the word being scanned, 0 between words
        int beginIndex = -1;     // index in `terms` of the current link's first term;
                                 // -1 = no term yet (0 is a valid index)
        char* buffer = _buffer.front();

        for( unsigned int i=0; i<_buffer.position(); i++ ) {
          // the <cctype> functions require a value representable as unsigned char
          const unsigned char ch = (unsigned char) buffer[i];

          if( isalnum(ch) && !beginWord ) {
            beginWord = buffer+i;

            if( beginIndex < 0 )
              beginIndex = (int) terms.size();
          } else if( isspace(ch) ) {
            buffer[i] = 0;
            if( beginWord )
              terms.push_back( beginWord );
            beginWord = 0;
          } else if( ch == '\"' ) {
            // end of one link's text
            buffer[i] = 0;
            if( beginWord )
              terms.push_back( beginWord );
            beginWord = 0;

            // only links that produced a term get a tag; allocating the tag
            // here (and not before the check) avoids leaking the unused ones
            if( beginIndex >= 0 ) {
              TagExtent * extent = new TagExtent;
              extent->name = "inlink";
              extent->begin = beginIndex;
              extent->end = terms.size();
              extent->number = 0;
              extent->parent = 0;

              assert( extent->begin <= extent->end );

              tags.push_back(extent);
              if( terms.size() > 125000 )
                break;
            }

            beginIndex = -1;
          }

        }

      }
Exemple #9
0
 /// Creates a modifier with its pointer members (fce, parsed, _docIter) set
 /// to NULL and with termBuffer pre-sized.
 FieldModifier()
   : fce(NULL),
     parsed(NULL),
     _docIter(NULL)
 {
   // 1MB of term storage up front
   termBuffer.grow( 1024*1024 );
 }