Exemple #1
0
//
// _addInvertedListData
//
// Merges the postings of one term from every index context in <lists> and
// appends them to _invertedOutput.
//
// Bytes written, in order:
//   1. A control byte: bit 0x01 is set when a topdocs region follows,
//      bit 0x02 is set when the term's total count exceeds FREQUENT_TERM_COUNT.
//   2. Only when bit 0x01 is set: the topdocs region, an int count followed
//      by that many (document, count, length) int triples.  Its space is
//      reserved up front by seeking past it and it is filled in at the end,
//      once every posting has been seen.
//   3. The postings themselves, RVL-compressed into <listBuffer> and handed
//      to _writeBatch each time the buffer grows past minimumSkip bytes,
//      plus one final batch tagged with document id -1.
//
// Parameters:
//   lists      - one context per source index; each context's iterator must
//                currently be positioned on the entry for this term.
//   termData   - merged statistics for the term.  corpus.documentCount and
//                corpus.totalCount drive the topdocs/frequent decisions and
//                are checked against what was actually written (asserts).
//   listBuffer - scratch buffer the compressed postings accumulate in.
//                NOTE(review): _writeBatch is presumed to drain it; confirm.
//   endOffset  - out: output position just past the final batch.
//
void IndexWriter::_addInvertedListData( indri::utility::greedy_vector<WriterIndexContext*>& lists, indri::index::TermData* termData, indri::utility::Buffer& listBuffer, UINT64& endOffset ) {
  indri::utility::greedy_vector<WriterIndexContext*>::iterator iter;
  const int minimumSkip = 1<<12; // 4k

  // Deliberately a double: the product below must be evaluated in double
  // precision, exactly as the literal 0.01 was, so the count written to
  // disk does not change.
  const double topdocsFraction = 0.01;
  bool hasTopdocs = termData->corpus.documentCount > TOPDOCS_DOCUMENT_COUNT;
  bool isFrequent = termData->corpus.totalCount > FREQUENT_TERM_COUNT;
  int topdocsCount = hasTopdocs ? int(termData->corpus.documentCount * topdocsFraction) : 0;
  int topdocsSpace = hasTopdocs ? ((topdocsCount*3*sizeof(UINT32)) + sizeof(int)) : 0;

  // A topdocs region can be present yet have room for zero entries (the
  // fraction truncates to 0).  In that case nothing may be pushed on the
  // heap: it would be popped straight back off and top() on the empty heap
  // is undefined behavior.
  const bool collectTopdocs = hasTopdocs && topdocsCount > 0;

  // write a control byte
  char control = (hasTopdocs ? 0x01 : 0) | (isFrequent ? 0x02 : 0);
  _invertedOutput->write( &control, 1 );

  UINT64 initialPosition = _invertedOutput->tell();

  // leave some room for the topdocs list; it is backfilled after the postings
  if( hasTopdocs ) {
    _invertedOutput->seek( topdocsSpace + initialPosition );
  }

  // maintain a list of top documents; the heap's top() is the weakest
  // entry currently kept, so it is the one evicted when the heap overflows
  std::priority_queue<DocListIterator::TopDocument,
    std::vector<DocListIterator::TopDocument>,
    DocListIterator::TopDocument::greater> topdocs;

  // count/length of the weakest kept topdocs entry
  double threshold = 0;

  // lastDocument is the base for document-id deltas; it carries over from
  // one list to the next and is reset only when a batch is written
  int lastDocument = 0;
  int positions = 0;
  int docs = 0;

  // for each matching list:
  for( iter = lists.begin(); iter != lists.end(); ++iter ) {
    indri::index::DocListFileIterator::DocListData* listData = (*iter)->iterator->currentEntry();
    DocListIterator* iterator = listData->iterator;
    Index* index = (*iter)->index;
    indri::utility::RVLCompressStream stream( listBuffer );

    // per-list totals, verified against this list's own term data below
    int listDocs = 0;
    int listPositions = 0;

    while( !iterator->finished() ) {
      // get the latest entry from the list
      DocListIterator::DocumentData* documentData = iterator->currentEntry();
      const int positionCount = static_cast<int>( documentData->positions.size() );

      // add to document counter
      docs++; listDocs++;

      // update the topdocs list
      if( collectTopdocs ) {
        int length = index->documentLength( documentData->document );
        int count = positionCount;

        // compute DocListIterator::TopDocument::greater (current, top())
        // if false, no reason to insert this entry.
        // note that the test is inverted.
        //  int(length * threshold) <= count is equivalent to
        // count/length > topdocs.top().count/topdocs.top().length
        // but we use < to force breaking a tie in favor of keeping
        // the first seen document.
        // Until the heap is full every document is accepted.
        if( int(length * threshold) < count || static_cast<int>( topdocs.size() ) < topdocsCount ) {
          // form a topdocs entry for this document
          DocListIterator::TopDocument topDocument( documentData->document,
                                                    count,
                                                    length );
          topdocs.push( topDocument );

          // evict the weakest entries until the heap is back at capacity
          while( static_cast<int>( topdocs.size() ) > topdocsCount )
            topdocs.pop();

          // topdocsCount > 0 here, so the heap is non-empty
          threshold = topdocs.top().count / double(topdocs.top().length);
        }
      }

      if( listBuffer.position() > minimumSkip ) {
        // time to write in a skip; it is tagged with the id of the
        // document that is about to be encoded
        _writeBatch( _invertedOutput, documentData->document, listBuffer.position(), listBuffer );

        // delta encode documents by batch: the first document of the new
        // batch is stored as a delta from zero
        lastDocument = 0;
      }

      // document ids must be strictly increasing, across lists as well
      assert( documentData->document > lastDocument );

      // write this entry out to the list: document delta, position count,
      // then the positions as deltas from the previous position
      stream << documentData->document - lastDocument;
      stream << positionCount;
      lastDocument = documentData->document;

      int lastPosition = 0;

      for( int i=0; i<positionCount; i++ ) {
        stream << (documentData->positions[i] - lastPosition);
        lastPosition = documentData->positions[i];
        positions++; listPositions++;
      }

      iterator->nextEntry();
    }

    indri::index::TermData* td = iterator->termData();

    // what was read from this list must match the list's own statistics
    assert( listPositions == td->corpus.totalCount );
    assert( listDocs == td->corpus.documentCount );
  }

  // and the merged totals must match the merged statistics
  assert( docs == termData->corpus.documentCount );
  assert( positions == termData->corpus.totalCount );

  // write in the final skip info
  _writeBatch( _invertedOutput, -1, listBuffer.position(), listBuffer );
  UINT64 finalPosition = _invertedOutput->tell();

  if( hasTopdocs ) {
    // go back and fill in the region reserved right after the control byte
    _invertedOutput->seek( initialPosition );
    _invertedOutput->write( &topdocsCount, sizeof(int) );
    assert( static_cast<int>( topdocs.size() ) == topdocsCount );

    // write these into the topdocs list in order from smallest fraction to largest fraction,
    // where fraction = c(w;D)/|D|
    while( topdocs.size() ) {
      DocListIterator::TopDocument topDocument = topdocs.top();
      _invertedOutput->write( &topDocument.document, sizeof(int) );
      _invertedOutput->write( &topDocument.count, sizeof(int) );
      _invertedOutput->write( &topDocument.length, sizeof(int) );
      topdocs.pop();
    }

    // the backfill must consume exactly the space that was reserved
    assert( (_invertedOutput->tell() - initialPosition) == topdocsSpace );
    _invertedOutput->seek( finalPosition );
  }

  endOffset = finalPosition;
}