Example #1
0
//
// _writeDirectLists
//
// Copies every term list of context->index to directOutput, rewriting each
// term id through a TermTranslator built for this context.  For every list
// written, one DocumentData record is appended to dataOutput and one
// UINT32-sized document length is appended to lengthsOutput.
//
// Layout of each list in directOutput:
//     [UINT32 byte length of the payload][payload written by TermList::write]
// The DocumentData record stores the payload length (byteLength) and the
// position of the payload itself, i.e. just past the length prefix (offset).
//
// Before any list is copied, the (termID, term) pair of every entry returned
// by the index's frequentVocabularyIterator() is added to context->oldFrequent,
// which is then handed to _buildTermTranslator.
//
// Parameters:
//   context       - supplies the source index and the oldFrequent,
//                   oldInfrequent, newlyFrequent and bitmap members passed to
//                   the translator.
//   directOutput  - receives the length-prefixed term lists; flushed on return.
//   lengthsOutput - receives one document length per list; flushed on return.
//   dataOutput    - receives one DocumentData record per list.
//                   NOTE(review): it is NOT flushed here; presumably the
//                   caller flushes it -- confirm.
//
// The term list iterator and the document data iterator are advanced in
// lockstep; that they stay in step is only checked with assert().
//
void IndexWriter::_writeDirectLists( WriterIndexContext* context,
                                     indri::file::SequentialWriteBuffer* directOutput,
                                     indri::file::SequentialWriteBuffer* lengthsOutput,
                                     indri::file::SequentialWriteBuffer* dataOutput ) {

  indri::index::Index* index = context->index;

  // record every frequent term of the source index in oldFrequent
  VocabularyIterator* vocabulary = index->frequentVocabularyIterator();
  vocabulary->startIteration();

  while( !vocabulary->finished() ) {
    indri::index::DiskTermData* diskTermData = vocabulary->currentEntry();

    context->oldFrequent->add( diskTermData->termID, diskTermData->termData->term );
    vocabulary->nextEntry();
  }

  delete vocabulary;
  vocabulary = 0;

  TermListFileIterator* iterator = index->termListFileIterator();
  TermTranslator* translator = _buildTermTranslator( _infrequentTermsReader,
                                                     _frequentTermsReader,
                                                     *context->oldFrequent,
                                                     context->oldInfrequent,
                                                     *context->newlyFrequent,
                                                     index,
                                                     context->bitmap );
  iterator->startIteration();

  // writeList is reused (cleared) for every document; outputBuffer stages
  // serialized lists in memory so directOutput is written in large chunks
  TermList writeList;
  indri::utility::Buffer outputBuffer( 256*1024 );

  indri::index::DocumentDataIterator* dataIterator = index->documentDataIterator();
  // check the pointer once, before its first use
  assert( dataIterator );
  dataIterator->startIteration();

  while( !iterator->finished() ) {
    writeList.clear();
    TermList* list = iterator->currentEntry();
    assert( list );

    // copy and translate terms
    const size_t termCount = list->terms().size();

    for( size_t i = 0; i < termCount; i++ ) {
      const int currentTerm = list->terms()[i];
      assert( currentTerm >= 0 );
      assert( currentTerm <= index->uniqueTermCount() );

      const int translated = (*translator)( currentTerm );
      // only term 0 may translate to 0; every other term must map to a positive id
      assert( translated > 0 || (translated == 0 && currentTerm == 0) );

      writeList.addTerm( translated );
    }

    // copy field data
    const indri::utility::greedy_vector<indri::index::FieldExtent>& fields = list->fields();
    const size_t fieldCount = fields.size();

    for( size_t i = 0; i < fieldCount; i++ ) {
      writeList.addField( fields[i] );
    }

    // record the start position
    const size_t writeStart = outputBuffer.position();

    // write the list, leaving room for a length count
    outputBuffer.write( sizeof(UINT32) );
    writeList.write( outputBuffer );

    // record the end position, compute the payload length (prefix excluded)
    const size_t writeEnd = outputBuffer.position();
    const UINT32 length = static_cast<UINT32>( writeEnd - (writeStart + sizeof(UINT32)) );

    // store length in the space reserved in front of the list
    assert( outputBuffer.position() >= (sizeof(UINT32) + length + writeStart) );
    memcpy( outputBuffer.front() + writeStart, &length, sizeof(UINT32) );

    // get a copy of the document data
    assert( !dataIterator->finished() );
    indri::index::DocumentData documentData = *dataIterator->currentEntry();

    // store offset information
    // tell has to happen before a write or the offset will be wrong:
    // writeStart is relative to the staging buffer, which has not yet been
    // handed to directOutput at this point.
    documentData.byteLength = length;
    documentData.offset = directOutput->tell() + writeStart + sizeof(UINT32);

    // hand the staged lists to directOutput once enough has accumulated
    if( outputBuffer.position() > 128*1024 ) {
      directOutput->write( outputBuffer.front(), outputBuffer.position() );
      outputBuffer.clear();
    }

    dataOutput->write( &documentData, sizeof(DocumentData) );

    int termLength = documentData.totalLength;
    assert( termLength >= 0 );
    lengthsOutput->write( &termLength, sizeof(UINT32) );

    iterator->nextEntry();
    dataIterator->nextEntry();
  }

  delete iterator;
  delete dataIterator;
  delete translator;

  // write whatever is still staged, then flush the outputs this function owns
  directOutput->write( outputBuffer.front(), outputBuffer.position() );
  directOutput->flush();
  lengthsOutput->flush();
  outputBuffer.clear();
}