Example #1
 void run() {
     IndexDetails& id = addIndexWithInfo();
     // Create a SortPhaseOne.
     SortPhaseOne phaseOne;
     phaseOne.sorter.reset( new BSONObjExternalSorter( id.idxInterface(),
                                                       BSON( "a" << 1 ) ) );
     // Add index keys to the phaseOne.
     int32_t nKeys = 130;
     for( int32_t i = 0; i < nKeys; ++i ) {
         phaseOne.sorter->add( BSON( "a" << i ), /* dummy disk loc */ DiskLoc(), false );
     }
     phaseOne.nkeys = phaseOne.n = nKeys;
     phaseOne.sorter->sort( false );
     // Set up remaining arguments.
     set<DiskLoc> dups;
     CurOp* op = cc().curop();
     ProgressMeterHolder pm (op->setMessage("BuildBottomUp",
                                            "BuildBottomUp Progress",
                                            nKeys,
                                            nKeys));
     pm.finished();
     Timer timer;
     // The index's root has not yet been set.
     ASSERT( id.head.isNull() );
     // Finish building the index.
     buildBottomUpPhases2And3<V1>( true,
                                   id,
                                   *phaseOne.sorter,
                                   false,
                                   dups,
                                   op,
                                   &phaseOne,
                                   pm,
                                   timer,
                                   true );
     // The index's root is set after the build is complete.
     ASSERT( !id.head.isNull() );
     // Create a cursor over the index.
     scoped_ptr<BtreeCursor> cursor(
             BtreeCursor::make( nsdetails( _ns ),
                                id,
                                BSON( "" << -1 ),    // startKey below minimum key.
                                BSON( "" << nKeys ), // endKey above maximum key.
                                true,                // endKeyInclusive true.
                                1                    // direction forward.
                                ) );
     // Check that the keys in the index are the expected ones.
     int32_t expectedKey = 0;
     for( ; cursor->ok(); cursor->advance(), ++expectedKey ) {
         ASSERT_EQUALS( expectedKey, cursor->currKey().firstElement().number() );
     }
     ASSERT_EQUALS( nKeys, expectedKey );
 }
Example #2
    void buildBottomUpPhases2And3( bool dupsAllowed,
                                   IndexDetails& idx,
                                   BSONObjExternalSorter& sorter,
                                   bool dropDups,
                                   set<DiskLoc>& dupsToDrop,
                                   CurOp* op,
                                   SortPhaseOne* phase1,
                                   ProgressMeterHolder& pm,
                                   Timer& t )
    {
        BtreeBuilder<V> btBuilder(dupsAllowed, idx);
        BSONObj keyLast;
        auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator();
        verify( pm == op->setMessage( "index: (2/3) btree bottom up" , phase1->nkeys , 10 ) );
        while( i->more() ) {
            RARELY killCurrentOp.checkForInterrupt();
            BSONObjExternalSorter::Data d = i->next();

            try {
                if ( !dupsAllowed && dropDups ) {
                    LastError::Disabled led( lastError.get() );
                    btBuilder.addKey(d.first, d.second);
                }
                else {
                    btBuilder.addKey(d.first, d.second);                    
                }
            }
            catch( AssertionException& e ) {
                if ( dupsAllowed ) {
                    // With dups allowed, no duplicate-key failure is expected here; rethrow.
                    throw;
                }

                if( e.interrupted() ) {
                    killCurrentOp.checkForInterrupt();
                }

                if ( ! dropDups )
                    throw;

                /* We could queue these on disk, but normally there are very few dups,
                   so instead we keep them in RAM and impose a limit.
                */
                dupsToDrop.insert(d.second);
                uassert( 10092 , "too many dups on index build with dropDups=true", dupsToDrop.size() < 1000000 );
            }
            pm.hit();
        }
        pm.finished();
        op->setMessage( "index: (3/3) btree-middle" );
        log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl;
        btBuilder.commit();
        if ( btBuilder.getn() != phase1->nkeys && ! dropDups ) {
            warning() << "not all entries were added to the index, probably some keys were too large" << endl;
        }
    }
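
The catch block above implements the dropDups policy: rather than failing the build on a duplicate key, the offending DiskLoc is remembered in a bounded in-memory set and deleted after the build completes. The following is a minimal, self-contained sketch of that policy; DupKeyError, Builder, Key, and Loc are hypothetical stand-ins for illustration, not MongoDB's actual types.

#include <cstddef>
#include <set>
#include <stdexcept>

// Hypothetical duplicate-key error; the real code catches AssertionException
// and rethrows anything that is not a recoverable duplicate.
struct DupKeyError : std::runtime_error {
    explicit DupKeyError( const char* msg ) : std::runtime_error( msg ) {}
};

// Attempt the insert; on a duplicate, record the location for later deletion
// instead of failing the build. Builder, Key, and Loc are stand-in types.
template <typename Builder, typename Key, typename Loc>
void addKeyOrQueueDup( Builder& builder, const Key& key, const Loc& loc,
                       std::set<Loc>& dupsToDrop, std::size_t maxDups = 1000000 ) {
    try {
        builder.addKey( key, loc );
    }
    catch ( const DupKeyError& ) {
        // Normally there are very few dups, so keeping them in RAM with a
        // hard cap is simpler than spilling the queue to disk.
        dupsToDrop.insert( loc );
        if ( dupsToDrop.size() >= maxDups ) {
            throw std::runtime_error( "too many dups on index build with dropDups=true" );
        }
    }
}
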
Example #3
 void run() {
     // It's necessary to index sufficient keys that a RARELY condition will be triggered.
     int32_t nDocs = 130;
     // Add some data to the collection.
     for( int32_t i = 0; i < nDocs; ++i ) {
         _client.insert( _ns, BSON( "a" << i ) );
     }
     IndexDetails& id = addIndexWithInfo();
     // Create a SortPhaseOne.
     SortPhaseOne phaseOne;
     ProgressMeterHolder pm (cc().curop()->setMessage("InterruptAddKeysToPhaseOne",
                                                      "InterruptAddKeysToPhaseOne Progress",
                                                      nDocs,
                                                      nDocs));
     // Register a request to kill the current operation.
     cc().curop()->kill();
     if ( _mayInterrupt ) {
         // Add keys to phaseOne.
         ASSERT_THROWS( BtreeBasedBuilder::addKeysToPhaseOne( nsdetails(_ns),
                                                              _ns,
                                                              id,
                                                              BSON( "a" << 1 ),
                                                              &phaseOne,
                                                              nDocs,
                                                              pm.get(),
                                                              _mayInterrupt,
                                                              nsdetails(_ns)->idxNo(id) ),
                        UserException );
         // Not all keys were added to phaseOne due to the interrupt.
         ASSERT( static_cast<uint64_t>( nDocs ) > phaseOne.n );
     }
     else {
         // Add keys to phaseOne.
         BtreeBasedBuilder::addKeysToPhaseOne( nsdetails(_ns),
                                               _ns,
                                               id,
                                               BSON( "a" << 1 ),
                                               &phaseOne,
                                               nDocs,
                                               pm.get(),
                                               _mayInterrupt,
                                               nsdetails(_ns)->idxNo(id) );
         // All keys were added to phaseOne despite the kill request, because
         // mayInterrupt == false.
         ASSERT_EQUALS( static_cast<uint64_t>( nDocs ), phaseOne.n );
     }
 }
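
Several of these tests index exactly 130 keys because the RARELY macro, as historically defined in MongoDB's utility headers, executes its body only about once every 128 passes; a loop must therefore run more than 128 iterations before the kill flag is ever consulted. Below is a minimal sketch of such a sampled interrupt check; the 128 interval, the killRequested flag, and the function name are illustrative assumptions, not the macro's exact expansion.

#include <cstdint>
#include <stdexcept>

// Sampled interrupt check: the kill flag is consulted only on every 128th
// iteration, so loops shorter than 128 passes never observe a kill request.
void addKeysSampled( int32_t nDocs, bool mayInterrupt, const bool& killRequested ) {
    for ( int32_t i = 0; i < nDocs; ++i ) {
        if ( mayInterrupt && ( i + 1 ) % 128 == 0 && killRequested ) {
            // Stand-in for the UserException thrown by checkForInterrupt().
            throw std::runtime_error( "operation was interrupted" );
        }
        // ... extract the key for document i and add it to phaseOne ...
    }
}

With nDocs = 130 the check fires exactly once, at i = 127, which is what the interrupt tests rely on.
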
Example #4
        void run() {
            // Add some data to the collection.
            int32_t nDocs = 130;
            for( int32_t i = 0; i < nDocs; ++i ) {
                _client.insert( _ns, BSON( "a" << i ) );
            }

            IndexDescriptor* id = addIndexWithInfo();
            // Create a SortPhaseOne.
            SortPhaseOne phaseOne;
            ProgressMeterHolder pm (cc().curop()->setMessage("AddKeysToPhaseOne",
                                                             "AddKeysToPhaseOne Progress",
                                                             nDocs,
                                                             nDocs));
            // Add keys to phaseOne.
            BtreeBasedBuilder::addKeysToPhaseOne( collection(),
                                                  id,
                                                  BSON( "a" << 1 ),
                                                  &phaseOne,
                                                  pm.get(), true );
            // Keys for all documents were added to phaseOne.
            ASSERT_EQUALS( static_cast<uint64_t>( nDocs ), phaseOne.n );
        }
Example #5
 void run() {
     IndexDescriptor* id = addIndexWithInfo();
     // Create a SortPhaseOne.
     SortPhaseOne phaseOne;
     phaseOne.sorter.reset(new BSONObjExternalSorter(_aFirstSort));
     // It's necessary to index sufficient keys that a RARELY condition will be triggered,
     // but few enough keys that the btree builder will not create an internal node and check
     // for an interrupt internally (which would cause this test to pass spuriously).
     int32_t nKeys = 130;
     // Add index keys to the phaseOne.
     for( int32_t i = 0; i < nKeys; ++i ) {
         phaseOne.sorter->add( BSON( "a" << i ), /* dummy disk loc */ DiskLoc(), false );
     }
     phaseOne.nkeys = phaseOne.n = nKeys;
     phaseOne.sorter->sort( false );
     // Set up remaining arguments.
     set<DiskLoc> dups;
     CurOp* op = cc().curop();
     ProgressMeterHolder pm (op->setMessage("InterruptBuildBottomUp",
                                            "InterruptBuildBottomUp Progress",
                                            nKeys,
                                            nKeys));
     pm.finished();
     Timer timer;
     // The index's root has not yet been set.
     ASSERT( id->getHead().isNull() );
     // Register a request to kill the current operation.
     cc().curop()->kill();
     if ( _mayInterrupt ) {
         // The build is aborted due to the kill request.
         ASSERT_THROWS
                 ( buildBottomUpPhases2And3<V1>( true,
                                                 id,
                                                 *phaseOne.sorter,
                                                 false,
                                                 dups,
                                                 op,
                                                 &phaseOne,
                                                 pm,
                                                 timer,
                                                 _mayInterrupt ),
                   UserException );
         // The root of the index is not set because the build did not complete.
         ASSERT( id->getHead().isNull() );
     }
     else {
         // The build completes despite the kill request because mayInterrupt == false.
         buildBottomUpPhases2And3<V1>( true,
                                       id,
                                       *phaseOne.sorter,
                                       false,
                                       dups,
                                       op,
                                       &phaseOne,
                                       pm,
                                       timer,
                                       _mayInterrupt );
         // The index's root is set after the build is complete.
         ASSERT( !id->getHead().isNull() );
     }
 }
Example #6
Status IndexBuildInterceptor::drainWritesIntoIndex(OperationContext* opCtx,
                                                   const InsertDeleteOptions& options,
                                                   RecoveryUnit::ReadSource readSource) {
    invariant(!opCtx->lockState()->inAWriteUnitOfWork());

    // Callers may request to read at a specific timestamp so that no drained writes are timestamped
    // earlier than their original write timestamp. Also ensure that leaving this function resets
    // the ReadSource to its original value.
    auto resetReadSourceGuard =
        makeGuard([ opCtx, prevReadSource = opCtx->recoveryUnit()->getTimestampReadSource() ] {
            opCtx->recoveryUnit()->abandonSnapshot();
            opCtx->recoveryUnit()->setTimestampReadSource(prevReadSource);
        });

    if (readSource != RecoveryUnit::ReadSource::kUnset) {
        opCtx->recoveryUnit()->abandonSnapshot();
        opCtx->recoveryUnit()->setTimestampReadSource(readSource);
    } else {
        resetReadSourceGuard.dismiss();
    }

    // These are used for logging only.
    int64_t totalDeleted = 0;
    int64_t totalInserted = 0;
    Timer timer;

    const int64_t appliedAtStart = _numApplied;

    // Set up the progress meter. This will never be completely accurate, because more writes can
    // accumulate in the side writes table while draining than were counted when the drain started.
    static const char* curopMessage = "Index Build: draining writes received during build";
    ProgressMeterHolder progress;
    {
        stdx::unique_lock<Client> lk(*opCtx->getClient());
        progress.set(CurOp::get(opCtx)->setProgress_inlock(curopMessage));
    }

    // Force the progress meter to log at the end of every batch. By default, the progress meter
    // only logs after a large number of calls to hit(), but since we batch inserts by up to
    // 1000 records, progress would rarely be displayed.
    progress->reset(_sideWritesCounter.load() - appliedAtStart /* total */,
                    3 /* secondsBetween */,
                    1 /* checkInterval */);

    // Buffer operations into batches to insert per WriteUnitOfWork. Impose an upper limit on the
    // number of documents and the total size of the batch.
    const int32_t kBatchMaxSize = 1000;
    const int64_t kBatchMaxBytes = BSONObjMaxInternalSize;

    int64_t batchSizeBytes = 0;

    std::vector<SideWriteRecord> batch;
    batch.reserve(kBatchMaxSize);

    // Hold on to documents that would exceed the per-batch memory limit. Always insert this first
    // into the next batch.
    boost::optional<SideWriteRecord> stashed;

    auto cursor = _sideWritesTable->rs()->getCursor(opCtx);

    bool atEof = false;
    while (!atEof) {
        opCtx->checkForInterrupt();

        // Stashed records should be inserted into a batch first.
        if (stashed) {
            invariant(batch.empty());
            batch.push_back(std::move(stashed.get()));
            stashed.reset();
        }

        auto record = cursor->next();

        if (record) {
            RecordId currentRecordId = record->id;
            BSONObj docOut = record->data.toBson().getOwned();

            // If the total batch size in bytes would be too large, stash this document and insert
            // the current batch first.
            int objSize = docOut.objsize();
            if (batchSizeBytes + objSize > kBatchMaxBytes) {
                invariant(!stashed);

                // Stash this document to be inserted in the next batch.
                stashed.emplace(currentRecordId, std::move(docOut));
            } else {
                batchSizeBytes += objSize;
                batch.emplace_back(currentRecordId, std::move(docOut));

                // Continue if there is more room in the batch.
                if (batch.size() < kBatchMaxSize) {
                    continue;
                }
            }
        } else {
            atEof = true;
            if (batch.empty())
                break;
        }

        invariant(!batch.empty());

        cursor->save();

        // If we are here, either we have reached the end of the table or the batch is full, so
        // insert everything in one WriteUnitOfWork, and delete each inserted document from the side
        // writes table.
        auto status = writeConflictRetry(opCtx, "index build drain", _indexCatalogEntry->ns(), [&] {
            WriteUnitOfWork wuow(opCtx);
            for (auto& operation : batch) {
                auto status =
                    _applyWrite(opCtx, operation.second, options, &totalInserted, &totalDeleted);
                if (!status.isOK()) {
                    return status;
                }

                // Delete the document from the table as soon as it has been inserted into the
                // index. This ensures that no key is ever inserted twice and no keys are skipped.
                _sideWritesTable->rs()->deleteRecord(opCtx, operation.first);
            }

            // For rollback to work correctly, these writes need to be timestamped. The actual time
            // is not important, as long as it is not older than the most recent visible side write.
            IndexTimestampHelper::setGhostCommitTimestampForWrite(
                opCtx, NamespaceString(_indexCatalogEntry->ns()));

            wuow.commit();
            return Status::OK();
        });
        if (!status.isOK()) {
            return status;
        }

        progress->hit(batch.size());

        // Lock yielding will only happen if we are holding intent locks.
        _tryYield(opCtx);
        cursor->restore();

        // Account for more writes coming in during a batch.
        progress->setTotalWhileRunning(_sideWritesCounter.loadRelaxed() - appliedAtStart);

        _numApplied += batch.size();
        batch.clear();
        batchSizeBytes = 0;
    }

    progress->finished();

    int logLevel = (_numApplied - appliedAtStart > 0) ? 0 : 1;
    LOG(logLevel) << "index build: drain applied " << (_numApplied - appliedAtStart)
                  << " side writes (inserted: " << totalInserted << ", deleted: " << totalDeleted
                  << ") for '" << _indexCatalogEntry->descriptor()->indexName() << "' in "
                  << timer.millis() << " ms";

    return Status::OK();
}
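
The drain loop above bounds each WriteUnitOfWork by both a record count and a byte budget, stashing the record that would overflow the byte budget so it opens the next batch instead. The following standalone sketch captures that accumulation policy; the Record alias and batchRecords function are hypothetical simplifications, not MongoDB APIs.

#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Hypothetical record type; the real code uses SideWriteRecord
// (a RecordId plus an owned BSONObj).
using Record = std::pair<int64_t /*id*/, std::string /*doc*/>;

// Split 'input' into batches of at most 'maxCount' records and roughly
// 'maxBytes' bytes. A record that would overflow the byte budget is stashed
// and becomes the first record of the next batch; a single record larger
// than 'maxBytes' still forms its own batch.
std::vector<std::vector<Record>> batchRecords(std::vector<Record> input,
                                              std::size_t maxCount,
                                              std::size_t maxBytes) {
    std::vector<std::vector<Record>> batches;
    std::vector<Record> batch;
    std::size_t batchBytes = 0;
    std::optional<Record> stashed;

    auto flush = [&] {
        batches.push_back(std::move(batch));
        batch.clear();
        batchBytes = 0;
    };

    for (std::size_t next = 0; next < input.size() || stashed;) {
        // A stashed record always opens the next batch.
        if (stashed) {
            batchBytes += stashed->second.size();
            batch.push_back(std::move(*stashed));
            stashed.reset();
            if (batch.size() >= maxCount) {
                flush();
                continue;
            }
        }
        if (next < input.size()) {
            Record rec = std::move(input[next++]);
            const std::size_t bytes = rec.second.size();
            if (!batch.empty() && batchBytes + bytes > maxBytes) {
                // Byte budget exceeded: hold this record back and flush now.
                stashed.emplace(std::move(rec));
                flush();
                continue;
            }
            batchBytes += bytes;
            batch.push_back(std::move(rec));
        }
        if (batch.size() >= maxCount || next == input.size()) {
            if (!batch.empty()) {
                flush();
            }
        }
    }
    return batches;
}

Flushing before stashing guarantees a stashed record always lands at the front of an otherwise empty batch, which keeps the byte accounting simple.
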
Example #7
Status AbstractIndexAccessMethod::commitBulk(OperationContext* opCtx,
                                             BulkBuilder* bulk,
                                             bool mayInterrupt,
                                             bool dupsAllowed,
                                             set<RecordId>* dupRecords,
                                             std::vector<BSONObj>* dupKeysInserted) {
    // Cannot simultaneously report uninserted duplicates 'dupRecords' and inserted duplicates
    // 'dupKeysInserted'.
    invariant(!(dupRecords && dupKeysInserted));

    Timer timer;

    std::unique_ptr<BulkBuilder::Sorter::Iterator> it(bulk->done());

    static const char* message = "Index Build: inserting keys from external sorter into index";
    ProgressMeterHolder pm;
    {
        stdx::unique_lock<Client> lk(*opCtx->getClient());
        pm.set(CurOp::get(opCtx)->setProgress_inlock(
            message, bulk->getKeysInserted(), 3 /* secondsBetween */));
    }

    auto builder = std::unique_ptr<SortedDataBuilderInterface>(
        _newInterface->getBulkBuilder(opCtx, dupsAllowed));

    bool checkIndexKeySize = shouldCheckIndexKeySize(opCtx);

    BSONObj previousKey;
    const Ordering ordering = Ordering::make(_descriptor->keyPattern());

    while (it->more()) {
        if (mayInterrupt) {
            opCtx->checkForInterrupt();
        }

        WriteUnitOfWork wunit(opCtx);

        // Get the next datum and add it to the builder.
        BulkBuilder::Sorter::Data data = it->next();

        // Before attempting to insert, perform a duplicate key check.
        bool isDup = false;
        if (_descriptor->unique()) {
            isDup = data.first.woCompare(previousKey, ordering) == 0;
            if (isDup && !dupsAllowed) {
                if (dupRecords) {
                    dupRecords->insert(data.second);
                    continue;
                }
                return buildDupKeyErrorStatus(data.first,
                                              _descriptor->parentNS(),
                                              _descriptor->indexName(),
                                              _descriptor->keyPattern());
            }
        }

        Status status = checkIndexKeySize ? checkKeySize(data.first) : Status::OK();
        if (status.isOK()) {
            StatusWith<SpecialFormatInserted> ret = builder->addKey(data.first, data.second);
            status = ret.getStatus();
            if (status.isOK() && ret.getValue() == SpecialFormatInserted::LongTypeBitsInserted)
                _btreeState->setIndexKeyStringWithLongTypeBitsExistsOnDisk(opCtx);
        }

        if (!status.isOK()) {
            // Duplicates are checked before inserting.
            invariant(status.code() != ErrorCodes::DuplicateKey);

            // Overlong key that's OK to skip?
            // TODO SERVER-36385: Remove this when there is no KeyTooLong error.
            if (status.code() == ErrorCodes::KeyTooLong && ignoreKeyTooLong()) {
                continue;
            }

            return status;
        }

        previousKey = data.first.getOwned();

        if (isDup && dupsAllowed && dupKeysInserted) {
            dupKeysInserted->push_back(data.first.getOwned());
        }

        // If we're here, either the key is an allowed duplicate or addKey succeeded.
        pm.hit();
        wunit.commit();
    }

    pm.finished();

    log() << "index build: inserted keys from external sorter into index in " << timer.seconds()
          << " seconds";

    WriteUnitOfWork wunit(opCtx);
    SpecialFormatInserted specialFormatInserted = builder->commit(mayInterrupt);
    // It's OK to insert KeyStrings with long TypeBits, but we need to set the feature
    // tracker bit so that a downgraded binary, which cannot read long TypeBits, fails
    // to start up.
    if (specialFormatInserted == SpecialFormatInserted::LongTypeBitsInserted)
        _btreeState->setIndexKeyStringWithLongTypeBitsExistsOnDisk(opCtx);
    wunit.commit();
    return Status::OK();
}
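
Because the external sorter returns keys in sorted order, the loop above can detect every duplicate by comparing each key only against its immediate predecessor (previousKey). Below is a minimal sketch of that single-pass invariant, using std::string as a hypothetical stand-in for the real key type.

#include <string>
#include <vector>

// For sorted input every group of equal keys is adjacent, so remembering only
// the previous key is enough to find all duplicates in a single pass.
std::vector<std::string> findDuplicates(const std::vector<std::string>& sortedKeys) {
    std::vector<std::string> dups;
    const std::string* previous = nullptr;
    for (const std::string& key : sortedKeys) {
        if (previous && key == *previous) {
            dups.push_back(key);
        }
        previous = &key;
    }
    return dups;
}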