void run() { IndexDetails& id = addIndexWithInfo(); // Create a SortPhaseOne. SortPhaseOne phaseOne; phaseOne.sorter.reset( new BSONObjExternalSorter( id.idxInterface(), BSON( "a" << 1 ) ) ); // Add index keys to the phaseOne. int32_t nKeys = 130; for( int32_t i = 0; i < nKeys; ++i ) { phaseOne.sorter->add( BSON( "a" << i ), /* dummy disk loc */ DiskLoc(), false ); } phaseOne.nkeys = phaseOne.n = nKeys; phaseOne.sorter->sort( false ); // Set up remaining arguments. set<DiskLoc> dups; CurOp* op = cc().curop(); ProgressMeterHolder pm (op->setMessage("BuildBottomUp", "BuildBottomUp Progress", nKeys, nKeys)); pm.finished(); Timer timer; // The index's root has not yet been set. ASSERT( id.head.isNull() ); // Finish building the index. buildBottomUpPhases2And3<V1>( true, id, *phaseOne.sorter, false, dups, op, &phaseOne, pm, timer, true ); // The index's root is set after the build is complete. ASSERT( !id.head.isNull() ); // Create a cursor over the index. scoped_ptr<BtreeCursor> cursor( BtreeCursor::make( nsdetails( _ns ), id, BSON( "" << -1 ), // startKey below minimum key. BSON( "" << nKeys ), // endKey above maximum key. true, // endKeyInclusive true. 1 // direction forward. ) ); // Check that the keys in the index are the expected ones. int32_t expectedKey = 0; for( ; cursor->ok(); cursor->advance(), ++expectedKey ) { ASSERT_EQUALS( expectedKey, cursor->currKey().firstElement().number() ); } ASSERT_EQUALS( nKeys, expectedKey ); }
void buildBottomUpPhases2And3(bool dupsAllowed, IndexDetails& idx, BSONObjExternalSorter& sorter, bool dropDups, set<DiskLoc> &dupsToDrop, CurOp * op, SortPhaseOne *phase1, ProgressMeterHolder &pm, Timer& t ) { BtreeBuilder<V> btBuilder(dupsAllowed, idx); BSONObj keyLast; auto_ptr<BSONObjExternalSorter::Iterator> i = sorter.iterator(); verify( pm == op->setMessage( "index: (2/3) btree bottom up" , phase1->nkeys , 10 ) ); while( i->more() ) { RARELY killCurrentOp.checkForInterrupt(); BSONObjExternalSorter::Data d = i->next(); try { if ( !dupsAllowed && dropDups ) { LastError::Disabled led( lastError.get() ); btBuilder.addKey(d.first, d.second); } else { btBuilder.addKey(d.first, d.second); } } catch( AssertionException& e ) { if ( dupsAllowed ) { // unknown exception?? throw; } if( e.interrupted() ) { killCurrentOp.checkForInterrupt(); } if ( ! dropDups ) throw; /* we could queue these on disk, but normally there are very few dups, so instead we keep in ram and have a limit. */ dupsToDrop.insert(d.second); uassert( 10092 , "too may dups on index build with dropDups=true", dupsToDrop.size() < 1000000 ); } pm.hit(); } pm.finished(); op->setMessage( "index: (3/3) btree-middle" ); log(t.seconds() > 10 ? 0 : 1 ) << "\t done building bottom layer, going to commit" << endl; btBuilder.commit(); if ( btBuilder.getn() != phase1->nkeys && ! dropDups ) { warning() << "not all entries were added to the index, probably some keys were too large" << endl; } }
void run() { // It's necessary to index sufficient keys that a RARELY condition will be triggered. int32_t nDocs = 130; // Add some data to the collection. for( int32_t i = 0; i < nDocs; ++i ) { _client.insert( _ns, BSON( "a" << i ) ); } IndexDetails& id = addIndexWithInfo(); // Create a SortPhaseOne. SortPhaseOne phaseOne; ProgressMeterHolder pm (cc().curop()->setMessage("InterruptAddKeysToPhaseOne", "InterruptAddKeysToPhaseOne Progress", nDocs, nDocs)); // Register a request to kill the current operation. cc().curop()->kill(); if ( _mayInterrupt ) { // Add keys to phaseOne. ASSERT_THROWS( BtreeBasedBuilder::addKeysToPhaseOne( nsdetails(_ns), _ns, id, BSON( "a" << 1 ), &phaseOne, nDocs, pm.get(), _mayInterrupt, nsdetails(_ns)->idxNo(id) ), UserException ); // Not all keys were added to phaseOne due to the interrupt. ASSERT( static_cast<uint64_t>( nDocs ) > phaseOne.n ); } else { // Add keys to phaseOne. BtreeBasedBuilder::addKeysToPhaseOne( nsdetails(_ns), _ns, id, BSON( "a" << 1 ), &phaseOne, nDocs, pm.get(), _mayInterrupt, nsdetails(_ns)->idxNo(id) ); // All keys were added to phaseOne despite to the kill request, because // mayInterrupt == false. ASSERT_EQUALS( static_cast<uint64_t>( nDocs ), phaseOne.n ); } }
void run() { // Add some data to the collection. int32_t nDocs = 130; for( int32_t i = 0; i < nDocs; ++i ) { _client.insert( _ns, BSON( "a" << i ) ); } IndexDescriptor* id = addIndexWithInfo(); // Create a SortPhaseOne. SortPhaseOne phaseOne; ProgressMeterHolder pm (cc().curop()->setMessage("AddKeysToPhaseOne", "AddKeysToPhaseOne Progress", nDocs, nDocs)); // Add keys to phaseOne. BtreeBasedBuilder::addKeysToPhaseOne( collection(), id, BSON( "a" << 1 ), &phaseOne, pm.get(), true ); // Keys for all documents were added to phaseOne. ASSERT_EQUALS( static_cast<uint64_t>( nDocs ), phaseOne.n ); }
void run() { IndexDescriptor* id = addIndexWithInfo(); // Create a SortPhaseOne. SortPhaseOne phaseOne; phaseOne.sorter.reset(new BSONObjExternalSorter(_aFirstSort)); // It's necessary to index sufficient keys that a RARELY condition will be triggered, // but few enough keys that the btree builder will not create an internal node and check // for an interrupt internally (which would cause this test to pass spuriously). int32_t nKeys = 130; // Add index keys to the phaseOne. for( int32_t i = 0; i < nKeys; ++i ) { phaseOne.sorter->add( BSON( "a" << i ), /* dummy disk loc */ DiskLoc(), false ); } phaseOne.nkeys = phaseOne.n = nKeys; phaseOne.sorter->sort( false ); // Set up remaining arguments. set<DiskLoc> dups; CurOp* op = cc().curop(); ProgressMeterHolder pm (op->setMessage("InterruptBuildBottomUp", "InterruptBuildBottomUp Progress", nKeys, nKeys)); pm.finished(); Timer timer; // The index's root has not yet been set. ASSERT( id->getHead().isNull() ); // Register a request to kill the current operation. cc().curop()->kill(); if ( _mayInterrupt ) { // The build is aborted due to the kill request. ASSERT_THROWS ( buildBottomUpPhases2And3<V1>( true, id, *phaseOne.sorter, false, dups, op, &phaseOne, pm, timer, _mayInterrupt ), UserException ); // The root of the index is not set because the build did not complete. ASSERT( id->getHead().isNull() ); } else { // The build is aborted despite the kill request because mayInterrupt == false. buildBottomUpPhases2And3<V1>( true, id, *phaseOne.sorter, false, dups, op, &phaseOne, pm, timer, _mayInterrupt ); // The index's root is set after the build is complete. ASSERT( !id->getHead().isNull() ); } }
/**
 * Applies every record currently in the side writes table to the index, in batches.
 *
 * Records are read with a cursor and buffered until the batch holds kBatchMaxSize records or
 * adding another record would push it past kBatchMaxBytes. Each batch is applied and its
 * records deleted from the side writes table inside a single WriteUnitOfWork, retried on write
 * conflict. A record that would not fit is stashed and becomes the first record of the next
 * batch; its size is counted towards that batch's byte limit.
 *
 * Must not be called while inside a WriteUnitOfWork.
 *
 * @param opCtx       operation context; checked for interrupt once per loop iteration.
 * @param options     forwarded to _applyWrite() for every record.
 * @param readSource  if not kUnset, the snapshot is abandoned and this read source is used for
 *                    the duration of the call; the previous read source is restored on exit.
 * @return the first non-OK status produced while applying a batch, otherwise Status::OK().
 */
Status IndexBuildInterceptor::drainWritesIntoIndex(OperationContext* opCtx,
                                                   const InsertDeleteOptions& options,
                                                   RecoveryUnit::ReadSource readSource) {
    invariant(!opCtx->lockState()->inAWriteUnitOfWork());

    // Callers may request to read at a specific timestamp so that no drained writes are timestamped
    // earlier than their original write timestamp. Also ensure that leaving this function resets
    // the ReadSource to its original value.
    auto resetReadSourceGuard =
        makeGuard([ opCtx, prevReadSource = opCtx->recoveryUnit()->getTimestampReadSource() ] {
            opCtx->recoveryUnit()->abandonSnapshot();
            opCtx->recoveryUnit()->setTimestampReadSource(prevReadSource);
        });

    if (readSource != RecoveryUnit::ReadSource::kUnset) {
        opCtx->recoveryUnit()->abandonSnapshot();
        opCtx->recoveryUnit()->setTimestampReadSource(readSource);
    } else {
        // The read source is left untouched, so there is nothing to restore on exit.
        resetReadSourceGuard.dismiss();
    }

    // These are used for logging only.
    int64_t totalDeleted = 0;
    int64_t totalInserted = 0;
    Timer timer;

    const int64_t appliedAtStart = _numApplied;

    // Set up the progress meter. This will never be completely accurate, because more writes can be
    // read from the side writes table than are observed before draining.
    static const char* curopMessage = "Index Build: draining writes received during build";
    ProgressMeterHolder progress;
    {
        stdx::unique_lock<Client> lk(*opCtx->getClient());
        progress.set(CurOp::get(opCtx)->setProgress_inlock(curopMessage));
    }

    // Force the progress meter to log at the end of every batch. By default, the progress meter
    // only logs after a large number of calls to hit(), but since we batch inserts by up to
    // 1000 records, progress would rarely be displayed.
    progress->reset(_sideWritesCounter.load() - appliedAtStart /* total */,
                    3 /* secondsBetween */,
                    1 /* checkInterval */);

    // Buffer operations into batches to insert per WriteUnitOfWork. Impose an upper limit on the
    // number of documents and the total size of the batch.
    const int32_t kBatchMaxSize = 1000;
    const int64_t kBatchMaxBytes = BSONObjMaxInternalSize;

    int64_t batchSizeBytes = 0;

    std::vector<SideWriteRecord> batch;
    batch.reserve(kBatchMaxSize);

    // Hold on to documents that would exceed the per-batch memory limit. Always insert this first
    // into the next batch.
    boost::optional<SideWriteRecord> stashed;

    auto cursor = _sideWritesTable->rs()->getCursor(opCtx);

    bool atEof = false;
    while (!atEof) {
        opCtx->checkForInterrupt();

        // Stashed records should be inserted into a batch first.
        if (stashed) {
            invariant(batch.empty());
            // Count the stashed document towards this batch's byte limit; otherwise the batch
            // could exceed kBatchMaxBytes by the size of this document.
            batchSizeBytes += stashed->second.objsize();
            batch.push_back(std::move(stashed.get()));
            stashed.reset();
        }

        auto record = cursor->next();

        if (record) {
            RecordId currentRecordId = record->id;
            BSONObj docOut = record->data.toBson().getOwned();

            // If the total batch size in bytes would be too large, stash this document and let the
            // current batch insert.
            int objSize = docOut.objsize();
            if (batchSizeBytes + objSize > kBatchMaxBytes) {
                invariant(!stashed);

                // Stash this document to be inserted in the next batch.
                stashed.emplace(currentRecordId, std::move(docOut));
            } else {
                batchSizeBytes += objSize;
                batch.emplace_back(currentRecordId, std::move(docOut));

                // Continue if there is more room in the batch.
                if (batch.size() < kBatchMaxSize) {
                    continue;
                }
            }
        } else {
            atEof = true;
            if (batch.empty())
                break;
        }

        invariant(!batch.empty());

        cursor->save();

        // If we are here, either we have reached the end of the table or the batch is full, so
        // insert everything in one WriteUnitOfWork, and delete each inserted document from the side
        // writes table.
        auto status =
            writeConflictRetry(opCtx, "index build drain", _indexCatalogEntry->ns(), [&] {
                WriteUnitOfWork wuow(opCtx);
                for (auto& operation : batch) {
                    auto applyStatus = _applyWrite(
                        opCtx, operation.second, options, &totalInserted, &totalDeleted);
                    if (!applyStatus.isOK()) {
                        return applyStatus;
                    }

                    // Delete the document from the table as soon as it has been inserted into the
                    // index. This ensures that no key is ever inserted twice and no keys are
                    // skipped.
                    _sideWritesTable->rs()->deleteRecord(opCtx, operation.first);
                }

                // For rollback to work correctly, these writes need to be timestamped. The actual
                // time is not important, as long as it not older than the most recent visible side
                // write.
                IndexTimestampHelper::setGhostCommitTimestampForWrite(
                    opCtx, NamespaceString(_indexCatalogEntry->ns()));

                wuow.commit();
                return Status::OK();
            });
        if (!status.isOK()) {
            return status;
        }

        progress->hit(batch.size());

        // Lock yielding will only happen if we are holding intent locks.
        _tryYield(opCtx);
        cursor->restore();

        // Account for more writes coming in during a batch.
        progress->setTotalWhileRunning(_sideWritesCounter.loadRelaxed() - appliedAtStart);

        _numApplied += batch.size();
        batch.clear();
        batchSizeBytes = 0;
    }

    progress->finished();

    // Log at level 0 only when at least one side write was applied by this call.
    int logLevel = (_numApplied - appliedAtStart > 0) ? 0 : 1;
    LOG(logLevel) << "index build: drain applied " << (_numApplied - appliedAtStart)
                  << " side writes (inserted: " << totalInserted << ", deleted: " << totalDeleted
                  << ") for '" << _indexCatalogEntry->descriptor()->indexName() << "' in "
                  << timer.millis() << " ms";

    return Status::OK();
}
/**
 * Drains the sorted keys held by 'bulk' into the index through a bulk builder, then commits
 * the builder.
 *
 * Each key is added inside its own WriteUnitOfWork. For unique indexes a key equal to the
 * previously inserted key is treated as a duplicate before any insert is attempted.
 *
 * @param opCtx            operation context.
 * @param bulk             bulk builder whose done() yields the sorted key iterator.
 * @param mayInterrupt     when true, checks for interrupt before each key; also forwarded to
 *                         the builder's commit().
 * @param dupsAllowed      whether duplicates on a unique index may be inserted.
 * @param dupRecords       if non-null, receives the RecordIds of duplicates that were skipped
 *                         (only when dupsAllowed is false).
 * @param dupKeysInserted  if non-null, receives the keys of duplicates that were inserted
 *                         (only when dupsAllowed is true).
 * @return a duplicate-key error, a non-ignorable error from adding a key, or Status::OK().
 */
Status AbstractIndexAccessMethod::commitBulk(OperationContext* opCtx,
                                             BulkBuilder* bulk,
                                             bool mayInterrupt,
                                             bool dupsAllowed,
                                             set<RecordId>* dupRecords,
                                             std::vector<BSONObj>* dupKeysInserted) {
    // Cannot simultaneously report uninserted duplicates 'dupRecords' and inserted duplicates
    // 'dupKeysInserted'.
    invariant(!(dupRecords && dupKeysInserted));

    Timer timer;

    std::unique_ptr<BulkBuilder::Sorter::Iterator> sortedKeys(bulk->done());

    static const char* message = "Index Build: inserting keys from external sorter into index";
    ProgressMeterHolder pm;
    {
        stdx::unique_lock<Client> lk(*opCtx->getClient());
        pm.set(CurOp::get(opCtx)->setProgress_inlock(
            message, bulk->getKeysInserted(), 3 /* secondsBetween */));
    }

    std::unique_ptr<SortedDataBuilderInterface> builder(
        _newInterface->getBulkBuilder(opCtx, dupsAllowed));

    bool checkIndexKeySize = shouldCheckIndexKeySize(opCtx);

    BSONObj previousKey;
    const Ordering ordering = Ordering::make(_descriptor->keyPattern());

    while (sortedKeys->more()) {
        if (mayInterrupt) {
            opCtx->checkForInterrupt();
        }

        WriteUnitOfWork keyWuow(opCtx);

        // Pull the next (key, RecordId) pair from the sorter.
        BulkBuilder::Sorter::Data entry = sortedKeys->next();

        // Duplicate detection happens before the insert is attempted, and only for unique
        // indexes: a key is a duplicate when it compares equal to the last key inserted.
        const bool isDup =
            _descriptor->unique() && entry.first.woCompare(previousKey, ordering) == 0;
        if (isDup && !dupsAllowed) {
            if (!dupRecords) {
                return buildDupKeyErrorStatus(entry.first,
                                              _descriptor->parentNS(),
                                              _descriptor->indexName(),
                                              _descriptor->keyPattern());
            }
            dupRecords->insert(entry.second);
            continue;
        }

        Status addStatus = Status::OK();
        if (checkIndexKeySize) {
            addStatus = checkKeySize(entry.first);
        }
        if (addStatus.isOK()) {
            StatusWith<SpecialFormatInserted> added = builder->addKey(entry.first, entry.second);
            addStatus = added.getStatus();
            if (addStatus.isOK() &&
                added.getValue() == SpecialFormatInserted::LongTypeBitsInserted) {
                _btreeState->setIndexKeyStringWithLongTypeBitsExistsOnDisk(opCtx);
            }
        }

        if (!addStatus.isOK()) {
            // Duplicates are checked before inserting.
            invariant(addStatus.code() != ErrorCodes::DuplicateKey);

            // Overlong key that's OK to skip?
            // TODO SERVER-36385: Remove this when there is no KeyTooLong error.
            if (addStatus.code() == ErrorCodes::KeyTooLong && ignoreKeyTooLong()) {
                continue;
            }

            return addStatus;
        }

        previousKey = entry.first.getOwned();

        if (isDup && dupsAllowed && dupKeysInserted) {
            dupKeysInserted->push_back(entry.first.getOwned());
        }

        // If we're here either it's a dup and we're cool with it or the addKey went just fine.
        pm.hit();
        keyWuow.commit();
    }

    pm.finished();

    log() << "index build: inserted keys from external sorter into index in " << timer.seconds()
          << " seconds";

    WriteUnitOfWork commitWuow(opCtx);
    SpecialFormatInserted specialFormatInserted = builder->commit(mayInterrupt);
    // It's ok to insert KeyStrings with long TypeBits but we need to mark the feature
    // tracker bit so that downgrade binary which cannot read the long TypeBits fails to
    // start up.
    if (specialFormatInserted == SpecialFormatInserted::LongTypeBitsInserted) {
        _btreeState->setIndexKeyStringWithLongTypeBitsExistsOnDisk(opCtx);
    }
    commitWuow.commit();
    return Status::OK();
}