// static
void QueryYield::yieldAllLocks(OperationContext* opCtx,
                               stdx::function<void()> whileYieldingFn,
                               const NamespaceString& planExecNS) {
    // Things have to happen here in a specific order:
    //   * Release lock mgr locks
    //   * Go to sleep
    //   * Call the whileYieldingFn
    //   * Reacquire lock mgr locks

    Locker* locker = opCtx->lockState();

    Locker::LockSnapshot snapshot;

    // Nothing was unlocked, just return, yielding is pointless.
    if (!locker->saveLockStateAndUnlock(&snapshot)) {
        return;
    }

    // Top-level locks are freed, release any potential low-level (storage engine-specific)
    // locks. If we are yielding, we are at a safe place to do so.
    opCtx->recoveryUnit()->abandonSnapshot();

    // Track the number of yields in CurOp.
    CurOp::get(opCtx)->yielded();

    MONGO_FAIL_POINT_BLOCK(setYieldAllLocksHang, config) {
        StringData ns{config.getData().getStringField("namespace")};
        if (ns.empty() || ns == planExecNS.ns()) {
            MONGO_FAIL_POINT_PAUSE_WHILE_SET(setYieldAllLocksHang);
        }
    }

    // Run the caller's callback with no locks held, then reacquire the locks.
    if (whileYieldingFn) {
        whileYieldingFn();
    }

    locker->restoreLockState(opCtx, snapshot);
}
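// ---------------------------------------------------------------------------
// Illustrative sketch (not MongoDB source). The ordering above -- save and
// release the lock manager locks, abandon the storage snapshot, run the
// caller's yield callback, then restore the locks -- is the whole contract of
// yieldAllLocks(). The standalone model below uses hypothetical Locker and
// RecoveryUnit stand-ins, purely to make that ordering explicit.
// ---------------------------------------------------------------------------
#include <functional>
#include <iostream>
#include <string>
#include <vector>

namespace sketch {

struct Locker {
    std::vector<std::string> held;  // Names of currently held locks.

    // Returns false when nothing is held, mirroring saveLockStateAndUnlock().
    bool saveAndUnlock(std::vector<std::string>* snapshot) {
        if (held.empty())
            return false;
        *snapshot = std::move(held);
        held.clear();
        return true;
    }

    void restore(std::vector<std::string> snapshot) {
        held = std::move(snapshot);
    }
};

struct RecoveryUnit {
    void abandonSnapshot() { std::cout << "storage snapshot abandoned\n"; }
};

// Minimal analogue of QueryYield::yieldAllLocks(): each step must happen in
// this order, or the callback could observe a stale snapshot or run while
// locks are still held.
void yieldAllLocks(Locker* locker, RecoveryUnit* ru, std::function<void()> whileYieldingFn) {
    std::vector<std::string> snapshot;
    if (!locker->saveAndUnlock(&snapshot))
        return;  // Nothing was unlocked; yielding is pointless.

    ru->abandonSnapshot();  // Release storage-engine-level resources too.

    if (whileYieldingFn)
        whileYieldingFn();  // Runs with no locks held.

    locker->restore(std::move(snapshot));  // Reacquire before resuming the plan.
}

}  // namespace sketch

int main() {
    sketch::Locker locker{{"db.test_IX", "coll.test_IX"}};
    sketch::RecoveryUnit ru;
    sketch::yieldAllLocks(&locker, &ru, [] { std::cout << "yield callback ran lock-free\n"; });
    std::cout << "locks restored: " << locker.held.size() << "\n";
}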
void IndexBuildInterceptor::_tryYield(OperationContext* opCtx) {
    // Never yield while holding locks that prevent writes to the collection: only yield while
    // holding intent locks. This check considers all locks in the hierarchy that would cover
    // this mode.
    const NamespaceString nss(_indexCatalogEntry->ns());
    if (opCtx->lockState()->isCollectionLockedForMode(nss, MODE_S)) {
        return;
    }
    DEV {
        invariant(!opCtx->lockState()->isCollectionLockedForMode(nss, MODE_X));
        invariant(!opCtx->lockState()->isDbLockedForMode(nss.db(), MODE_X));
    }

    // Releasing locks means a new snapshot should be acquired when restored.
    opCtx->recoveryUnit()->abandonSnapshot();

    auto locker = opCtx->lockState();
    Locker::LockSnapshot snapshot;
    invariant(locker->saveLockStateAndUnlock(&snapshot));

    // Track the number of yields in CurOp.
    CurOp::get(opCtx)->yielded();

    MONGO_FAIL_POINT_BLOCK(hangDuringIndexBuildDrainYield, config) {
        StringData ns{config.getData().getStringField("namespace")};
        if (ns == _indexCatalogEntry->ns()) {
            log() << "Hanging index build during drain yield";
            MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangDuringIndexBuildDrainYield);
        }
    }

    locker->restoreLockState(opCtx, snapshot);
}
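// ---------------------------------------------------------------------------
// Illustrative sketch (not MongoDB source). The guard above relies on lock
// mode *coverage*: isCollectionLockedForMode(nss, MODE_S) asks "would the
// locks already held satisfy a MODE_S request?", which is true under S or X
// but false under the intent modes IS/IX -- that is why a drain holding only
// intent locks is allowed to yield. A minimal standalone encoding of that
// coverage relation, assuming the four modes used here:
// ---------------------------------------------------------------------------
#include <cassert>

namespace sketch {

enum class Mode { IS, IX, S, X };

// True when a lock already held in mode `held` also covers a request for
// mode `wanted` (i.e. `held` is at least as strong).
bool covers(Mode held, Mode wanted) {
    switch (held) {
        case Mode::X:
            return true;  // X covers everything.
        case Mode::S:
            return wanted == Mode::S || wanted == Mode::IS;
        case Mode::IX:
            return wanted == Mode::IX || wanted == Mode::IS;
        case Mode::IS:
            return wanted == Mode::IS;
    }
    return false;
}

}  // namespace sketch

int main() {
    using sketch::Mode;
    // An index build drain holds intent locks, so MODE_S is not covered and
    // yielding proceeds...
    assert(!sketch::covers(Mode::IX, Mode::S));
    // ...but under S or X the guard fires and _tryYield returns early.
    assert(sketch::covers(Mode::S, Mode::S));
    assert(sketch::covers(Mode::X, Mode::S));
}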
Status PlanYieldPolicy::yield(stdx::function<void()> beforeYieldingFn,
                              stdx::function<void()> whileYieldingFn) {
    invariant(_planYielding);
    invariant(canAutoYield());

    // After we finish yielding (or in any early return), call resetTimer() to prevent yielding
    // again right away. We delay the resetTimer() call so that the clock doesn't start ticking
    // until after we return from the yield.
    ON_BLOCK_EXIT([this]() { resetTimer(); });

    _forceYield = false;

    OperationContext* opCtx = _planYielding->getOpCtx();
    invariant(opCtx);
    invariant(!opCtx->lockState()->inAWriteUnitOfWork());

    // Can't use writeConflictRetry since we need to call saveState before resetting the
    // transaction.
    for (int attempt = 1; true; attempt++) {
        try {
            // All YIELD_AUTO plans will get here eventually when the elapsed tracker triggers
            // that it's time to yield. Whether or not we will actually yield, we need to check
            // if this operation has been interrupted.
            if (_policy == PlanExecutor::YIELD_AUTO) {
                MONGO_FAIL_POINT_PAUSE_WHILE_SET(setCheckForInterruptHang);

                auto interruptStatus = opCtx->checkForInterruptNoAssert();
                if (!interruptStatus.isOK()) {
                    return interruptStatus;
                }
            }

            try {
                _planYielding->saveState();
            } catch (const WriteConflictException&) {
                invariant(!"WriteConflictException not allowed in saveState");
            }

            if (_policy == PlanExecutor::WRITE_CONFLICT_RETRY_ONLY) {
                // Just reset the snapshot. Leave all LockManager locks alone.
                opCtx->recoveryUnit()->abandonSnapshot();
            } else {
                // Release and reacquire locks.
                if (beforeYieldingFn)
                    beforeYieldingFn();
                QueryYield::yieldAllLocks(opCtx, whileYieldingFn, _planYielding->nss());
            }

            return _planYielding->restoreStateWithoutRetrying();
        } catch (const WriteConflictException&) {
            CurOp::get(opCtx)->debug().writeConflicts++;
            WriteConflictException::logAndBackoff(
                attempt, "plan execution restoreState", _planYielding->nss().ns());
            // retry
        }
    }
}
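// ---------------------------------------------------------------------------
// Illustrative sketch (not MongoDB source). yield() hand-rolls its retry loop
// because saveState() must run before the snapshot is reset, so the generic
// writeConflictRetry() helper can't be used. The standalone sketch below has
// the same shape: retry indefinitely on a conflict exception, backing off a
// little more on each attempt, and let any success return out of the loop.
// The names here are stand-ins, not the real exception or helper.
// ---------------------------------------------------------------------------
#include <chrono>
#include <cstdio>
#include <stdexcept>
#include <thread>

namespace sketch {

struct WriteConflict : std::runtime_error {
    WriteConflict() : std::runtime_error("write conflict") {}
};

// Analogue of WriteConflictException::logAndBackoff(): sleep a bit longer on
// each successive attempt so contending operations interleave.
void logAndBackoff(int attempt, const char* opName) {
    std::fprintf(stderr, "write conflict in %s, attempt %d\n", opName, attempt);
    std::this_thread::sleep_for(std::chrono::milliseconds(attempt));
}

template <typename Fn>
auto retryOnWriteConflict(const char* opName, Fn fn) -> decltype(fn()) {
    for (int attempt = 1; true; attempt++) {
        try {
            return fn();
        } catch (const WriteConflict&) {
            logAndBackoff(attempt, opName);
            // Loop around and retry.
        }
    }
}

}  // namespace sketch

int main() {
    int failuresLeft = 3;  // Simulate a few transient conflicts.
    int result = sketch::retryOnWriteConflict("demo op", [&] {
        if (failuresLeft-- > 0)
            throw sketch::WriteConflict();
        return 42;
    });
    std::printf("succeeded with %d\n", result);
}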
Status PlanYieldPolicy::yieldOrInterrupt(stdx::function<void()> beforeYieldingFn,
                                         stdx::function<void()> whileYieldingFn) {
    if (_policy == PlanExecutor::INTERRUPT_ONLY) {
        ON_BLOCK_EXIT([this]() { resetTimer(); });
        OperationContext* opCtx = _planYielding->getOpCtx();
        invariant(opCtx);
        MONGO_FAIL_POINT_PAUSE_WHILE_SET(setCheckForInterruptHang);
        return opCtx->checkForInterruptNoAssert();
    }

    return yield(beforeYieldingFn, whileYieldingFn);
}
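// ---------------------------------------------------------------------------
// Illustrative sketch (not MongoDB source). A plan executor's work loop is
// the typical caller of yieldOrInterrupt(): each iteration it asks the policy
// whether enough work has elapsed, and if so either fully yields or, under
// INTERRUPT_ONLY, just polls for interruption without releasing anything. The
// loop below is a hypothetical, simplified caller; shouldYield() and
// yieldOrInterrupt() here are stand-ins, not the real PlanYieldPolicy API.
// ---------------------------------------------------------------------------
#include <cstdio>

namespace sketch {

struct Status {
    bool ok;
    const char* reason;
};

struct YieldPolicy {
    int workedSinceYield = 0;
    int yieldPeriod = 10;  // Yield every N work units (stand-in for the elapsed tracker).

    bool shouldYield() { return ++workedSinceYield >= yieldPeriod; }

    Status yieldOrInterrupt() {
        workedSinceYield = 0;  // Analogue of resetTimer().
        // The real code releases locks and/or checks for interrupt here.
        std::puts("yield point reached");
        return {true, ""};
    }
};

// A simplified executor loop: do a unit of work, and interleave yield checks
// so long-running plans stay killable and don't starve other operations.
Status runPlan(YieldPolicy* policy, int totalWork) {
    for (int i = 0; i < totalWork; i++) {
        if (policy->shouldYield()) {
            Status s = policy->yieldOrInterrupt();
            if (!s.ok)
                return s;  // Operation was interrupted (e.g. killOp).
        }
        // ... produce the next document here ...
    }
    return {true, ""};
}

}  // namespace sketch

int main() {
    sketch::YieldPolicy policy;
    sketch::Status s = sketch::runPlan(&policy, 35);
    std::printf("plan finished: %s\n", s.ok ? "ok" : s.reason);
}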
Status dropCollection(OperationContext* opCtx,
                      const NamespaceString& collectionName,
                      BSONObjBuilder& result,
                      const repl::OpTime& dropOpTime,
                      DropCollectionSystemCollectionMode systemCollectionMode) {
    if (!serverGlobalParams.quiet.load()) {
        log() << "CMD: drop " << collectionName;
    }

    return writeConflictRetry(opCtx, "drop", collectionName.ns(), [&] {
        AutoGetDb autoDb(opCtx, collectionName.db(), MODE_X);
        Database* const db = autoDb.getDb();
        Collection* coll = db ? db->getCollection(opCtx, collectionName) : nullptr;
        auto view =
            db && !coll ? db->getViewCatalog()->lookup(opCtx, collectionName.ns()) : nullptr;

        if (MONGO_FAIL_POINT(hangDuringDropCollection)) {
            log() << "hangDuringDropCollection fail point enabled. Blocking until fail point is "
                     "disabled.";
            MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangDuringDropCollection);
        }

        if (!db || (!coll && !view)) {
            return Status(ErrorCodes::NamespaceNotFound, "ns not found");
        }

        const bool shardVersionCheck = true;
        OldClientContext context(opCtx, collectionName.ns(), shardVersionCheck);

        bool userInitiatedWritesAndNotPrimary = opCtx->writesAreReplicated() &&
            !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, collectionName);

        if (userInitiatedWritesAndNotPrimary) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping collection "
                                        << collectionName);
        }

        WriteUnitOfWork wunit(opCtx);
        if (!result.hasField("ns")) {
            result.append("ns", collectionName.ns());
        }

        if (coll) {
            invariant(!view);
            int numIndexes = coll->getIndexCatalog()->numIndexesTotal(opCtx);

            BackgroundOperation::assertNoBgOpInProgForNs(collectionName.ns());

            Status s = systemCollectionMode ==
                    DropCollectionSystemCollectionMode::kDisallowSystemCollectionDrops
                ? db->dropCollection(opCtx, collectionName.ns(), dropOpTime)
                : db->dropCollectionEvenIfSystem(opCtx, collectionName, dropOpTime);

            if (!s.isOK()) {
                return s;
            }

            result.append("nIndexesWas", numIndexes);
        } else {
            invariant(view);
            Status status = db->dropView(opCtx, collectionName.ns());
            if (!status.isOK()) {
                return status;
            }
        }
        wunit.commit();

        return Status::OK();
    });
}
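// ---------------------------------------------------------------------------
// Illustrative sketch (not MongoDB source). dropCollection() stages the drop
// inside a WriteUnitOfWork: if wunit.commit() is never reached (an early
// return, or an exception such as a write conflict), the unit's destructor
// rolls the staged changes back, which is what lets the enclosing
// writeConflictRetry() safely re-run the whole lambda. The standalone RAII
// model below shows that commit-or-rollback shape.
// ---------------------------------------------------------------------------
#include <functional>
#include <iostream>

namespace sketch {

class WriteUnitOfWork {
public:
    explicit WriteUnitOfWork(std::function<void()> rollbackFn)
        : _rollbackFn(std::move(rollbackFn)) {}

    ~WriteUnitOfWork() {
        if (!_committed)
            _rollbackFn();  // Not committed: undo the staged changes.
    }

    void commit() { _committed = true; }

private:
    std::function<void()> _rollbackFn;
    bool _committed = false;
};

}  // namespace sketch

int main() {
    bool dropped = false;
    {
        sketch::WriteUnitOfWork wunit([&] { dropped = false; });
        dropped = true;  // Stage the catalog change.
        // An early return or exception here would leave `dropped` rolled back.
        wunit.commit();
    }
    std::cout << "collection dropped: " << std::boolalpha << dropped << "\n";
}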
Status MigrationSourceManager::commitDonateChunk(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCriticalSection);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Tell the recipient shard to fetch the latest changes
    Status commitCloneStatus = _cloneDriver->commitClone(txn);

    if (MONGO_FAIL_POINT(failMigrationCommit) && commitCloneStatus.isOK()) {
        commitCloneStatus = {ErrorCodes::InternalError,
                             "Failing _recvChunkCommit due to failpoint."};
    }

    if (!commitCloneStatus.isOK()) {
        return {commitCloneStatus.code(),
                str::stream() << "commit clone failed due to " << commitCloneStatus.toString()};
    }

    // Generate the next collection version.
    ChunkVersion uncommittedCollVersion = _committedMetadata->getCollVersion();
    uncommittedCollVersion.incMajor();

    // applyOps preparation for reflecting the uncommitted metadata on the config server

    // Preconditions
    BSONArrayBuilder preCond;
    {
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
        b.append("q",
                 BSON("query" << BSON(ChunkType::ns(_args.getNss().ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
        {
            BSONObjBuilder bb(b.subobjStart("res"));
            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(),
                               _committedMetadata->getCollVersion().toLong());
            bb.done();
        }
        preCond.append(b.obj());
    }

    // Update for the chunk which is being donated
    BSONArrayBuilder updates;
    {
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        uncommittedCollVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), _args.getMinKey());
        n.append(ChunkType::max(), _args.getMaxKey());
        n.append(ChunkType::shard(), _args.getToShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        q.done();

        updates.append(op.obj());
    }

    // Update for the chunk being moved

    // Version at which the next highest lastmod will be set. If the chunk being moved is the
    // last one on the shard, nextVersion is that chunk's lastmod; otherwise the highest version
    // is from the chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = uncommittedCollVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We
    // can figure that out by grabbing the metadata as it has been changed.
    if (_committedMetadata->getNumChunks() > 1) {
        ChunkType bumpChunk;
        invariant(_committedMetadata->getDifferentChunk(_args.getMinKey(), &bumpChunk));
        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();
        nextVersion.incMinor();

        dassert(bumpMin.woCompare(_args.getMinKey()) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _args.getFromShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        q.done();

        updates.append(op.obj());

        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _args.getNss().ns() << "'";
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _args.getNss().ns() << "'";
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeCommitMigration);

    Status applyOpsStatus = grid.catalogClient(txn)->applyChunkOpsDeprecated(
        txn, updates.arr(), preCond.arr(), _args.getNss().ns(), nextVersion);

    if (MONGO_FAIL_POINT(failCommitMigrationCommand)) {
        applyOpsStatus = Status(ErrorCodes::InternalError,
                                "Failpoint 'failCommitMigrationCommand' generated error");
    }

    if (applyOpsStatus.isOK()) {
        // Now that applyOps succeeded and the new collection version is committed, update the
        // collection metadata to the new collection version and forget the migrated chunk.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        ChunkType migratingChunkToForget;
        migratingChunkToForget.setMin(_args.getMinKey());
        migratingChunkToForget.setMax(_args.getMaxKey());
        _committedMetadata =
            _committedMetadata->cloneMigrate(migratingChunkToForget, uncommittedCollVersion);
        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        css->setMetadata(_committedMetadata);
    } else {
        // This could be an unrelated error (e.g. network error). Check whether the metadata
        // update succeeded by refreshing the collection metadata from the config server and
        // checking that the original chunks no longer exist.
        warning() << "Migration metadata commit may have failed: refreshing metadata to check"
                  << causedBy(applyOpsStatus);

        // Need to get the latest optime in case the refresh request goes to a secondary --
        // otherwise the read won't wait for the write that applyChunkOpsDeprecated may have
        // done.
        Status status = grid.catalogClient(txn)->logChange(
            txn,
            "moveChunk.validating",
            _args.getNss().ns(),
            BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey() << "from"
                       << _args.getFromShardId() << "to" << _args.getToShardId()));
        if (!status.isOK()) {
            fassertStatusOK(
                40137,
                {status.code(),
                 str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey() << ","
                               << _args.getMaxKey() << ") due to " << causedBy(applyOpsStatus)
                               << ", and updating the optime with a write before refreshing the "
                               << "metadata also failed: " << causedBy(status)});
        }

        ShardingState* const shardingState = ShardingState::get(txn);
        ChunkVersion shardVersion;
        Status refreshStatus =
            shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
        fassertStatusOK(34431,
                        {refreshStatus.code(),
                         str::stream() << "applyOps failed to commit chunk ["
                                       << _args.getMinKey() << "," << _args.getMaxKey()
                                       << ") due to " << causedBy(applyOpsStatus)
                                       << ", and refreshing collection metadata failed: "
                                       << causedBy(refreshStatus)});

        {
            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

            auto css = CollectionShardingState::get(txn, _args.getNss());
            std::shared_ptr<CollectionMetadata> refreshedMetadata = css->getMetadata();

            if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) {
                invariant(refreshedMetadata->getCollVersion() ==
                          _committedMetadata->getCollVersion());

                // After refresh, the collection metadata indicates that the donor shard still
                // owns the chunk, so no migration changes were written to the config server
                // metadata.
                return {applyOpsStatus.code(),
                        str::stream() << "Migration was not committed, applyOps failed: "
                                      << causedBy(applyOpsStatus)};
            }

            ChunkVersion refreshedCollectionVersion = refreshedMetadata->getCollVersion();
            if (!refreshedCollectionVersion.equals(nextVersion)) {
                // The refreshed collection metadata's collection version does not match the
                // control chunk's updated collection version, which should now be the highest.
                // The control chunk was not committed, but the migrated chunk was. This state
                // is not recoverable.
                fassertStatusOK(40138,
                                {applyOpsStatus.code(),
                                 str::stream() << "Migration was partially committed, state is "
                                               << "unrecoverable. applyOps error: "
                                               << causedBy(applyOpsStatus)});
            }
        }
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLeavingCriticalSection);

    scopedGuard.Dismiss();
    _cleanup(txn);

    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.commit",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max"
                                                  << _args.getMaxKey() << "from"
                                                  << _args.getFromShardId() << "to"
                                                  << _args.getToShardId()));

    return Status::OK();
}
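// ---------------------------------------------------------------------------
// Illustrative sketch (not MongoDB source). When the applyOps commit returns
// an error, the outcome on the config server is ambiguous: the error may be a
// network failure reported after the write actually applied. The recovery
// logic above refreshes the metadata and classifies the result into exactly
// three cases; the standalone model below encodes that decision, with
// hypothetical field names standing in for keyBelongsToMe() and the refreshed
// collection version.
// ---------------------------------------------------------------------------
#include <cassert>

namespace sketch {

enum class CommitOutcome {
    kNotCommitted,        // Donor still owns the chunk: safe to report failure.
    kFullyCommitted,      // Both chunk docs updated: proceed as success.
    kPartiallyCommitted,  // Migrated chunk committed but control chunk not: fatal.
};

struct RefreshedState {
    bool donorStillOwnsChunk;     // keyBelongsToMe(minKey) after the refresh.
    long long collectionVersion;  // Refreshed collection version.
};

CommitOutcome classifyAmbiguousCommit(const RefreshedState& s, long long nextVersion) {
    if (s.donorStillOwnsChunk)
        return CommitOutcome::kNotCommitted;  // Nothing was written.
    if (s.collectionVersion != nextVersion)
        return CommitOutcome::kPartiallyCommitted;  // Unrecoverable (fassert in the real code).
    return CommitOutcome::kFullyCommitted;  // The commit actually went through.
}

}  // namespace sketch

int main() {
    using namespace sketch;
    const long long nextVersion = 42;
    assert(classifyAmbiguousCommit({true, 41}, nextVersion) == CommitOutcome::kNotCommitted);
    assert(classifyAmbiguousCommit({false, 42}, nextVersion) == CommitOutcome::kFullyCommitted);
    assert(classifyAmbiguousCommit({false, 43}, nextVersion) ==
           CommitOutcome::kPartiallyCommitted);
}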