/**
 * Refreshes the routing information for 'nss' through the catalog cache and, if it is newer than
 * what the collection's sharding state currently holds, installs it as the collection metadata.
 *
 * Must be called with no locks held and not from a direct client (both are enforced by
 * invariants). Throws if the routing information cannot be obtained.
 *
 * Returns the shard version known after the refresh, or ChunkVersion::UNSHARDED() when the
 * routing information has no chunk manager.
 */
ChunkVersion forceShardFilteringMetadataRefresh(OperationContext* opCtx,
                                                const NamespaceString& nss,
                                                bool forceRefreshFromThisThread) {
    invariant(!opCtx->lockState()->isLocked());
    invariant(!opCtx->getClient()->isInDirectClient());

    auto const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->canAcceptShardedCommands());

    const auto routingInfo =
        uassertStatusOK(Grid::get(opCtx)->catalogCache()->getCollectionRoutingInfoWithRefresh(
            opCtx, nss, forceRefreshFromThisThread));
    const auto cm = routingInfo.cm();

    // Without a chunk manager the collection is treated as unsharded: clear the metadata.
    if (!cm) {
        // Changing the metadata requires the exclusive collection lock.
        AutoGetCollection exclusiveLock(opCtx, nss, MODE_IX, MODE_X);
        CollectionShardingState::get(opCtx, nss)->refreshMetadata(opCtx, nullptr);
        return ChunkVersion::UNSHARDED();
    }

    // Optimistic check under the shared lock: skip the exclusive lock entirely when the installed
    // metadata is from the same epoch and at least as recent as what was just fetched.
    {
        AutoGetCollection sharedLock(opCtx, nss, MODE_IS);
        auto installed = CollectionShardingState::get(opCtx, nss)->getMetadata(opCtx);

        const bool installedIsCurrent = installed &&
            installed->getCollVersion().epoch() == cm->getVersion().epoch() &&
            installed->getCollVersion() >= cm->getVersion();
        if (installedIsCurrent) {
            LOG(1) << "Skipping refresh of metadata for " << nss << " "
                   << installed->getCollVersion() << " with an older " << cm->getVersion();
            return installed->getShardVersion();
        }
    }

    // Changing the metadata requires the exclusive collection lock.
    AutoGetCollection exclusiveLock(opCtx, nss, MODE_IX, MODE_X);
    auto css = CollectionShardingState::get(opCtx, nss);

    // Repeat the check now that the exclusive lock is held; the metadata may have been replaced
    // in between the two lock acquisitions.
    auto installed = css->getMetadata(opCtx);
    const bool installedIsCurrent = installed &&
        installed->getCollVersion().epoch() == cm->getVersion().epoch() &&
        installed->getCollVersion() >= cm->getVersion();
    if (installedIsCurrent) {
        LOG(1) << "Skipping refresh of metadata for " << nss << " " << installed->getCollVersion()
               << " with an older " << cm->getVersion();
        return installed->getShardVersion();
    }

    auto refreshedMetadata =
        stdx::make_unique<CollectionMetadata>(cm, shardingState->getShardName());
    css->refreshMetadata(opCtx, std::move(refreshedMetadata));

    return css->getMetadata(opCtx)->getShardVersion();
}
/**
 * Moves the migration from the kCreated state into kCloning: logs the "moveChunk.start" change,
 * creates the clone driver, registers this manager on the collection's sharding state and starts
 * the clone.
 *
 * Must be called without locks held. On any non-OK return the scope guard runs cleanupOnError().
 */
Status MigrationSourceManager::startClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCreated);

    // Armed until the very end; any early exit triggers the cleanup.
    auto cleanupGuard = MakeGuard([&] { cleanupOnError(txn); });

    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.start",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max"
                                                  << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    _cloneDriver = stdx::make_unique<MigrationChunkClonerSourceLegacy>(
        _args, _committedMetadata->getKeyPattern());

    {
        // Make this manager visible through the collection's sharding state (the original note
        // describes this as registering for notifications from the replication subsystem).
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        CollectionShardingState::get(txn, _args.getNss().ns())
            ->setMigrationSourceManager(txn, this);
    }

    const Status cloneStartStatus = _cloneDriver->startClone(txn);
    if (!cloneStartStatus.isOK()) {
        return cloneStartStatus;
    }

    _state = kCloning;
    cleanupGuard.Dismiss();
    return Status::OK();
}
/**
 * Returns the status report of the migration source manager registered on the namespace of the
 * currently active moveChunk, or an empty BSONObj if there is no active moveChunk or no manager is
 * registered on that collection.
 */
BSONObj ActiveMigrationsRegistry::getActiveMigrationStatusReport(OperationContext* opCtx) {
    // Copy the namespace out under the registry mutex so that the mutex is not held while the
    // collection lock is acquired below.
    boost::optional<NamespaceString> activeNss;
    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);
        if (_activeMoveChunkState) {
            activeNss = _activeMoveChunkState->args.getNss();
        }
    }

    if (!activeNss) {
        return BSONObj();
    }

    // The state of the MigrationSourceManager may change between releasing the mutex above and
    // taking the collection lock here. That is acceptable: reporting is best effort and is mainly
    // desirable for diagnosing migrations that are stuck, not ones that just started or ended.
    //
    // Lock the collection so nothing changes while the report is being produced.
    AutoGetCollection autoColl(opCtx, *activeNss, MODE_IS);

    auto css = CollectionShardingState::get(opCtx, *activeNss);
    if (auto sourceManager = css->getMigrationSourceManager()) {
        return sourceManager->getMigrationStatusReport();
    }

    return BSONObj();
}
/**
 * Forwards each operation of a prepared or unprepared-committed transaction to the cloner of the
 * migration source manager registered on the operation's collection, if there is one.
 *
 * @param opCtx operation context used for locking and passed on to the cloner callbacks.
 * @param stmts the transaction's operations; they are only read, never copied or modified.
 */
void OpObserverShardingImpl::shardObserveTransactionPrepareOrUnpreparedCommit(
    OperationContext* opCtx, const std::vector<repl::ReplOperation>& stmts) {

    // Iterate by const reference: copying each ReplOperation (and its namespace) per iteration is
    // pure overhead since the statements are only inspected.
    for (const auto& stmt : stmts) {
        const auto& nss = stmt.getNss();

        AutoGetCollection autoColl(opCtx, nss, MODE_IS);

        auto csr = CollectionShardingRuntime::get(opCtx, nss);
        auto csrLock = CollectionShardingRuntime::CSRLock::lock(opCtx, csr);
        auto msm = MigrationSourceManager::get(csr, csrLock);
        if (!msm) {
            // No migration source manager on this collection, nothing to notify.
            continue;
        }

        auto const opType = stmt.getOpType();

        // We pass an empty opTime to observers because retryable write history doesn't care about
        // writes in transactions.
        if (opType == repl::OpTypeEnum::kInsert) {
            msm->getCloner()->onInsertOp(opCtx, stmt.getObject(), {});
        } else if (opType == repl::OpTypeEnum::kUpdate) {
            // Updates without a second object are skipped.
            if (auto updateDoc = stmt.getObject2()) {
                msm->getCloner()->onUpdateOp(
                    opCtx, stmt.getPreImageDocumentKey(), *updateDoc, {}, {});
            }
        } else if (opType == repl::OpTypeEnum::kDelete) {
            // Deletes are only forwarded when isMigratingWithCSRLock() reports the document as
            // being part of the migration.
            if (isMigratingWithCSRLock(csr, csrLock, stmt.getObject())) {
                msm->getCloner()->onDeleteOp(
                    opCtx, getDocumentKey(opCtx, nss, stmt.getObject()), {}, {});
            }
        }
    }
}
/**
 * Queries the session transactions table with the query built by makeQuery() for the current
 * fast-clock time and hands the _id of every returned record to the handler.
 *
 * Returns 0 if the handler fails to initialize or if this node cannot currently accept writes for
 * the transactions table's database; otherwise returns whatever handler.finalize() reports.
 */
int reap(OperationContext* opCtx) override {
    auto const replCoord = mongo::repl::ReplicationCoordinator::get(opCtx);

    Handler handler(opCtx, *_collection);
    if (!handler.initialize()) {
        return 0;
    }

    const NamespaceString& txnTableNss = NamespaceString::kSessionTransactionsTableNamespace;
    AutoGetCollection autoColl(opCtx, txnTableNss, MODE_IS);

    // Only start reaping if the shard or config server node is currently the primary
    const bool canWrite = replCoord->canAcceptWritesForDatabase(opCtx, txnTableNss.db());
    if (!canWrite) {
        return 0;
    }

    DBDirectClient client(opCtx);

    auto query = makeQuery(opCtx->getServiceContext()->getFastClockSource()->now());
    auto cursor = client.query(txnTableNss, query, 0, 0, &kIdProjection);

    while (cursor->more()) {
        auto parsedRecord = SessionsCollectionFetchResultIndividualResult::parse(
            "TransactionSession"_sd, cursor->next());
        handler.handleLsid(parsedRecord.get_id());
    }

    // Before the handler goes out of scope, flush its last batch to disk and collect stats.
    return handler.finalize();
}
/**
 * Tears the migration down from any state other than kDone: unregisters this manager from the
 * collection's sharding state, signals the critical section (if it was entered), ends the
 * metadata operation started for the critical section, cancels the clone driver and finally
 * moves to kDone.
 */
void MigrationSourceManager::_cleanup(OperationContext* txn) {
    invariant(_state != kDone);

    // _state is only reassigned at the very end of this function, so evaluate this once.
    const bool wasInCriticalSection = (_state == kCriticalSection);

    {
        // Unregister from the collection's sharding state
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        // The migration source manager is not visible anymore after it is unregistered from the
        // collection
        CollectionShardingState::get(txn, _args.getNss().ns())->clearMigrationSourceManager(txn);

        // Leave the critical section.
        if (wasInCriticalSection) {
            _critSecSignal->set();
        }
    }

    // Decrement the metadata op counter outside of the collection lock in order to hold the lock
    // for as short as possible.
    if (wasInCriticalSection) {
        ShardingStateRecovery::endMetadataOp(txn);
    }

    if (_cloneDriver) {
        _cloneDriver->cancelClone(txn);
        _cloneDriver.reset();
    }

    _state = kDone;
}
/**
 * Runs sharding state recovery on a shard server. It reads the recovery document and, if the
 * document records metadata change operations in flight, performs a majority logging write
 * against the config server to learn the most recent config opTime, then clears the document.
 *
 * Returns OK when this node is not a shard server, when no recovery document exists, or when
 * recovery completes (a failure to clear the document is only logged). Returns the failing status
 * if reading or parsing the document, or the logging write, fails.
 */
Status ShardingStateRecovery::recover(OperationContext* opCtx) {
    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
        return Status::OK();
    }

    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(opCtx, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                opCtx, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK())
        return recoveryDocStatus.getStatus();

    // Bind by reference: recoveryDocStatus is const, so a std::move() of its value would silently
    // degrade into a copy. The StatusWith outlives every use of the document below.
    const auto& recoveryDoc = recoveryDocStatus.getValue();

    log() << "Sharding state recovery process found document " << redact(recoveryDoc.toBSON());

    ShardingState* const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->enabled());

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date
        grid.advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write
    Status status =
        grid.catalogClient(opCtx)->logChange(opCtx,
                                             "Sharding minOpTime recovery",
                                             NamespaceString::kConfigCollectionNamespace.ns(),
                                             recoveryDocBSON,
                                             ShardingCatalogClient::kMajorityWriteConcern);
    if (!status.isOK())
        return status;

    log() << "Sharding state recovered. New config server opTime is " << grid.configOpTime();

    // Finally, clear the recovery document so next time we don't need to recover
    status = modifyRecoveryDocument(opCtx, RecoveryDocument::Clear, kLocalWriteConcern);
    if (!status.isOK()) {
        // Best effort only: the recovery itself has already succeeded at this point.
        warning() << "Failed to reset sharding state recovery document due to " << redact(status);
    }

    return Status::OK();
}
/**
 * Runs sharding state recovery. It reads the recovery document, initializes the sharding state
 * from it and, if the document records metadata change operations in flight, performs a logging
 * write against the config server to learn the most recent config opTime, then clears the
 * document.
 *
 * Returns OK when no recovery document exists or when recovery completes (a failure to clear the
 * document is only logged). Returns the failing status if reading or parsing the document, or the
 * logging write, fails.
 */
Status ShardingStateRecovery::recover(OperationContext* txn) {
    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(txn, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                txn, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK())
        return recoveryDocStatus.getStatus();

    // Bind by reference: recoveryDocStatus is const, so a std::move() of its value would silently
    // degrade into a copy. The StatusWith outlives every use of the document below.
    const auto& recoveryDoc = recoveryDocStatus.getValue();

    log() << "Sharding state recovery process found document " << recoveryDoc.toBSON();

    // Make sure the sharding state is initialized
    ShardingState* const shardingState = ShardingState::get(txn);

    shardingState->initialize(txn, recoveryDoc.getConfigsvr().toString());
    shardingState->setShardName(recoveryDoc.getShardName());

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date
        grid.shardRegistry()->advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write
    Status status =
        grid.catalogManager(txn)->logChange(txn,
                                            "Sharding recovery thread",
                                            "Sharding minOpTime recovery",
                                            NamespaceString::kConfigCollectionNamespace.ns(),
                                            recoveryDocBSON);
    if (!status.isOK())
        return status;

    log() << "Sharding state recovered. New config server opTime is "
          << grid.shardRegistry()->getConfigOpTime();

    // Finally, clear the recovery document so next time we don't need to recover
    status = modifyRecoveryDocument(txn, RecoveryDocument::Clear, kMajorityWriteConcern);
    if (!status.isOK()) {
        // Best effort only: the recovery itself has already succeeded at this point.
        warning() << "Failed to reset sharding state recovery document due to " << status;
    }

    return Status::OK();
}
/**
 * Returns the UUID of the session transactions table, or boost::none if that collection does not
 * exist.
 */
boost::optional<UUID> MongoDSessionCatalog::getTransactionTableUUID(OperationContext* opCtx) {
    AutoGetCollection autoColl(opCtx, NamespaceString::kSessionTransactionsTableNamespace, MODE_IS);

    if (const auto txnTable = autoColl.getCollection()) {
        return txnTable->uuid();
    }

    return boost::none;
}
/**
 * Fetches the last document of collection 'ns' using a backward collection scan.
 *
 * @param txn    operation context.
 * @param ns     full namespace of the collection to scan.
 * @param result receives an owned copy of the document when one is found.
 * @return true if the scan produced a document, false otherwise.
 */
bool Helpers::getLast(OperationContext* txn, const char* ns, BSONObj& result) {
    AutoGetCollectionForRead autoColl(txn, ns);
    auto_ptr<PlanExecutor> exec(InternalPlanner::collectionScan(
        txn, ns, autoColl.getCollection(), InternalPlanner::BACKWARD));

    PlanExecutor::ExecState state = exec->getNext(&result, NULL);
    if (PlanExecutor::ADVANCED == state) {
        // Hand the caller an owned copy, as the other getLast() implementations do; the read lock
        // and the executor are both gone once this function returns.
        // NOTE(review): presumably the unowned object could reference storage that is only
        // guaranteed valid under the lock - confirm against PlanExecutor::getNext().
        result = result.getOwned();
        return true;
    }

    return false;
}
/**
 * Test helper: inserts the BSON form of 'entry' into the oplog collection and asserts that the
 * collection exists and that the insert succeeds.
 */
void MockReplCoordServerFixture::insertOplogEntry(const repl::OplogEntry& entry) {
    AutoGetCollection oplogLock(opCtx(), NamespaceString::kRsOplogNamespace, MODE_IX);

    auto oplog = oplogLock.getCollection();
    ASSERT_TRUE(oplog != nullptr);

    const bool fromMigrate = false;
    auto insertStatus = oplog->insertDocument(
        opCtx(), InsertStatement(entry.toBSON()), &CurOp::get(opCtx())->debug(), fromMigrate);
    ASSERT_OK(insertStatus);
}
/**
 * Fetches the last document of collection 'ns' using a backward, manually-yielding collection
 * scan. On success 'result' holds an owned copy of the document and true is returned; otherwise
 * false is returned.
 */
bool Helpers::getLast(OperationContext* txn, const char* ns, BSONObj& result) {
    AutoGetCollectionForRead readLock(txn, ns);

    unique_ptr<PlanExecutor> reverseScan(
        InternalPlanner::collectionScan(txn,
                                        ns,
                                        readLock.getCollection(),
                                        PlanExecutor::YIELD_MANUAL,
                                        InternalPlanner::BACKWARD));

    // Anything other than ADVANCED means no document was produced.
    if (reverseScan->getNext(&result, nullptr) != PlanExecutor::ADVANCED) {
        return false;
    }

    result = result.getOwned();
    return true;
}
/**
 * Reports whether the last entry of the startup log was written by a 3.2.x binary with x <= 4.
 *
 * Background: because of SERVER-23274, MongoDB 3.2.0 through 3.2.4 incorrectly mark the final
 * output collections of $out aggregations as temporary on most replica set secondaries. Newer
 * nodes clear the temp flags left behind by those versions instead of risking the deletion of
 * collections the user never meant to be temporary when they start up or become primary.
 *
 * The caller must already hold the global X lock (enforced by an invariant).
 */
bool isSubjectToSERVER23299(OperationContext* txn) {
    // We are already called under global X lock as part of the startup sequence
    invariant(txn->lockState()->isW());

    if (storageGlobalParams.readOnly) {
        return false;
    }

    // Ensure that the local database is open since we are still early in the server startup
    // sequence
    dbHolder().openDb(txn, startupLogCollectionName.db());

    // Only used as a shortcut to obtain a reference to the startup log collection
    AutoGetCollection startupLog(txn, startupLogCollectionName, MODE_IS);

    // A missing or empty startup log means either that no affected version was ever run on this
    // node, or that the startup collection was manually deleted since an affected version last
    // started.
    LOG(1) << "Checking node for SERVER-23299 eligibility";
    if (!startupLog.getCollection()) {
        LOG(1) << "Didn't find " << startupLogCollectionName;
        return false;
    }

    LOG(1) << "Checking node for SERVER-23299 applicability - reading startup log";
    BSONObj lastEntry;
    if (!Helpers::getLast(txn, startupLogCollectionName.ns().c_str(), lastEntry)) {
        return false;
    }

    // Extract buildinfo.versionArray; any malformed entry is logged and treated as "not affected".
    std::vector<int> version;
    try {
        for (auto elem : lastEntry["buildinfo"]["versionArray"].Obj()) {
            version.push_back(elem.Int());
        }
        uassert(40050,
                str::stream() << "Expected three elements in buildinfo.versionArray; found "
                              << version.size(),
                version.size() >= 3);
    } catch (const DBException& ex) {
        log() << "Last entry of " << startupLogCollectionName
              << " has no well-formed buildinfo.versionArray field; ignoring " << causedBy(ex);
        return false;
    }

    LOG(1)
        << "Checking node for SERVER-23299 applicability - checking version 3.2.x for x in [0, 4]";
    const bool isAffectedVersion = version[0] == 3 && version[1] == 2 && version[2] <= 4;
    if (!isAffectedVersion) {
        return false;
    }

    LOG(1) << "Node eligible for SERVER-23299";
    return true;
}
void DocumentSourceCursor::loadBatch() { if (!_exec) { dispose(); return; } // We have already validated the sharding version when we constructed the PlanExecutor // so we shouldn't check it again. const NamespaceString nss(_ns); AutoGetCollectionForRead autoColl(pExpCtx->opCtx, nss); _exec->restoreState(pExpCtx->opCtx); int memUsageBytes = 0; BSONObj obj; PlanExecutor::ExecState state; while ((state = _exec->getNext(&obj, NULL)) == PlanExecutor::ADVANCED) { if (_dependencies) { _currentBatch.push_back(_dependencies->extractFields(obj)); } else { _currentBatch.push_back(Document::fromBsonWithMetaData(obj)); } if (_limit) { if (++_docsAddedToBatches == _limit->getLimit()) { break; } verify(_docsAddedToBatches < _limit->getLimit()); } memUsageBytes += _currentBatch.back().getApproximateSize(); if (memUsageBytes > MaxBytesToReturnToClientAtOnce) { // End this batch and prepare PlanExecutor for yielding. _exec->saveState(); return; } } // If we got here, there won't be any more documents, so destroy the executor. Can't use // dispose since we want to keep the _currentBatch. _exec.reset(); uassert(16028, "collection or index disappeared when cursor yielded", state != PlanExecutor::DEAD); uassert(17285, "cursor encountered an error: " + WorkingSetCommon::toStatusString(obj), state != PlanExecutor::EXEC_ERROR); massert(17286, str::stream() << "Unexpected return from PlanExecutor::getNext: " << state, state == PlanExecutor::IS_EOF || state == PlanExecutor::ADVANCED); }
/**
 * Marks the clone as completed and then, while holding the collection lock in MODE_IS, releases
 * the delete-notification executor if one exists.
 */
void MigrationChunkClonerSourceLegacy::_cleanup(OperationContext* txn) {
    // The completion flag is guarded by _mutex; keep the critical section as small as possible.
    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);
        _cloneCompleted = true;
    }

    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

    if (!_deleteNotifyExec) {
        return;
    }

    // Released while the collection lock taken above is still held.
    _deleteNotifyExec.reset();
}
/**
 * Shared body for the "rollback resyncs collection options" tests.
 *
 * Creates the collection "test.<collName>" with 'localCollOptions', rolls back a single command
 * operation 'collModCmd' against a rollback source that reports 'remoteCollOptionsObj', and then
 * asserts that the collection's options equal the remote options (preceded by the local UUID
 * when the local options carry one).
 */
void RollbackResyncsCollectionOptionsTest::resyncCollectionOptionsTest(
    CollectionOptions localCollOptions,
    BSONObj remoteCollOptionsObj,
    BSONObj collModCmd,
    std::string collName) {
    createOplog(_opCtx.get());

    const NamespaceString nss("test", collName);
    auto coll = _createCollection(_opCtx.get(), nss.toString(), localCollOptions);

    // The common point shared by the local and the remote oplog.
    auto commonOpUuid = unittest::assertGet(UUID::parse("f005ba11-cafe-bead-f00d-123456789abc"));
    auto commonOpBson = BSON("ts" << Timestamp(1, 1) << "t" << 1LL << "op"
                                  << "n"
                                  << "o"
                                  << BSONObj()
                                  << "ns"
                                  << "rollback_test.test"
                                  << "ui"
                                  << commonOpUuid);
    auto commonOperation = std::make_pair(commonOpBson, RecordId(1));

    // The local-only operation that the rollback has to undo.
    auto collModOperation =
        makeCommandOp(Timestamp(Seconds(2), 0), coll->uuid(), nss.toString(), collModCmd, 2);

    RollbackSourceWithCollectionOptions rollbackSource(
        std::unique_ptr<OplogInterface>(new OplogInterfaceMock({commonOperation})),
        remoteCollOptionsObj);

    ASSERT_OK(syncRollback(_opCtx.get(),
                           OplogInterfaceMock({collModOperation, commonOperation}),
                           rollbackSource,
                           {},
                           _coordinator,
                           _replicationProcess.get()));

    // Make sure the collection options are correct.
    AutoGetCollectionForReadCommand autoColl(_opCtx.get(), NamespaceString(nss.toString()));
    auto optionsAfterRollback =
        autoColl.getCollection()->getCatalogEntry()->getCollectionOptions(_opCtx.get());

    BSONObjBuilder expected;
    if (localCollOptions.uuid) {
        localCollOptions.uuid.get().appendToBuilder(&expected, "uuid");
    }
    expected.appendElements(remoteCollOptionsObj);

    ASSERT_BSONOBJ_EQ(expected.obj(), optionsAfterRollback.toBSON());
}
/**
 * Handles a shard version mismatch for 'nss'. It waits on the migration critical section signal
 * and then, unless the locally known shard version is from the same epoch and has a major
 * version at least as high as 'shardVersionReceived', forces a refresh of the filtering
 * metadata.
 *
 * Must be called with no locks held and not from a direct client (enforced by invariants).
 *
 * @param opCtx                      operation context.
 * @param nss                        namespace whose metadata is stale.
 * @param shardVersionReceived       the shard version the caller was expecting.
 * @param forceRefreshFromThisThread forwarded to forceShardFilteringMetadataRefresh().
 * @return OK if no refresh was needed or the refresh succeeded; otherwise the status of the
 *         exception raised while waiting or refreshing. Never throws.
 */
Status onShardVersionMismatch(OperationContext* opCtx,
                              const NamespaceString& nss,
                              ChunkVersion shardVersionReceived,
                              bool forceRefreshFromThisThread) noexcept {
    invariant(!opCtx->lockState()->isLocked());
    invariant(!opCtx->getClient()->isInDirectClient());

    auto const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->canAcceptShardedCommands());

    LOG(2) << "Metadata refresh requested for " << nss.ns() << " at shard version "
           << shardVersionReceived;

    ShardingStatistics::get(opCtx).countStaleConfigErrors.addAndFetch(1);

    // Ensure any ongoing migrations have completed before trying to do the refresh. This wait is
    // just an optimization so that MongoS does not exhaust its maximum number of StaleConfig retry
    // attempts while the migration is being committed.
    try {
        auto& oss = OperationShardingState::get(opCtx);
        oss.waitForMigrationCriticalSectionSignal(opCtx);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    // Read the currently installed shard version under the collection lock; no metadata means
    // the collection is treated as unsharded.
    const auto currentShardVersion = [&] {
        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
        const auto currentMetadata = CollectionShardingState::get(opCtx, nss)->getMetadata(opCtx);
        if (currentMetadata) {
            return currentMetadata->getShardVersion();
        }

        return ChunkVersion::UNSHARDED();
    }();

    if (currentShardVersion.epoch() == shardVersionReceived.epoch() &&
        currentShardVersion.majorVersion() >= shardVersionReceived.majorVersion()) {
        // Don't need to remotely reload if we're in the same epoch and the requested version is
        // smaller than the one we know about. This means that the remote side is behind.
        return Status::OK();
    }

    try {
        forceShardFilteringMetadataRefresh(opCtx, nss, forceRefreshFromThisThread);
        return Status::OK();
    } catch (const DBException& ex) {
        // The trailing space keeps the namespace from being fused to the word "collection".
        log() << "Failed to refresh metadata for collection " << nss << causedBy(redact(ex));
        return ex.toStatus();
    }
}
bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { BSONElement first = cmdObj.firstElement(); uassert( 28528, str::stream() << "Argument to listIndexes must be of type String, not " << typeName(first.type()), first.type() == String); const NamespaceString ns(parseNs(dbname, cmdObj)); uassert( 28529, str::stream() << "Argument to listIndexes must be a collection name, " << "not the empty string", !ns.coll().empty()); AutoGetCollectionForRead autoColl(txn, ns); if (!autoColl.getDb()) { return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, "no database" ) ); } const Collection* collection = autoColl.getCollection(); if (!collection) { return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, "no collection" ) ); } const CollectionCatalogEntry* cce = collection->getCatalogEntry(); invariant(cce); vector<string> indexNames; cce->getAllIndexes( txn, &indexNames ); BSONArrayBuilder arr; for ( size_t i = 0; i < indexNames.size(); i++ ) { arr.append( cce->getIndexSpec( txn, indexNames[i] ) ); } result.append( "indexes", arr.arr() ); return true; }
/**
 * Fetches the last document of collection 'ns' using a backward, non-yielding collection scan.
 * On success 'result' holds an owned copy of the document and true is returned; false is
 * returned when the scan reaches EOF without producing a document.
 */
bool Helpers::getLast(OperationContext* opCtx, const char* ns, BSONObj& result) {
    AutoGetCollectionForReadCommand readLock(opCtx, NamespaceString(ns));

    auto reverseScan = InternalPlanner::collectionScan(
        opCtx, ns, readLock.getCollection(), PlanExecutor::NO_YIELD, InternalPlanner::BACKWARD);

    const PlanExecutor::ExecState state = reverseScan->getNext(&result, nullptr);

    // Non-yielding collection scans from InternalPlanner will never error.
    invariant(PlanExecutor::ADVANCED == state || PlanExecutor::IS_EOF == state);

    if (state != PlanExecutor::ADVANCED) {
        return false;
    }

    result = result.getOwned();
    return true;
}
/**
 * Drops indexes from collection 'nss' as described by 'cmdObj', retrying on write conflicts.
 *
 * Returns NotMaster when writes are replicated but this node cannot accept writes for 'nss',
 * CommandNotSupportedOnView when 'nss' is a view, NamespaceNotFound when the collection does not
 * exist, or the status produced by wrappedRun(). The write unit of work is committed only when
 * wrappedRun() succeeds.
 */
Status dropIndexes(OperationContext* opCtx,
                   const NamespaceString& nss,
                   const BSONObj& cmdObj,
                   BSONObjBuilder* result) {
    return writeConflictRetry(opCtx, "dropIndexes", nss.db(), [opCtx, &nss, &cmdObj, result] {
        AutoGetCollection autoColl(opCtx, nss, MODE_IX, MODE_X);

        // Replicated (user-initiated) writes are only allowed while this node can accept writes
        // for the namespace.
        const bool notPrimaryForReplicatedWrite = opCtx->writesAreReplicated() &&
            !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, nss);
        if (notPrimaryForReplicatedWrite) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping indexes in " << nss);
        }

        if (!serverGlobalParams.quiet.load()) {
            LOG(0) << "CMD: dropIndexes " << nss << ": " << cmdObj[kIndexFieldName].toString(false);
        }

        // If db/collection does not exist, short circuit and return.
        Database* const db = autoColl.getDb();
        Collection* const collection = autoColl.getCollection();
        if (!collection) {
            const bool isView = db && ViewCatalog::get(db)->lookup(opCtx, nss.ns());
            if (isView) {
                return Status(ErrorCodes::CommandNotSupportedOnView,
                              str::stream() << "Cannot drop indexes on view " << nss);
            }

            return Status(ErrorCodes::NamespaceNotFound, "ns not found");
        }

        // Both calls assert that nothing is building in the background on this collection.
        BackgroundOperation::assertNoBgOpInProgForNs(nss);
        IndexBuildsCoordinator::get(opCtx)->assertNoIndexBuildInProgForCollection(
            collection->uuid().get());

        WriteUnitOfWork wunit(opCtx);
        OldClientContext ctx(opCtx, nss.ns());

        Status runStatus = wrappedRun(opCtx, collection, cmdObj, result);
        if (!runStatus.isOK()) {
            // Leaving without commit() abandons the unit of work.
            return runStatus;
        }

        wunit.commit();
        return Status::OK();
    });
}
/**
 * Serializes this stage for explain output only; returns an empty Value when 'explain' is false.
 *
 * The explain document contains the query, the optional sort / limit / fields, and either the
 * "queryPlanner" section obtained from the underlying PlanExecutor or a "planError" string when
 * the planner-level explain failed.
 */
Value DocumentSourceCursor::serialize(bool explain) const {
    // we never parse a documentSourceCursor, so we only serialize for explain
    if (!explain) {
        return Value();
    }

    // Get planner-level explain info from the underlying PlanExecutor.
    BSONObjBuilder plannerExplain;
    Status plannerStatus(ErrorCodes::InternalError, "");
    {
        const NamespaceString nss(_ns);
        AutoGetCollectionForRead autoColl(pExpCtx->opCtx, nss);

        massert(17392, "No _exec. Were we disposed before explained?", _exec);

        // The executor is restored only for the duration of the explain and saved again after.
        _exec->restoreState(pExpCtx->opCtx);
        plannerStatus = Explain::explainStages(
            pExpCtx->opCtx, _exec.get(), ExplainCommon::QUERY_PLANNER, &plannerExplain);
        _exec->saveState();
    }

    MutableDocument out;
    out["query"] = Value(_query);

    if (!_sort.isEmpty()) {
        out["sort"] = Value(_sort);
    }

    if (_limit) {
        out["limit"] = Value(_limit->getLimit());
    }

    if (!_projection.isEmpty()) {
        out["fields"] = Value(_projection);
    }

    // Add explain results from the query system into the agg explain output.
    if (!plannerStatus.isOK()) {
        out["planError"] = Value(plannerStatus.toString());
    } else {
        BSONObj plannerObj = plannerExplain.obj();
        invariant(plannerObj.hasField("queryPlanner"));
        out["queryPlanner"] = Value(plannerObj["queryPlanner"]);
    }

    return Value(DOC(getSourceName() << out.freezeToValue()));
}
/**
 * Reports whether the last entry of the startup log was written by a 3.2.x binary with x <= 4.
 *
 * Background: because of SERVER-23274, MongoDB 3.2.0 through 3.2.4 incorrectly mark the final
 * output collections of $out aggregations as temporary on most replica set secondaries. Newer
 * nodes clear the temp flags left behind by those versions instead of risking the deletion of
 * collections the user never meant to be temporary when they start up or become primary.
 */
bool isSubjectToSERVER23299(OperationContext* txn) {
    if (storageGlobalParams.readOnly) {
        return false;
    }

    dbHolder().openDb(txn, startupLogCollectionName.db());
    AutoGetCollectionForRead startupLog(txn, startupLogCollectionName);

    // A missing or empty startup log means either that no affected version was ever run on this
    // node, or that the startup collection was manually deleted since an affected version last
    // started.
    LOG(1) << "Checking node for SERVER-23299 eligibility";
    if (!startupLog.getCollection()) {
        LOG(1) << "Didn't find " << startupLogCollectionName;
        return false;
    }

    LOG(1) << "Checking node for SERVER-23299 applicability - reading startup log";
    BSONObj lastEntry;
    if (!Helpers::getLast(txn, startupLogCollectionName.ns().c_str(), lastEntry)) {
        return false;
    }

    // Extract buildinfo.versionArray; any malformed entry is logged and treated as "not affected".
    std::vector<int> version;
    try {
        for (auto elem : lastEntry["buildinfo"]["versionArray"].Obj()) {
            version.push_back(elem.Int());
        }
        uassert(40050,
                str::stream() << "Expected three elements in buildinfo.versionArray; found "
                              << version.size(),
                version.size() >= 3);
    } catch (const DBException& ex) {
        log() << "Last entry of " << startupLogCollectionName
              << " has no well-formed buildinfo.versionArray field; ignoring " << causedBy(ex);
        return false;
    }

    LOG(1)
        << "Checking node for SERVER-23299 applicability - checking version 3.2.x for x in [0, 4]";
    const bool isAffectedVersion = version[0] == 3 && version[1] == 2 && version[2] <= 4;
    if (!isAffectedVersion) {
        return false;
    }

    LOG(1) << "Node eligible for SERVER-23299";
    return true;
}
/**
 * Builds the list of session oplog iterators from the session transactions table, writes an
 * internal "sessionMigrateCloneStart" op message to the oplog and waits for that write to become
 * majority committed. Throws if the majority wait fails.
 */
SessionCatalogMigrationSource::SessionCatalogMigrationSource(OperationContext* opCtx,
                                                             NamespaceString ns)
    : _ns(std::move(ns)),
      _rollbackIdAtInit(repl::ReplicationProcess::get(opCtx)->getRollbackID()) {
    // NOTE(review): the original comment here said "Exclude entries for transaction", but the
    // query is default-constructed and only a sort is applied - confirm whether a filter is
    // missing.
    Query query;
    // Sort is not needed for correctness. This is just for making it easier to write deterministic
    // tests.
    query.sort(BSON("_id" << 1));

    DBDirectClient client(opCtx);
    auto cursor = client.query(NamespaceString::kSessionTransactionsTableNamespace, query);

    while (cursor->more()) {
        auto sessionRecord = SessionTxnRecord::parse(
            IDLParserErrorContext("Session migration cloning"), cursor->next());

        // Records whose last write opTime is null get no iterator.
        if (sessionRecord.getLastWriteOpTime().isNull()) {
            continue;
        }

        _sessionOplogIterators.push_back(
            stdx::make_unique<SessionOplogIterator>(std::move(sessionRecord), _rollbackIdAtInit));
    }

    {
        // Write the op message that acts as the majority commit barrier (see the retry label).
        AutoGetCollection oplogLock(opCtx, NamespaceString::kRsOplogNamespace, MODE_IX);
        writeConflictRetry(
            opCtx,
            "session migration initialization majority commit barrier",
            NamespaceString::kRsOplogNamespace.ns(),
            [&] {
                const auto message = BSON("sessionMigrateCloneStart" << _ns.ns());

                WriteUnitOfWork wuow(opCtx);
                opCtx->getClient()->getServiceContext()->getOpObserver()->onInternalOpMessage(
                    opCtx, _ns, {}, {}, message);
                wuow.commit();
            });
    }

    // Wait until this client's last operation is majority committed.
    auto lastOpTime = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();

    WriteConcernResult wcResult;
    WriteConcernOptions majorityWC(
        WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, 0);
    uassertStatusOK(waitForWriteConcern(opCtx, lastOpTime, majorityWC, &wcResult));
}
/**
 * Blocks until no cleanup is tracked any more for 'orphanRange' of collection 'nss'.
 *
 * Returns OK once trackOrphanedDataCleanup() reports nothing scheduled for the range,
 * StaleShardVersion if the collection has no active metadata or its epoch differs from 'epoch',
 * or the failed wait status with added context.
 */
Status CollectionShardingState::waitForClean(OperationContext* opCtx,
                                             const NamespaceString& nss,
                                             OID const& epoch,
                                             ChunkRange orphanRange) {
    for (;;) {
        boost::optional<CleanupNotification> pendingCleanup;

        {
            AutoGetCollection autoColl(opCtx, nss, MODE_IX);
            auto css = CollectionShardingState::get(opCtx, nss);

            {
                // First, see if collection was dropped, but do it in a separate scope in order to
                // not hold reference on it, which would make it appear in use
                auto activeMetadata =
                    css->_metadataManager->getActiveMetadata(css->_metadataManager);
                const bool sameCollection =
                    activeMetadata && activeMetadata->getCollVersion().epoch() == epoch;
                if (!sameCollection) {
                    return {ErrorCodes::StaleShardVersion, "Collection being migrated was dropped"};
                }
            }

            pendingCleanup = css->trackOrphanedDataCleanup(orphanRange);
            if (!pendingCleanup) {
                log() << "Finished deleting " << nss.ns() << " range "
                      << redact(orphanRange.toString());
                return Status::OK();
            }
        }

        // The collection lock has been released by now; wait for the tracked cleanup and loop.
        log() << "Waiting for deletion of " << nss.ns() << " range " << orphanRange;

        Status waitResult = pendingCleanup->waitStatus(opCtx);
        if (!waitResult.isOK()) {
            return waitResult.withContext(str::stream() << "Failed to delete orphaned " << nss.ns()
                                                        << " range "
                                                        << orphanRange.toString());
        }
    }

    MONGO_UNREACHABLE;
}
/**
 * Moves the migration from kCloneCaughtUp into kCriticalSection.
 *
 * Starts a metadata operation for crash recovery, verifies under the exclusive collection lock
 * that the collection's metadata still matches the committed metadata, and creates the critical
 * section signal.
 *
 * Must be called without locks held. On any non-OK return the scope guard runs cleanupOnError().
 *
 * @return OK on success; the status of ShardingStateRecovery::startMetadataOp() if that fails;
 *         IncompatibleShardingMetadata if the collection metadata is missing or its collection
 *         version differs from the committed one.
 */
Status MigrationSourceManager::enterCriticalSection(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCloneCaughtUp);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Mark the shard as running critical operation, which requires recovery on crash
    Status status = ShardingStateRecovery::startMetadataOp(txn);
    if (!status.isOK()) {
        return status;
    }

    {
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());

        // Fetch the metadata once and keep it for the whole check. The metadata may be absent, in
        // which case it must not be dereferenced while building the error message.
        auto metadata = css->getMetadata();
        if (!metadata ||
            !metadata->getCollVersion().equals(_committedMetadata->getCollVersion())) {
            const std::string actualVersion =
                metadata ? metadata->getCollVersion().toString() : "unsharded";
            return {ErrorCodes::IncompatibleShardingMetadata,
                    str::stream()
                        << "Sharding metadata changed while holding distributed lock. Expected: "
                        << _committedMetadata->getCollVersion().toString()
                        << ", actual: "
                        << actualVersion};
        }

        // IMPORTANT: After this line, the critical section is in place and needs to be rolled back
        // if anything fails, which would prevent commit to the config servers.
        _critSecSignal = std::make_shared<Notification<void>>();
    }

    log() << "Successfully entered critical section.";

    _state = kCriticalSection;
    scopedGuard.Dismiss();
    return Status::OK();
}
/**
 * Blocks until no cleanup is tracked any more for 'orphanRange' of collection 'nss'.
 *
 * Returns OK once trackOrphanedDataCleanup() reports nothing scheduled for the range,
 * StaleShardVersion if the collection has no active metadata or its epoch differs from 'epoch',
 * or a status carrying the failed wait's code and a message prefixed with the range.
 */
/* static */
Status CollectionShardingState::waitForClean(OperationContext* opCtx,
                                             NamespaceString nss,
                                             OID const& epoch,
                                             ChunkRange orphanRange) {
    while (true) {
        auto pendingCleanup = boost::optional<CleanupNotification>();

        {
            AutoGetCollection autoColl(opCtx, nss, MODE_IX);
            auto css = CollectionShardingState::get(opCtx, nss);

            // First, see if collection was dropped. The metadata is inspected in its own scope
            // so that it is released again right after the check.
            {
                auto activeMetadata =
                    css->_metadataManager->getActiveMetadata(css->_metadataManager);
                const bool sameCollection =
                    activeMetadata && activeMetadata->getCollVersion().epoch() == epoch;
                if (!sameCollection) {
                    return {ErrorCodes::StaleShardVersion, "Collection being migrated was dropped"};
                }
            }

            pendingCleanup = css->trackOrphanedDataCleanup(orphanRange);
            if (!pendingCleanup) {
                log() << "Finished deleting " << nss.ns() << " range "
                      << redact(orphanRange.toString());
                return Status::OK();
            }
        }

        // The collection lock has been dropped by now; wait for the tracked cleanup and loop.
        log() << "Waiting for deletion of " << nss.ns() << " range " << orphanRange;

        Status waitResult = pendingCleanup->waitStatus(opCtx);
        if (!waitResult.isOK()) {
            return Status{waitResult.code(),
                          str::stream() << "Failed to delete orphaned " << nss.ns() << " range "
                                        << orphanRange.toString()
                                        << ": "
                                        << waitResult.reason()};
        }
    }

    MONGO_UNREACHABLE;
}
bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result) { BSONElement first = cmdObj.firstElement(); uassert(28528, str::stream() << "Argument to listIndexes must be of type String, not " << typeName(first.type()), first.type() == String); StringData collectionName = first.valueStringData(); uassert(28529, str::stream() << "Argument to listIndexes must be a collection name, " << "not the empty string", !collectionName.empty()); const NamespaceString ns(dbname, collectionName); const long long defaultBatchSize = std::numeric_limits<long long>::max(); long long batchSize; Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize); if (!parseCursorStatus.isOK()) { return appendCommandStatus(result, parseCursorStatus); } AutoGetCollectionForRead autoColl(txn, ns); if (!autoColl.getDb()) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no database")); } const Collection* collection = autoColl.getCollection(); if (!collection) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no collection")); } const CollectionCatalogEntry* cce = collection->getCatalogEntry(); invariant(cce); vector<string> indexNames; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexNames.clear(); cce->getAllIndexes(txn, &indexNames); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); std::unique_ptr<WorkingSet> ws(new WorkingSet()); std::unique_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get())); for (size_t i = 0; i < indexNames.size(); i++) { BSONObj indexSpec; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexSpec = cce->getIndexSpec(txn, indexNames[i]); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); WorkingSetID id = ws->allocate(); WorkingSetMember* member = ws->get(id); member->keyData.clear(); member->loc = RecordId(); member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned()); 
member->transitionToOwnedObj(); root->pushBack(id); } std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "." << ns.coll(); dassert(NamespaceString(cursorNamespace).isValid()); dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS()); dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes()); auto statusWithPlanExecutor = PlanExecutor::make( txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); BSONArrayBuilder firstBatch; const int byteLimit = MaxBytesToReturnToClientAtOnce; for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit; objCount++) { BSONObj next; PlanExecutor::ExecState state = exec->getNext(&next, NULL); if (state == PlanExecutor::IS_EOF) { break; } invariant(state == PlanExecutor::ADVANCED); firstBatch.append(next); } CursorId cursorId = 0LL; if (!exec->isEOF()) { exec->saveState(); ClientCursor* cursor = new ClientCursor( CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace); cursorId = cursor->cursorid(); } appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result); return true; }
bool ShardingInitializationMongoD::initializeShardingAwarenessIfNeeded(OperationContext* opCtx) { invariant(!opCtx->lockState()->isLocked()); // In sharded readOnly mode, we ignore the shardIdentity document on disk and instead *require* // a shardIdentity document to be passed through --overrideShardIdentity if (storageGlobalParams.readOnly) { if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) { uassert(ErrorCodes::InvalidOptions, "If started with --shardsvr in queryableBackupMode, a shardIdentity document " "must be provided through --overrideShardIdentity", !serverGlobalParams.overrideShardIdentity.isEmpty()); auto overrideShardIdentity = uassertStatusOK(ShardIdentityType::fromShardIdentityDocument( serverGlobalParams.overrideShardIdentity)); { // Global lock is required to call initializeFromShardIdentity Lock::GlobalWrite lk(opCtx); initializeFromShardIdentity(opCtx, overrideShardIdentity); } return true; } else { // Error if --overrideShardIdentity is used but *not* started with --shardsvr uassert(ErrorCodes::InvalidOptions, str::stream() << "Not started with --shardsvr, but a shardIdentity document was provided " "through --overrideShardIdentity: " << serverGlobalParams.overrideShardIdentity, serverGlobalParams.overrideShardIdentity.isEmpty()); return false; } MONGO_UNREACHABLE; } // In sharded *non*-readOnly mode, error if --overrideShardIdentity is provided uassert(ErrorCodes::InvalidOptions, str::stream() << "--overrideShardIdentity is only allowed in sharded " "queryableBackupMode. 
If not in queryableBackupMode, you can edit " "the shardIdentity document by starting the server *without* " "--shardsvr, manually updating the shardIdentity document in the " << NamespaceString::kServerConfigurationNamespace.toString() << " collection, and restarting the server with --shardsvr.", serverGlobalParams.overrideShardIdentity.isEmpty()); // Use the shardIdentity document on disk if one exists, but it is okay if no shardIdentity // document is available at all (sharding awareness will be initialized when a shardIdentity // document is inserted) BSONObj shardIdentityBSON; const bool foundShardIdentity = [&] { AutoGetCollection autoColl(opCtx, NamespaceString::kServerConfigurationNamespace, MODE_IS); return Helpers::findOne(opCtx, autoColl.getCollection(), BSON("_id" << ShardIdentityType::IdName), shardIdentityBSON); }(); if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) { if (!foundShardIdentity) { warning() << "Started with --shardsvr, but no shardIdentity document was found on " "disk in " << NamespaceString::kServerConfigurationNamespace << ". This most likely means this server has not yet been added to a " "sharded cluster."; return false; } invariant(!shardIdentityBSON.isEmpty()); auto shardIdentity = uassertStatusOK(ShardIdentityType::fromShardIdentityDocument(shardIdentityBSON)); { // Global lock is required to call initializeFromShardIdentity Lock::GlobalWrite lk(opCtx); initializeFromShardIdentity(opCtx, shardIdentity); } return true; } else { // Warn if a shardIdentity document is found on disk but *not* started with --shardsvr. if (!shardIdentityBSON.isEmpty()) { warning() << "Not started with --shardsvr, but a shardIdentity document was found " "on disk in " << NamespaceString::kServerConfigurationNamespace << ": " << shardIdentityBSON; } return false; } }
/**
 * Recovers the sharding state from the recovery document, if one is present.
 *
 * Does nothing (returns OK) when the node is not a shard server or when no recovery document is
 * found. Otherwise it makes sure the sharding state is initialized and then either:
 *  - advances the config opTime directly to the document's minOpTime when the document records
 *    no in-flight minOpTime updaters, or
 *  - performs a majority logging write against the config server, then clears the recovery
 *    document.
 *
 * Returns a non-OK status when looking up the document throws a DBException, when the document
 * cannot be parsed, or when the logging write fails. Failure to clear the recovery document is
 * only logged as a warning.
 */
Status ShardingStateRecovery::recover(OperationContext* txn) {
    if (serverGlobalParams.clusterRole != ClusterRole::ShardServer) {
        return Status::OK();
    }

    BSONObj recoveryDocBSON;

    try {
        AutoGetCollection autoColl(txn, NamespaceString::kConfigCollectionNamespace, MODE_IS);
        if (!Helpers::findOne(
                txn, autoColl.getCollection(), RecoveryDocument::getQuery(), recoveryDocBSON)) {
            return Status::OK();
        }
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto recoveryDocStatus = RecoveryDocument::fromBSON(recoveryDocBSON);
    if (!recoveryDocStatus.isOK()) {
        return recoveryDocStatus.getStatus();
    }

    // Bind by reference: the StatusWith is const, so std::move on its value could only ever
    // copy. The StatusWith outlives every use of the reference.
    const auto& recoveryDoc = recoveryDocStatus.getValue();

    log() << "Sharding state recovery process found document " << recoveryDoc.toBSON();

    // Make sure the sharding state is initialized
    ShardingState* const shardingState = ShardingState::get(txn);

    // For backwards compatibility. Shards added by v3.4 cluster should have been initialized by
    // the shard identity document.
    // TODO(SERVER-25276): Remove this after 3.4 since 3.4 shards should always have ShardingState
    // initialized by this point.
    if (!shardingState->enabled()) {
        shardingState->initializeFromConfigConnString(txn, recoveryDoc.getConfigsvr().toString());
        shardingState->setShardName(recoveryDoc.getShardName());
    }

    if (!recoveryDoc.getMinOpTimeUpdaters()) {
        // Treat the minOpTime as up-to-date
        grid.advanceConfigOpTime(recoveryDoc.getMinOpTime());
        return Status::OK();
    }

    log() << "Sharding state recovery document indicates there were "
          << recoveryDoc.getMinOpTimeUpdaters()
          << " metadata change operations in flight. Contacting the config server primary in order "
             "to retrieve the most recent opTime.";

    // Need to fetch the latest opTime from the config server, so do a logging write
    Status status =
        grid.catalogClient(txn)->logChange(txn,
                                           "Sharding minOpTime recovery",
                                           NamespaceString::kConfigCollectionNamespace.ns(),
                                           recoveryDocBSON,
                                           ShardingCatalogClient::kMajorityWriteConcern);
    if (!status.isOK()) {
        return status;
    }

    log() << "Sharding state recovered. New config server opTime is " << grid.configOpTime();

    // Finally, clear the recovery document so next time we don't need to recover
    status = modifyRecoveryDocument(txn, RecoveryDocument::Clear, kLocalWriteConcern);
    if (!status.isOK()) {
        warning() << "Failed to reset sharding state recovery document due to " << status;
    }

    return Status::OK();
}
/**
 * Scans the chunk's key range through a shard-key-prefixed index and records the RecordId of
 * every document found into '_cloneLocs'. Before scanning it installs the delete-notification
 * executor on the collection.
 *
 * Returns:
 *  - NamespaceNotFound if the collection does not exist;
 *  - IndexNotFound if no index prefixed by the shard key pattern exists;
 *  - InternalError if the scanning executor ends in the DEAD or FAILURE state;
 *  - ChunkTooBig if the chunk holds more documents than the estimated maximum for a full chunk;
 *  - Status::OK() otherwise, after updating '_averageObjectSizeForCloneLocs'.
 */
Status MigrationChunkClonerSourceLegacy::_storeCurrentLocs(OperationContext* txn) {
    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

    Collection* const collection = autoColl.getCollection();
    if (!collection) {
        return {ErrorCodes::NamespaceNotFound,
                str::stream() << "Collection " << _args.getNss().ns() << " does not exist."};
    }

    // Allow multiKey based on the invariant that shard keys must be single-valued. Therefore, any
    // multi-key index prefixed by shard key cannot be multikey over the shard key fields.
    IndexDescriptor* idx =
        collection->getIndexCatalog()->findShardKeyPrefixedIndex(txn,
                                                                 _shardKeyPattern.toBSON(),
                                                                 false);  // requireSingleKey
    if (!idx) {
        return {ErrorCodes::IndexNotFound,
                str::stream() << "can't find index with prefix " << _shardKeyPattern.toBSON()
                              << " in storeCurrentLocs for "
                              << _args.getNss().ns()};
    }

    // Install the stage, which will listen for notifications on the collection
    {
        stdx::lock_guard<stdx::mutex> sl(_mutex);

        invariant(!_deleteNotifyExec);

        // The working set and the notification stage are handed over to the executor.
        auto statusWithPlanExecutor =
            PlanExecutor::make(txn,
                               stdx::make_unique<WorkingSet>(),
                               stdx::make_unique<DeleteNotificationStage>(this, txn),
                               collection,
                               PlanExecutor::YIELD_MANUAL);
        invariant(statusWithPlanExecutor.isOK());

        _deleteNotifyExec = std::move(statusWithPlanExecutor.getValue());
        _deleteNotifyExec->registerExec(collection);
    }

    // Assume both min and max non-empty, append MinKey's to make them fit chosen index
    const KeyPattern kp(idx->keyPattern());

    BSONObj min = Helpers::toKeyFormat(kp.extendRangeBound(_args.getMinKey(), false));
    BSONObj max = Helpers::toKeyFormat(kp.extendRangeBound(_args.getMaxKey(), false));

    std::unique_ptr<PlanExecutor> exec(InternalPlanner::indexScan(txn,
                                                                  collection,
                                                                  idx,
                                                                  min,
                                                                  max,
                                                                  false,  // endKeyInclusive
                                                                  PlanExecutor::YIELD_MANUAL));

    // We can afford to yield here because any change to the base data that we might miss is already
    // being queued and will migrate in the 'transferMods' stage.
    exec->setYieldPolicy(PlanExecutor::YIELD_AUTO, collection);

    // Use the average object size to estimate how many objects a full chunk would carry. Do that
    // while traversing the chunk's range using the sharding index. Below there's a fair amount of
    // slack before we determine a chunk is too large because object sizes will vary.
    unsigned long long maxRecsWhenFull;
    long long avgRecSize;

    const long long totalRecs = collection->numRecords(txn);
    if (totalRecs > 0) {
        avgRecSize = collection->dataSize(txn) / totalRecs;

        // The integer division above yields zero whenever the reported data size is smaller than
        // the reported record count, and dividing by that would be undefined behaviour. Fall back
        // to the size of an empty BSON document, the smallest size a record can have.
        const long long kMinBSONDocumentSize = 5;
        if (avgRecSize <= 0) {
            avgRecSize = kMinBSONDocumentSize;
        }

        maxRecsWhenFull = _args.getMaxChunkSizeBytes() / avgRecSize;
        maxRecsWhenFull =
            std::min(static_cast<unsigned long long>(Chunk::MaxObjectPerChunk + 1),
                     130 * maxRecsWhenFull / 100 /* slack */);
    } else {
        avgRecSize = 0;
        maxRecsWhenFull = Chunk::MaxObjectPerChunk + 1;
    }

    // Do a full traversal of the chunk and don't stop even if we think it is a large chunk we want
    // the number of records to better report, in that case.
    bool isLargeChunk = false;
    unsigned long long recCount = 0;

    BSONObj obj;
    RecordId recordId;
    PlanExecutor::ExecState state;
    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, &recordId))) {
        if (!isLargeChunk) {
            stdx::lock_guard<stdx::mutex> lk(_mutex);
            _cloneLocs.insert(recordId);
        }

        if (++recCount > maxRecsWhenFull) {
            isLargeChunk = true;
            // Continue on despite knowing that it will fail, just to get the correct value for
            // recCount
        }
    }

    if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
        return {ErrorCodes::InternalError,
                str::stream() << "Executor error while scanning for documents belonging to chunk: "
                              << WorkingSetCommon::toStatusString(obj)};
    }

    exec.reset();

    if (isLargeChunk) {
        return {
            ErrorCodes::ChunkTooBig,
            str::stream() << "Cannot move chunk: the maximum number of documents for a chunk is "
                          << maxRecsWhenFull
                          << ", the maximum chunk size is "
                          << _args.getMaxChunkSizeBytes()
                          << ", average document size is "
                          << avgRecSize
                          << ". Found "
                          << recCount
                          << " documents in chunk "
                          << " ns: "
                          << _args.getNss().ns()
                          << " "
                          << _args.getMinKey()
                          << " -> "
                          << _args.getMaxKey()};
    }

    _averageObjectSizeForCloneLocs = static_cast<uint64_t>(collection->averageObjectSize(txn) + 12);
    return Status::OK();
}