OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationContext* txn,
                                                                      bool isV1ElectionProtocol) {
    invariant(txn->lockState()->isW());

    // Clear the appliedThrough marker so on startup we'll use the top of the oplog. This must be
    // done before we add anything to our oplog.
    invariant(_storageInterface->getOplogDeleteFromPoint(txn).isNull());
    _storageInterface->setAppliedThrough(txn, {});

    if (isV1ElectionProtocol) {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_X);
            WriteUnitOfWork wuow(txn);
            txn->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
                txn,
                BSON("msg"
                     << "new primary"));
            wuow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
            txn, "logging transition to primary to oplog", "local.oplog.rs");
    }
    const auto opTimeToReturn = fassertStatusOK(28665, loadLastOpTime(txn));

    _shardingOnTransitionToPrimaryHook(txn);
    _dropAllTempCollections(txn);

    return opTimeToReturn;
}
Status MigrationSourceManager::startClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCreated);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.start",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    _cloneDriver = stdx::make_unique<MigrationChunkClonerSourceLegacy>(
        _args, _committedMetadata->getKeyPattern());

    {
        // Register for notifications from the replication subsystem
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        css->setMigrationSourceManager(txn, this);
    }

    Status startCloneStatus = _cloneDriver->startClone(txn);
    if (!startCloneStatus.isOK()) {
        return startCloneStatus;
    }

    _state = kCloning;
    scopedGuard.Dismiss();
    return Status::OK();
}
Status verifySystemIndexes(OperationContext* txn) {
    const NamespaceString systemUsers = AuthorizationManager::usersCollectionNamespace;

    // Make sure the old unique index from v2.4 on system.users doesn't exist.
    ScopedTransaction scopedXact(txn, MODE_IX);
    AutoGetDb autoDb(txn, systemUsers.db(), MODE_X);
    if (!autoDb.getDb()) {
        return Status::OK();
    }

    Collection* collection = autoDb.getDb()->getCollection(NamespaceString(systemUsers));
    if (!collection) {
        return Status::OK();
    }

    IndexCatalog* indexCatalog = collection->getIndexCatalog();
    IndexDescriptor* oldIndex = NULL;
    if (indexCatalog &&
        (oldIndex = indexCatalog->findIndexByKeyPattern(txn, v1SystemUsersKeyPattern))) {
        return Status(ErrorCodes::AuthSchemaIncompatible,
                      "Old 2.4 style user index identified. "
                      "The authentication schema needs to be updated by "
                      "running authSchemaUpgrade on a 2.6 server.");
    }

    return Status::OK();
}
bool run(OperationContext* txn,
         const string& dbname,
         BSONObj& jsobj,
         int,
         string& errmsg,
         BSONObjBuilder& result,
         bool /*fromRepl*/) {
    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetDb autoDb(txn, dbname, MODE_S);

    const Database* d = autoDb.getDb();
    const DatabaseCatalogEntry* dbEntry = NULL;

    list<string> names;
    if ( d ) {
        dbEntry = d->getDatabaseCatalogEntry();
        dbEntry->getCollectionNamespaces( &names );
        names.sort();
    }

    scoped_ptr<MatchExpression> matcher;
    if ( jsobj["filter"].isABSONObj() ) {
        StatusWithMatchExpression parsed =
            MatchExpressionParser::parse( jsobj["filter"].Obj() );
        if ( !parsed.isOK() ) {
            return appendCommandStatus( result, parsed.getStatus() );
        }
        matcher.reset( parsed.getValue() );
    }

    BSONArrayBuilder arr;

    for ( list<string>::const_iterator i = names.begin(); i != names.end(); ++i ) {
        string ns = *i;

        StringData collection = nsToCollectionSubstring( ns );
        if ( collection == "system.namespaces" ) {
            continue;
        }

        BSONObjBuilder b;
        b.append( "name", collection );

        CollectionOptions options =
            dbEntry->getCollectionCatalogEntry( txn, ns )->getCollectionOptions(txn);
        b.append( "options", options.toBSON() );

        BSONObj maybe = b.obj();
        if ( matcher && !matcher->matchesBSON( maybe ) ) {
            continue;
        }

        arr.append( maybe );
    }

    result.append( "collections", arr.arr() );

    return true;
}
void restartInProgressIndexesFromLastShutdown(OperationContext* txn) {
    txn->getClient()->getAuthorizationSession()->grantInternalAuthorization();

    std::vector<std::string> dbNames;

    StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
    storageEngine->listDatabases( &dbNames );

    try {
        std::list<std::string> collNames;
        for (std::vector<std::string>::const_iterator dbName = dbNames.begin();
             dbName < dbNames.end();
             ++dbName) {
            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetDb autoDb(txn, *dbName, MODE_S);

            Database* db = autoDb.getDb();
            db->getDatabaseCatalogEntry()->getCollectionNamespaces(&collNames);
        }
        checkNS(txn, collNames);
    }
    catch (const DBException& e) {
        error() << "Index verification did not complete: " << e.toString();
        fassertFailedNoTrace(18643);
    }
    LOG(1) << "checking complete" << endl;
}
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* txn,
                                                                         const BSONObj& config) {
    try {
        createOplog(txn);

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWrite(txn->lockState());

            WriteUnitOfWork wuow(txn);
            Helpers::putSingleton(txn, configCollectionName, config);
            const auto msgObj = BSON("msg"
                                     << "initiating set");
            getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, msgObj);
            wuow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");

        // This initializes the minvalid document with a null "ts" because older versions (<=3.2)
        // get angry if the minValid document is present but doesn't have a "ts" field.
        // Consider removing this once we no longer need to support downgrading to 3.2.
        _storageInterface->setMinValidToAtLeast(txn, {});

        FeatureCompatibilityVersion::setIfCleanStartup(txn, _storageInterface);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    return Status::OK();
}
void MigrationSourceManager::_cleanup(OperationContext* txn) {
    invariant(_state != kDone);

    {
        // Unregister from the collection's sharding state
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());

        // The migration source manager is not visible anymore after it is unregistered from the
        // collection
        css->clearMigrationSourceManager(txn);

        // Leave the critical section.
        if (_state == kCriticalSection) {
            _critSecSignal->set();
        }
    }

    // Decrement the metadata op counter outside of the collection lock in order to hold it for as
    // short as possible.
    if (_state == kCriticalSection) {
        ShardingStateRecovery::endMetadataOp(txn);
    }

    if (_cloneDriver) {
        _cloneDriver->cancelClone(txn);
        _cloneDriver.reset();
    }

    _state = kDone;
}
void ReplicationCoordinatorExternalStateImpl::initiateOplog(OperationContext* txn) {
    createOplog(txn);

    ScopedTransaction scopedXact(txn, MODE_X);
    Lock::GlobalWrite globalWrite(txn->lockState());

    WriteUnitOfWork wuow(txn);
    getGlobalServiceContext()->getOpObserver()->onOpMessage(txn,
                                                            BSON("msg"
                                                                 << "initiating set"));
    wuow.commit();
}
void ReplicationCoordinatorExternalStateImpl::initiateOplog(OperationContext* txn) {
    createOplog(txn);

    MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
        ScopedTransaction scopedXact(txn, MODE_X);
        Lock::GlobalWrite globalWrite(txn->lockState());
        WriteUnitOfWork wuow(txn);

        getGlobalServiceContext()->getOpObserver()->onOpMessage(txn,
                                                                BSON("msg"
                                                                     << "initiating set"));
        wuow.commit();
    }
    MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");
}
void MigrationChunkClonerSourceLegacy::_cleanup(OperationContext* txn) {
    {
        stdx::lock_guard<stdx::mutex> sl(_mutex);
        _cloneCompleted = true;
    }

    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

    if (_deleteNotifyExec) {
        _deleteNotifyExec.reset();
    }
}
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* txn,
                                                                         const BSONObj& config,
                                                                         bool updateReplOpTime) {
    try {
        createOplog(txn, rsOplogName, true);

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWrite(txn->lockState());

            WriteUnitOfWork wuow(txn);
            Helpers::putSingleton(txn, configCollectionName, config);
            const auto msgObj = BSON("msg"
                                     << "initiating set");
            if (updateReplOpTime) {
                getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, msgObj);
            } else {
                // 'updateReplOpTime' is false when called from the replSetInitiate command when
                // the server is running with replication disabled. We bypass onOpMessage to
                // invoke _logOp directly so that we can override the replication mode and keep
                // _logOp from updating the replication coordinator's op time (illegal operation
                // when replication is not enabled).
                repl::oplogCheckCloseDatabase(txn, nullptr);
                repl::_logOp(txn,
                             "n",
                             "",
                             msgObj,
                             nullptr,
                             false,
                             rsOplogName,
                             ReplicationCoordinator::modeReplSet,
                             updateReplOpTime);
                repl::oplogCheckCloseDatabase(txn, nullptr);
            }
            wuow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
    return Status::OK();
}
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* txn,
                                                                         const BSONObj& config) {
    try {
        createOplog(txn);

        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_X);
            Lock::GlobalWrite globalWrite(txn->lockState());

            WriteUnitOfWork wuow(txn);
            Helpers::putSingleton(txn, configCollectionName, config);
            const auto msgObj = BSON("msg"
                                     << "initiating set");
            getGlobalServiceContext()->getOpObserver()->onOpMessage(txn, msgObj);
            wuow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "initiate oplog entry", "local.oplog.rs");
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    return Status::OK();
}
Status emptyCapped(OperationContext* txn, const NamespaceString& collectionName) {
    ScopedTransaction scopedXact(txn, MODE_IX);
    AutoGetDb autoDb(txn, collectionName.db(), MODE_X);

    bool userInitiatedWritesAndNotPrimary = txn->writesAreReplicated() &&
        !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(collectionName.db());

    if (userInitiatedWritesAndNotPrimary) {
        return Status(ErrorCodes::NotMaster,
                      str::stream() << "Not primary while truncating collection "
                                    << collectionName.ns());
    }

    Database* db = autoDb.getDb();
    massert(13429, "no such database", db);

    Collection* collection = db->getCollection(collectionName);
    massert(28584, "no such collection", collection);

    std::vector<BSONObj> indexes = stopIndexBuildsEmptyCapped(txn, db, collectionName);

    WriteUnitOfWork wuow(txn);

    Status status = collection->truncate(txn);
    if (!status.isOK()) {
        return status;
    }

    IndexBuilder::restoreIndexes(txn, indexes);

    getGlobalServiceContext()->getOpObserver()->onEmptyCapped(txn, collection->ns());

    wuow.commit();

    return Status::OK();
}
Status MigrationSourceManager::enterCriticalSection(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCloneCaughtUp);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Mark the shard as running critical operation, which requires recovery on crash
    Status status = ShardingStateRecovery::startMetadataOp(txn);
    if (!status.isOK()) {
        return status;
    }

    {
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        if (!css->getMetadata() ||
            !css->getMetadata()->getCollVersion().equals(_committedMetadata->getCollVersion())) {
            return {ErrorCodes::IncompatibleShardingMetadata,
                    str::stream()
                        << "Sharding metadata changed while holding distributed lock. Expected: "
                        << _committedMetadata->getCollVersion().toString()
                        << ", actual: "
                        << css->getMetadata()->getCollVersion().toString()};
        }

        // IMPORTANT: After this line, the critical section is in place and needs to be rolled
        // back if anything fails, which would prevent commit to the config servers.
        _critSecSignal = std::make_shared<Notification<void>>();
    }

    log() << "Successfully entered critical section.";

    _state = kCriticalSection;
    scopedGuard.Dismiss();
    return Status::OK();
}
virtual void subthread(int tnumber) {
    Client::initThread("mongomutextest");

    const ServiceContext::UniqueOperationContext txnPtr = cc().makeOperationContext();
    OperationContext& txn = *txnPtr;

    sleepmillis(0);
    for (int i = 0; i < N; i++) {
        int x = std::rand();
        bool sometimes = (x % 15 == 0);
        if (i % 7 == 0) {
            Lock::GlobalRead r(txn.lockState());  // nested test
            Lock::GlobalRead r2(txn.lockState());
        } else if (i % 7 == 1) {
            Lock::GlobalRead r(txn.lockState());
            ASSERT(txn.lockState()->isReadLocked());
        } else if (i % 7 == 4 && tnumber == 1 /*only one upgrader legal*/) {
            Lock::GlobalWrite w(txn.lockState());
            ASSERT(txn.lockState()->isW());
            if (i % 7 == 2) {
                Lock::TempRelease t(txn.lockState());
            }
        } else if (i % 7 == 2) {
            Lock::GlobalWrite w(txn.lockState());
            ASSERT(txn.lockState()->isW());
            if (sometimes) {
                Lock::TempRelease t(txn.lockState());
            }
        } else if (i % 7 == 3) {
            Lock::GlobalWrite w(txn.lockState());
            {
                Lock::TempRelease t(txn.lockState());
            }

            Lock::GlobalRead r(txn.lockState());
            ASSERT(txn.lockState()->isW());
            if (sometimes) {
                Lock::TempRelease t(txn.lockState());
            }
        } else if (i % 7 == 5) {
            {
                ScopedTransaction scopedXact(&txn, MODE_IS);
                Lock::DBLock r(txn.lockState(), "foo", MODE_S);
            }
            {
                ScopedTransaction scopedXact(&txn, MODE_IS);
                Lock::DBLock r(txn.lockState(), "bar", MODE_S);
            }
        } else if (i % 7 == 6) {
            if (i > N / 2) {
                int q = i % 11;
                if (q == 0) {
                    ScopedTransaction scopedXact(&txn, MODE_IS);

                    Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                    ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));

                    Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                    ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));

                    Lock::DBLock r3(txn.lockState(), "local", MODE_S);
                    ASSERT(txn.lockState()->isDbLockedForMode("foo", MODE_S));
                    ASSERT(txn.lockState()->isDbLockedForMode("local", MODE_S));
                } else if (q == 1) {
                    // test locking local only -- with no preceding lock
                    {
                        ScopedTransaction scopedXact(&txn, MODE_IS);
                        Lock::DBLock x(txn.lockState(), "local", MODE_S);
                    }
                    {
                        ScopedTransaction scopedXact(&txn, MODE_IX);
                        Lock::DBLock x(txn.lockState(), "local", MODE_X);

                        // No actual writing here, so no WriteUnitOfWork
                        if (sometimes) {
                            Lock::TempRelease t(txn.lockState());
                        }
                    }
                } else if (q == 2) {
                    {
                        ScopedTransaction scopedXact(&txn, MODE_IS);
                        Lock::DBLock x(txn.lockState(), "admin", MODE_S);
                    }
                    {
                        ScopedTransaction scopedXact(&txn, MODE_IX);
                        Lock::DBLock x(txn.lockState(), "admin", MODE_X);
                    }
                } else if (q == 3) {
                    ScopedTransaction scopedXact(&txn, MODE_IX);

                    Lock::DBLock x(txn.lockState(), "foo", MODE_X);
                    Lock::DBLock y(txn.lockState(), "admin", MODE_S);
                } else if (q == 4) {
                    ScopedTransaction scopedXact(&txn, MODE_IS);

                    Lock::DBLock x(txn.lockState(), "foo2", MODE_S);
                    Lock::DBLock y(txn.lockState(), "admin", MODE_S);
                } else {
                    ScopedTransaction scopedXact(&txn, MODE_IX);

                    Lock::DBLock w(txn.lockState(), "foo", MODE_X);
                    {
                        Lock::TempRelease t(txn.lockState());
                    }

                    Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                    Lock::DBLock r3(txn.lockState(), "local", MODE_S);
                }
            } else {
                ScopedTransaction scopedXact(&txn, MODE_IS);

                Lock::DBLock r(txn.lockState(), "foo", MODE_S);
                Lock::DBLock r2(txn.lockState(), "foo", MODE_S);
                Lock::DBLock r3(txn.lockState(), "local", MODE_S);
            }
        }
        pm.hit();
    }
}
Status MigrationChunkClonerSourceLegacy::_storeCurrentLocs(OperationContext* txn) {
    ScopedTransaction scopedXact(txn, MODE_IS);
    AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

    Collection* const collection = autoColl.getCollection();
    if (!collection) {
        return {ErrorCodes::NamespaceNotFound,
                str::stream() << "Collection " << _args.getNss().ns() << " does not exist."};
    }

    // Allow multiKey based on the invariant that shard keys must be single-valued. Therefore, any
    // multi-key index prefixed by shard key cannot be multikey over the shard key fields.
    IndexDescriptor* idx =
        collection->getIndexCatalog()->findShardKeyPrefixedIndex(txn,
                                                                 _shardKeyPattern.toBSON(),
                                                                 false);  // requireSingleKey
    if (!idx) {
        return {ErrorCodes::IndexNotFound,
                str::stream() << "can't find index with prefix " << _shardKeyPattern.toBSON()
                              << " in storeCurrentLocs for "
                              << _args.getNss().ns()};
    }

    // Install the stage, which will listen for notifications on the collection
    {
        stdx::lock_guard<stdx::mutex> sl(_mutex);
        invariant(!_deleteNotifyExec);

        // Takes ownership of 'ws' and 'dns'.
        auto statusWithPlanExecutor =
            PlanExecutor::make(txn,
                               stdx::make_unique<WorkingSet>(),
                               stdx::make_unique<DeleteNotificationStage>(this, txn),
                               collection,
                               PlanExecutor::YIELD_MANUAL);
        invariant(statusWithPlanExecutor.isOK());

        _deleteNotifyExec = std::move(statusWithPlanExecutor.getValue());
        _deleteNotifyExec->registerExec(collection);
    }

    // Assume both min and max non-empty, append MinKey's to make them fit chosen index
    const KeyPattern kp(idx->keyPattern());

    BSONObj min = Helpers::toKeyFormat(kp.extendRangeBound(_args.getMinKey(), false));
    BSONObj max = Helpers::toKeyFormat(kp.extendRangeBound(_args.getMaxKey(), false));

    std::unique_ptr<PlanExecutor> exec(InternalPlanner::indexScan(txn,
                                                                  collection,
                                                                  idx,
                                                                  min,
                                                                  max,
                                                                  false,  // endKeyInclusive
                                                                  PlanExecutor::YIELD_MANUAL));

    // We can afford to yield here because any change to the base data that we might miss is
    // already being queued and will migrate in the 'transferMods' stage.
    exec->setYieldPolicy(PlanExecutor::YIELD_AUTO, collection);

    // Use the average object size to estimate how many objects a full chunk would carry; do that
    // while traversing the chunk's range using the sharding index. Below there's a fair amount of
    // slack before we determine a chunk is too large because object sizes will vary.
    unsigned long long maxRecsWhenFull;
    long long avgRecSize;

    const long long totalRecs = collection->numRecords(txn);
    if (totalRecs > 0) {
        avgRecSize = collection->dataSize(txn) / totalRecs;
        maxRecsWhenFull = _args.getMaxChunkSizeBytes() / avgRecSize;
        maxRecsWhenFull = std::min((unsigned long long)(Chunk::MaxObjectPerChunk + 1),
                                   130 * maxRecsWhenFull / 100 /* slack */);
    } else {
        avgRecSize = 0;
        maxRecsWhenFull = Chunk::MaxObjectPerChunk + 1;
    }

    // Do a full traversal of the chunk and don't stop even if we think it is a large chunk; we
    // want the number of records in order to better report, in that case.
    bool isLargeChunk = false;
    unsigned long long recCount = 0;

    BSONObj obj;
    RecordId recordId;
    PlanExecutor::ExecState state;
    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, &recordId))) {
        if (!isLargeChunk) {
            stdx::lock_guard<stdx::mutex> lk(_mutex);
            _cloneLocs.insert(recordId);
        }

        if (++recCount > maxRecsWhenFull) {
            isLargeChunk = true;
            // Continue on despite knowing that it will fail, just to get the correct value for
            // recCount
        }
    }

    if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
        return {ErrorCodes::InternalError,
                str::stream() << "Executor error while scanning for documents belonging to chunk: "
                              << WorkingSetCommon::toStatusString(obj)};
    }

    exec.reset();

    if (isLargeChunk) {
        return {
            ErrorCodes::ChunkTooBig,
            str::stream() << "Cannot move chunk: the maximum number of documents for a chunk is "
                          << maxRecsWhenFull
                          << ", the maximum chunk size is "
                          << _args.getMaxChunkSizeBytes()
                          << ", average document size is "
                          << avgRecSize
                          << ". Found "
                          << recCount
                          << " documents in chunk "
                          << " ns: "
                          << _args.getNss().ns()
                          << " "
                          << _args.getMinKey()
                          << " -> "
                          << _args.getMaxKey()};
    }

    _averageObjectSizeForCloneLocs = static_cast<uint64_t>(collection->averageObjectSize(txn) + 12);

    return Status::OK();
}
MigrationSourceManager::MigrationSourceManager(OperationContext* txn, MoveChunkRequest request)
    : _args(std::move(request)), _startTime() {
    invariant(!txn->lockState()->isLocked());

    const auto& oss = OperationShardingState::get(txn);
    if (!oss.hasShardVersion()) {
        uasserted(ErrorCodes::InvalidOptions, "collection version is missing");
    }

    // Even though the moveChunk command transmits a value in the operation's shardVersion field,
    // this value does not actually contain the shard version, but the global collection version.
    const ChunkVersion expectedCollectionVersion = oss.getShardVersion(_args.getNss());

    log() << "Starting chunk migration for "
          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
          << " with expected collection version " << expectedCollectionVersion;

    // Now that the collection is locked, snapshot the metadata and fetch the latest versions
    ShardingState* const shardingState = ShardingState::get(txn);

    ChunkVersion shardVersion;

    Status refreshStatus =
        shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
    if (!refreshStatus.isOK()) {
        uasserted(refreshStatus.code(),
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " due to "
                                << refreshStatus.toString());
    }

    if (shardVersion.majorVersion() == 0) {
        // If the major version is zero, this means we do not have any chunks locally to migrate
        // in the first place
        uasserted(ErrorCodes::IncompatibleShardingMetadata,
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " with zero shard version");
    }

    // Snapshot the committed metadata from the time the migration starts
    {
        ScopedTransaction scopedXact(txn, MODE_IS);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

        auto css = CollectionShardingState::get(txn, _args.getNss());
        _committedMetadata = css->getMetadata();
    }

    const ChunkVersion collectionVersion = _committedMetadata->getCollVersion();

    if (expectedCollectionVersion.epoch() != collectionVersion.epoch()) {
        throw SendStaleConfigException(
            _args.getNss().ns(),
            str::stream() << "cannot move chunk "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " because collection may have been dropped. "
                          << "current epoch: "
                          << collectionVersion.epoch()
                          << ", cmd epoch: "
                          << expectedCollectionVersion.epoch(),
            expectedCollectionVersion,
            collectionVersion);
    }

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(collectionVersion >= shardVersion);

    // With nonzero shard version, we must have a shard key
    invariant(!_committedMetadata->getKeyPattern().isEmpty());

    ChunkType origChunk;
    if (!_committedMetadata->getNextChunk(_args.getMinKey(), &origChunk)) {
        // If this assertion is hit, it means that whoever called the shard moveChunk command
        // (mongos or the CSRS balancer) did not check whether the chunk actually belongs to this
        // shard. It is a benign error and does not indicate data corruption.
        uasserted(40145,
                  str::stream() << "Chunk with bounds "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " is not owned by this shard.");
    }

    uassert(40146,
            str::stream() << "Unable to find chunk with the exact bounds "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " at collection version "
                          << collectionVersion.toString()
                          << ". This indicates corrupted metadata.",
            origChunk.getMin().woCompare(_args.getMinKey()) == 0 &&
                origChunk.getMax().woCompare(_args.getMaxKey()) == 0);
}
virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool fromRepl) {
    if ( cmdObj.firstElement().type() != Array ) {
        errmsg = "ops has to be an array";
        return false;
    }

    BSONObj ops = cmdObj.firstElement().Obj();

    {
        // check input
        BSONObjIterator i( ops );
        while ( i.more() ) {
            BSONElement e = i.next();
            if (!_checkOperation(e, errmsg)) {
                return false;
            }
        }
    }

    // SERVER-4328 todo : is global ok or does this take a long time? i believe multiple
    // ns used so locking individually requires more analysis
    ScopedTransaction scopedXact(txn, MODE_X);
    Lock::GlobalWrite globalWriteLock(txn->lockState());

    if (!fromRepl &&
        !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(dbname)) {
        return appendCommandStatus(result,
                                   Status(ErrorCodes::NotMaster,
                                          str::stream()
                                              << "Not primary while applying ops to database "
                                              << dbname));
    }

    // Preconditions check reads the database state, so needs to be done locked
    if ( cmdObj["preCondition"].type() == Array ) {
        BSONObjIterator i( cmdObj["preCondition"].Obj() );
        while ( i.more() ) {
            BSONObj f = i.next().Obj();

            DBDirectClient db( txn );
            BSONObj realres = db.findOne( f["ns"].String() , f["q"].Obj() );

            // Apply-ops would never have a $where matcher, so use the default callback,
            // which will throw an error if $where is found.
            Matcher m(f["res"].Obj());
            if ( ! m.matches( realres ) ) {
                result.append( "got" , realres );
                result.append( "whatFailed" , f );
                errmsg = "pre-condition failed";
                return false;
            }
        }
    }

    // apply
    int num = 0;
    int errors = 0;

    BSONObjIterator i( ops );
    BSONArrayBuilder ab;
    const bool alwaysUpsert =
        cmdObj.hasField("alwaysUpsert") ? cmdObj["alwaysUpsert"].trueValue() : true;

    while ( i.more() ) {
        BSONElement e = i.next();
        const BSONObj& temp = e.Obj();

        // Ignore 'n' operations.
        const char *opType = temp["op"].valuestrsafe();
        if (*opType == 'n') continue;

        const string ns = temp["ns"].String();

        // Run operations under a nested lock as a hack to prevent yielding.
        //
        // The list of operations is supposed to be applied atomically; yielding
        // would break atomicity by allowing an interruption or a shutdown to occur
        // after only some operations are applied. We are already locked globally
        // at this point, so taking a DBLock on the namespace creates a nested lock,
        // and yields are disallowed for operations that hold a nested lock.
        //
        // We do not have a wrapping WriteUnitOfWork so it is possible for a journal
        // commit to happen with a subset of ops applied.
        // TODO figure out what to do about this.
        Lock::GlobalWrite globalWriteLockDisallowTempRelease(txn->lockState());

        // Ensures that yielding will not happen (see the comment above).
        DEV {
            Locker::LockSnapshot lockSnapshot;
            invariant(!txn->lockState()->saveLockStateAndUnlock(&lockSnapshot));
        };

        OldClientContext ctx(txn, ns);

        Status status(ErrorCodes::InternalError, "");
        while (true) {
            try {
                // We assume that in the WriteConflict retry case, either the op rolls back
                // any changes it makes or is otherwise safe to rerun.
                status = repl::applyOperation_inlock(txn, ctx.db(), temp, false, alwaysUpsert);
                break;
            }
            catch (const WriteConflictException& wce) {
                LOG(2) << "WriteConflictException in applyOps command, retrying.";
                txn->recoveryUnit()->commitAndRestart();
                continue;
            }
        }

        ab.append(status.isOK());
        if (!status.isOK()) {
            errors++;
        }

        num++;

        WriteUnitOfWork wuow(txn);
        logOpForDbHash(txn, ns.c_str());
        wuow.commit();
    }

    result.append( "applied" , num );
    result.append( "results" , ab.arr() );

    if ( ! fromRepl ) {
        // We want this applied atomically on slaves
        // so we re-wrap without the pre-condition for speed

        string tempNS = str::stream() << dbname << ".$cmd";

        // TODO: possibly use mutable BSON to remove preCondition field
        // once it is available
        BSONObjIterator iter(cmdObj);
        BSONObjBuilder cmdBuilder;

        while (iter.more()) {
            BSONElement elem(iter.next());
            if (strcmp(elem.fieldName(), "preCondition") != 0) {
                cmdBuilder.append(elem);
            }
        }

        const BSONObj cmdRewritten = cmdBuilder.done();

        // We currently always logOp the command regardless of whether the individual ops
        // succeeded and rely on any failures to also happen on secondaries. This isn't
        // perfect, but it's what the command has always done and is part of its "correct"
        // behavior.
        while (true) {
            try {
                WriteUnitOfWork wunit(txn);
                getGlobalEnvironment()->getOpObserver()->onApplyOps(txn, tempNS, cmdRewritten);
                wunit.commit();
                break;
            }
            catch (const WriteConflictException& wce) {
                LOG(2) << "WriteConflictException while logging applyOps command, retrying.";
                txn->recoveryUnit()->commitAndRestart();
                continue;
            }
        }
    }

    if (errors != 0) {
        return false;
    }

    return true;
}
Status MigrationSourceManager::commitDonateChunk(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCriticalSection);
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Tell the recipient shard to fetch the latest changes
    Status commitCloneStatus = _cloneDriver->commitClone(txn);

    if (MONGO_FAIL_POINT(failMigrationCommit) && commitCloneStatus.isOK()) {
        commitCloneStatus = {ErrorCodes::InternalError,
                             "Failing _recvChunkCommit due to failpoint."};
    }

    if (!commitCloneStatus.isOK()) {
        return {commitCloneStatus.code(),
                str::stream() << "commit clone failed due to " << commitCloneStatus.toString()};
    }

    // Generate the next collection version.
    ChunkVersion uncommittedCollVersion = _committedMetadata->getCollVersion();
    uncommittedCollVersion.incMajor();

    // applyOps preparation for reflecting the uncommitted metadata on the config server

    // Preconditions
    BSONArrayBuilder preCond;
    {
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
        b.append("q",
                 BSON("query" << BSON(ChunkType::ns(_args.getNss().ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
        {
            BSONObjBuilder bb(b.subobjStart("res"));
            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(),
                               _committedMetadata->getCollVersion().toLong());
            bb.done();
        }
        preCond.append(b.obj());
    }

    // Update for the chunk which is being donated
    BSONArrayBuilder updates;
    {
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        uncommittedCollVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), _args.getMinKey());
        n.append(ChunkType::max(), _args.getMaxKey());
        n.append(ChunkType::shard(), _args.getToShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        q.done();

        updates.append(op.obj());
    }

    // Update for the chunk being moved

    // Version at which the next highest lastmod will be set. If the chunk being moved is the last
    // in the shard, nextVersion is that chunk's lastmod otherwise the highest version is from the
    // chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = uncommittedCollVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    if (_committedMetadata->getNumChunks() > 1) {
        ChunkType bumpChunk;
        invariant(_committedMetadata->getDifferentChunk(_args.getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();
        nextVersion.incMinor();

        dassert(bumpMin.woCompare(_args.getMinKey()) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _args.getFromShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        q.done();

        updates.append(op.obj());

        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _args.getNss().ns() << "'";
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _args.getNss().ns() << "'";
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeCommitMigration);

    Status applyOpsStatus = grid.catalogClient(txn)->applyChunkOpsDeprecated(
        txn, updates.arr(), preCond.arr(), _args.getNss().ns(), nextVersion);

    if (MONGO_FAIL_POINT(failCommitMigrationCommand)) {
        applyOpsStatus = Status(ErrorCodes::InternalError,
                                "Failpoint 'failCommitMigrationCommand' generated error");
    }

    if (applyOpsStatus.isOK()) {
        // Now that applyOps succeeded and the new collection version is committed, update the
        // collection metadata to the new collection version and forget the migrated chunk.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        ChunkType migratingChunkToForget;
        migratingChunkToForget.setMin(_args.getMinKey());
        migratingChunkToForget.setMax(_args.getMaxKey());
        _committedMetadata =
            _committedMetadata->cloneMigrate(migratingChunkToForget, uncommittedCollVersion);
        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        css->setMetadata(_committedMetadata);
    } else {
        // This could be an unrelated error (e.g. network error). Check whether the metadata
        // update succeeded by refreshing the collection metadata from the config server and
        // checking that the original chunks no longer exist.

        warning() << "Migration metadata commit may have failed: refreshing metadata to check"
                  << causedBy(applyOpsStatus);

        // Need to get the latest optime in case the refresh request goes to a secondary --
        // otherwise the read won't wait for the write that applyChunkOpsDeprecated may have done.
        Status status = grid.catalogClient(txn)->logChange(
            txn,
            "moveChunk.validating",
            _args.getNss().ns(),
            BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey() << "from"
                       << _args.getFromShardId()
                       << "to"
                       << _args.getToShardId()));
        if (!status.isOK()) {
            fassertStatusOK(
                40137,
                {status.code(),
                 str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey() << ","
                               << _args.getMaxKey()
                               << ") due to "
                               << causedBy(applyOpsStatus)
                               << ", and updating the optime with a write before refreshing the "
                               << "metadata also failed: "
                               << causedBy(status)});
        }

        ShardingState* const shardingState = ShardingState::get(txn);
        ChunkVersion shardVersion;
        Status refreshStatus =
            shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
        fassertStatusOK(34431,
                        {refreshStatus.code(),
                         str::stream() << "applyOps failed to commit chunk ["
                                       << _args.getMinKey()
                                       << ","
                                       << _args.getMaxKey()
                                       << ") due to "
                                       << causedBy(applyOpsStatus)
                                       << ", and refreshing collection metadata failed: "
                                       << causedBy(refreshStatus)});

        {
            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

            auto css = CollectionShardingState::get(txn, _args.getNss());
            std::shared_ptr<CollectionMetadata> refreshedMetadata = css->getMetadata();

            if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) {
                invariant(refreshedMetadata->getCollVersion() ==
                          _committedMetadata->getCollVersion());

                // After refresh, the collection metadata indicates that the donor shard still
                // owns the chunk, so no migration changes were written to the config server
                // metadata.
                return {applyOpsStatus.code(),
                        str::stream() << "Migration was not committed, applyOps failed: "
                                      << causedBy(applyOpsStatus)};
            }

            ChunkVersion refreshedCollectionVersion = refreshedMetadata->getCollVersion();
            if (!refreshedCollectionVersion.equals(nextVersion)) {
                // The refreshed collection metadata's collection version does not match the
                // control chunk's updated collection version, which should now be the highest.
                // The control chunk was not committed, but the migrated chunk was. This state is
                // not recoverable.
                fassertStatusOK(40138,
                                {applyOpsStatus.code(),
                                 str::stream() << "Migration was partially committed, state is "
                                               << "unrecoverable. applyOps error: "
                                               << causedBy(applyOpsStatus)});
            }
        }
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLeavingCriticalSection);

    scopedGuard.Dismiss();
    _cleanup(txn);

    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.commit",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    return Status::OK();
}