// Background-thread entry point: performs one deferred insert of 'stmt'.
// Runs on its own OperationContext created from the current Client; failures
// are reported through _logFailure() (which rate-limits logging) rather than
// propagated to the caller.
void DeferredWriter::_worker(InsertStatement stmt) {
    auto uniqueOpCtx = Client::getCurrent()->makeOperationContext();
    OperationContext* opCtx = uniqueOpCtx.get();

    // Look up (or create) the target collection; on failure log and give up on
    // this statement.
    auto result = _getCollection(opCtx);
    if (!result.isOK()) {
        _logFailure(result.getStatus());
        return;
    }
    auto agc = std::move(result.getValue());

    // NOTE(review): this dereference assumes _getCollection() never yields a
    // holder with a null collection pointer — confirm against _getCollection's
    // contract.
    Collection& collection = *agc->getCollection();

    // Insert inside a WriteUnitOfWork, retrying on WriteConflictException.
    Status status = writeConflictRetry(opCtx, "deferred insert", _nss.ns(), [&] {
        WriteUnitOfWork wuow(opCtx);
        Status status = collection.insertDocument(opCtx, stmt, nullptr, false);
        if (!status.isOK()) {
            return status;
        }
        wuow.commit();
        return Status::OK();
    });

    // The in-flight byte count is decremented whether or not the insert
    // succeeded: the statement is no longer pending either way.
    stdx::lock_guard<stdx::mutex> lock(_mutex);
    _numBytes -= stmt.doc.objsize();

    // If a write to a deferred collection fails, periodically tell the log.
    if (!status.isOK()) {
        _logFailure(status);
    }
}
// Reads the newest entry of the local oplog and returns its OpTime together
// with its "h" (hash) field. Returns OpTimeWithHash(0) when the oplog is
// empty (e.g. before initial sync); fasserts on any read or parse failure,
// since the oplog must be readable and well-formed for replication to work.
OpTimeWithHash BackgroundSync::_readLastAppliedOpTimeWithHash(OperationContext* opCtx) {
    BSONObj oplogEntry;
    try {
        bool success = writeConflictRetry(
            opCtx, "readLastAppliedHash", NamespaceString::kRsOplogNamespace.ns(), [&] {
                // Exclusive lock on "local": Helpers::getLast scans backwards
                // over the oplog collection.
                Lock::DBLock lk(opCtx, "local", MODE_X);
                return Helpers::getLast(
                    opCtx, NamespaceString::kRsOplogNamespace.ns().c_str(), oplogEntry);
            });
        if (!success) {
            // This can happen when we are to do an initial sync. lastHash will be set
            // after the initial sync is complete.
            return OpTimeWithHash(0);
        }
    } catch (const DBException& ex) {
        severe() << "Problem reading " << NamespaceString::kRsOplogNamespace.ns() << ": "
                 << redact(ex);
        fassertFailed(18904);
    }

    // Extract the hash; a missing or non-integer "h" field means the oplog is
    // corrupt, which is fatal.
    long long hash;
    auto status = bsonExtractIntegerField(oplogEntry, kHashFieldName, &hash);
    if (!status.isOK()) {
        severe() << "Most recent entry in " << NamespaceString::kRsOplogNamespace.ns()
                 << " is missing or has invalid \"" << kHashFieldName
                 << "\" field. Oplog entry: " << redact(oplogEntry) << ": " << redact(status);
        fassertFailed(18902);
    }
    OplogEntry parsedEntry(oplogEntry);
    return OpTimeWithHash(hash, parsedEntry.getOpTime());
}
// Performs the storage-level work of stepping up to primary and returns the
// optime of the last entry in the oplog. Caller must hold the global
// exclusive (W) lock; the ordering of the steps below is significant.
OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationContext* opCtx,
                                                                      bool isV1ElectionProtocol) {
    invariant(opCtx->lockState()->isW());

    // Clear the appliedThrough marker so on startup we'll use the top of the oplog. This must be
    // done before we add anything to our oplog.
    // We record this update at the 'lastAppliedOpTime'. If there are any outstanding
    // checkpoints being taken, they should only reflect this write if they see all writes up
    // to our 'lastAppliedOpTime'.
    invariant(
        _replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull());
    auto lastAppliedOpTime = repl::ReplicationCoordinator::get(opCtx)->getMyLastAppliedOpTime();
    _replicationProcess->getConsistencyMarkers()->clearAppliedThrough(
        opCtx, lastAppliedOpTime.getTimestamp());

    // Under protocol version 1, write a "new primary" no-op so the first entry
    // of this primary's term is in the oplog.
    if (isV1ElectionProtocol) {
        writeConflictRetry(opCtx, "logging transition to primary to oplog", "local.oplog.rs", [&] {
            WriteUnitOfWork wuow(opCtx);
            opCtx->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
                opCtx,
                BSON("msg"
                     << "new primary"));
            wuow.commit();
        });
    }
    // loadLastOpTime must succeed here; 28665 identifies this fassert site.
    const auto opTimeToReturn = fassertStatusOK(28665, loadLastOpTime(opCtx));

    _shardingOnTransitionToPrimaryHook(opCtx);
    _dropAllTempCollections(opCtx);

    // As primary we now validate feature compatibility on user writes.
    serverGlobalParams.validateFeaturesAsMaster.store(true);

    return opTimeToReturn;
}
// Drops the given (non-null) database: records per-collection Top stats,
// closes the Database object, then asks the storage engine to remove the
// on-disk data. Caller must hold the database's MODE_X lock.
void DatabaseHolderImpl::dropDb(OperationContext* opCtx, Database* db) {
    invariant(db);

    // Copy the name up front: the Database object goes away in close() below.
    auto dbName = db->name();

    LOG(1) << "dropDatabase " << dbName;

    invariant(opCtx->lockState()->isDbLockedForMode(dbName, MODE_X));
    BackgroundOperation::assertNoBgOpInProgForDb(dbName);

    audit::logDropDatabase(opCtx->getClient(), dbName);

    auto const svcCtx = opCtx->getServiceContext();

    // Mark every collection in this database as dropped in the Top stats.
    for (auto it = db->begin(opCtx); it != db->end(opCtx); ++it) {
        auto collection = *it;
        if (!collection) {
            break;
        }
        Top::get(svcCtx).collectionDropped(collection->ns().ns(), true);
    }

    close(opCtx, dbName);

    auto const engine = svcCtx->getStorageEngine();
    writeConflictRetry(opCtx, "dropDatabase", dbName, [&] {
        engine->dropDatabase(opCtx, dbName).transitional_ignore();
    });
}
// Performs the storage side of replSetInitiate: creates the oplog, persists
// the initial replica set config, writes the "initiating set" oplog entry,
// and (on a clean startup) assigns UUIDs to non-replicated collections and
// sets the featureCompatibilityVersion. Returns any DBException as a Status.
Status ReplicationCoordinatorExternalStateImpl::initializeReplSetStorage(OperationContext* opCtx,
                                                                         const BSONObj& config) {
    try {
        createOplog(opCtx);

        writeConflictRetry(
            opCtx,
            "initiate oplog entry",
            NamespaceString::kRsOplogNamespace.toString(),
            [this, &opCtx, &config] {
                Lock::GlobalWrite globalWrite(opCtx);
                WriteUnitOfWork wuow(opCtx);
                Helpers::putSingleton(opCtx, configCollectionName, config);
                const auto msgObj = BSON("msg"
                                         << "initiating set");
                _service->getOpObserver()->onOpMessage(opCtx, msgObj);
                wuow.commit();
                // ReplSetTest assumes that immediately after the replSetInitiate
                // command returns, it can allow other nodes to initial sync with
                // no retries and they will succeed. Unfortunately, initial sync
                // will fail if it finds its sync source has an empty oplog.
                // Thus, we need to wait here until the seed document is visible
                // in our oplog.
                AutoGetCollection oplog(opCtx, NamespaceString::kRsOplogNamespace, MODE_IS);
                waitForAllEarlierOplogWritesToBeVisible(opCtx);
            });

        // Set UUIDs for all non-replicated collections. This is necessary for
        // independent replica sets and config server replica sets started with no
        // data files, because collections in "local" are created before the
        // featureCompatibilityVersion is set to 3.6 and therefore lack UUIDs.
        // ShardServers are excluded here: they start at FCV 3.4 by default and
        // receive UUIDs only once the config server explicitly raises the
        // cluster's FCV to 3.6. This addition happens only on the primary;
        // secondaries get UUIDs for non-replicated collections during
        // InitialSync. A node added to an FCV-3.6 shard with --shardsvr likewise
        // starts at 3.4 and gets its UUIDs during InitialSync, because it is a
        // secondary.
        if (serverGlobalParams.clusterRole != ClusterRole::ShardServer &&
            FeatureCompatibilityVersion::isCleanStartUp()) {
            auto schemaStatus = updateUUIDSchemaVersionNonReplicated(opCtx, true);
            if (!schemaStatus.isOK()) {
                return schemaStatus;
            }
        }
        FeatureCompatibilityVersion::setIfCleanStartup(opCtx, _storageInterface);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
    return Status::OK();
}
// Persists the replica set configuration document to the local config
// collection, replacing any existing singleton. DBExceptions (including
// interruption) are converted into the returned Status.
Status ReplicationCoordinatorExternalStateImpl::storeLocalConfigDocument(OperationContext* opCtx,
                                                                         const BSONObj& config) {
    try {
        writeConflictRetry(
            opCtx, "save replica set config", configCollectionName, [opCtx, &config] {
                // Exclusive database lock serializes config writes.
                Lock::DBLock dbLock(opCtx, configDatabaseName, MODE_X);
                Helpers::putSingleton(opCtx, configCollectionName, config);
            });
        return Status::OK();
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
}
// Implements the dropIndexes command for 'nss'. Validates primary-ness and
// that the target is a real collection (not a view), ensures no background
// operation or index build is in progress, then delegates the actual drop to
// wrappedRun() inside a WriteUnitOfWork. Results are appended to 'result'.
Status dropIndexes(OperationContext* opCtx,
                   const NamespaceString& nss,
                   const BSONObj& cmdObj,
                   BSONObjBuilder* result) {
    return writeConflictRetry(opCtx, "dropIndexes", nss.db(), [opCtx, &nss, &cmdObj, result] {
        AutoGetCollection autoColl(opCtx, nss, MODE_IX, MODE_X);

        // Replicated writes are only legal on a primary.
        bool userInitiatedWritesAndNotPrimary = opCtx->writesAreReplicated() &&
            !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, nss);
        if (userInitiatedWritesAndNotPrimary) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping indexes in " << nss);
        }

        if (!serverGlobalParams.quiet.load()) {
            LOG(0) << "CMD: dropIndexes " << nss << ": " << cmdObj[kIndexFieldName].toString(false);
        }

        // If db/collection does not exist, short circuit and return.
        Database* db = autoColl.getDb();
        Collection* collection = autoColl.getCollection();
        if (!collection) {
            // Views have no indexes; report a dedicated error for them.
            if (db && ViewCatalog::get(db)->lookup(opCtx, nss.ns())) {
                return Status(ErrorCodes::CommandNotSupportedOnView,
                              str::stream() << "Cannot drop indexes on view " << nss);
            }
            return Status(ErrorCodes::NamespaceNotFound, "ns not found");
        }

        // Concurrent background operations or index builds on this collection
        // make dropping indexes unsafe.
        BackgroundOperation::assertNoBgOpInProgForNs(nss);
        IndexBuildsCoordinator::get(opCtx)->assertNoIndexBuildInProgForCollection(
            collection->uuid().get());

        WriteUnitOfWork wunit(opCtx);
        OldClientContext ctx(opCtx, nss.ns());
        Status status = wrappedRun(opCtx, collection, cmdObj, result);
        if (!status.isOK()) {
            return status;
        }
        wunit.commit();
        return Status::OK();
    });
}
// Loads the replica set configuration singleton from the local config
// collection. Returns NoMatchingDocument when none is stored; DBExceptions
// become the returned error Status.
StatusWith<BSONObj> ReplicationCoordinatorExternalStateImpl::loadLocalConfigDocument(
    OperationContext* opCtx) {
    try {
        return writeConflictRetry(opCtx, "load replica set config", configCollectionName, [opCtx] {
            BSONObj doc;
            const bool found = Helpers::getSingleton(opCtx, configCollectionName, doc);
            if (found) {
                return StatusWith<BSONObj>(doc);
            }
            return StatusWith<BSONObj>(
                ErrorCodes::NoMatchingDocument,
                str::stream() << "Did not find replica set configuration document in "
                              << configCollectionName);
        });
    } catch (const DBException& ex) {
        return StatusWith<BSONObj>(ex.toStatus());
    }
}
// Test-fixture cleanup: drops every database, including "local", and then
// closes all database handles so the holder releases its bookkeeping.
void ServiceContextMongoDTest::_dropAllDBs(OperationContext* opCtx) {
    dropAllDatabasesExceptLocal(opCtx);

    Lock::GlobalWrite lk(opCtx);

    // "local" needs an explicit drop since the call above skips it.
    AutoGetDb autoDBLocal(opCtx, "local", MODE_X);
    const auto localDB = autoDBLocal.getDb();
    if (localDB != nullptr) {
        writeConflictRetry(opCtx, "_dropAllDBs", "local", [&] {
            // Do not wrap in a WriteUnitOfWork until SERVER-17103 is addressed.
            autoDBLocal.getDb()->dropDatabase(opCtx, localDB);
        });
    }

    // dropAllDatabasesExceptLocal() does not close empty databases, and the
    // holder keeps resources for them; closeAll() releases those so nothing
    // leaks at process exit.
    dbHolder().closeAll(opCtx, "all databases dropped");
}
// Loads the persisted last-vote singleton used by the election protocol.
// Returns NoMatchingDocument when no vote has ever been recorded;
// DBExceptions become the returned error Status.
StatusWith<LastVote> ReplicationCoordinatorExternalStateImpl::loadLocalLastVoteDocument(
    OperationContext* opCtx) {
    try {
        return writeConflictRetry(
            opCtx, "load replica set lastVote", lastVoteCollectionName, [opCtx] {
                BSONObj lastVoteObj;
                const bool found =
                    Helpers::getSingleton(opCtx, lastVoteCollectionName, lastVoteObj);
                if (!found) {
                    return StatusWith<LastVote>(
                        ErrorCodes::NoMatchingDocument,
                        str::stream() << "Did not find replica set lastVote document in "
                                      << lastVoteCollectionName);
                }
                return LastVote::readFromLastVote(lastVoteObj);
            });
    } catch (const DBException& ex) {
        return StatusWith<LastVote>(ex.toStatus());
    }
}
// Writes a periodic no-op oplog entry, but only when this node is primary,
// no other write has advanced the oplog since the last check, and the
// writePeriodicNoops parameter is enabled. Silently skips if the global lock
// cannot be acquired quickly (e.g. during step-down).
void NoopWriter::_writeNoop(OperationContext* opCtx) {
    // Use GlobalLock + lockMMAPV1Flush instead of DBLock to allow return when the lock is not
    // available. It may happen when the primary steps down and a shared global lock is acquired.
    Lock::GlobalLock lock(
        opCtx, MODE_IX, Date_t::now() + Milliseconds(1), Lock::InterruptBehavior::kLeaveUnlocked);
    if (!lock.isLocked()) {
        LOG(1) << "Global lock is not available skipping noopWrite";
        return;
    }
    opCtx->lockState()->lockMMAPV1Flush();

    auto replCoord = ReplicationCoordinator::get(opCtx);
    // Writability of "admin" is a proxy for being the primary.
    if (!replCoord->canAcceptWritesForDatabase(opCtx, "admin")) {
        LOG(1) << "Not a primary, skipping the noop write";
        return;
    }

    auto lastAppliedOpTime = replCoord->getMyLastAppliedOpTime();

    // _lastKnownOpTime is not protected by lock as it is used only by one thread.
    if (lastAppliedOpTime != _lastKnownOpTime) {
        // Real writes happened since the last pass; no noop is needed.
        LOG(1) << "Not scheduling a noop write. Last known OpTime: " << _lastKnownOpTime
               << " != last primary OpTime: " << lastAppliedOpTime;
    } else {
        if (writePeriodicNoops.load()) {
            const auto logLevel = getTestCommandsEnabled() ? 0 : 1;
            LOG(logLevel)
                << "Writing noop to oplog as there has been no writes to this replica set in over "
                << _writeInterval;
            writeConflictRetry(
                opCtx, "writeNoop", NamespaceString::kRsOplogNamespace.ns(), [&opCtx] {
                    WriteUnitOfWork uow(opCtx);
                    opCtx->getClient()->getServiceContext()->getOpObserver()->onOpMessage(opCtx,
                                                                                          kMsgObj);
                    uow.commit();
                });
        }
    }

    // Re-read after the (possible) noop write so the next pass compares
    // against the freshest optime.
    _lastKnownOpTime = replCoord->getMyLastAppliedOpTime();
    LOG(1) << "Set last known op time to " << _lastKnownOpTime;
}
// Snapshots the session transactions table for a chunk migration: builds one
// oplog iterator per session with a recorded write, then writes a no-op
// barrier entry and waits for it to be majority committed so the snapshot is
// stable before cloning begins.
SessionCatalogMigrationSource::SessionCatalogMigrationSource(OperationContext* opCtx,
                                                             NamespaceString ns)
    : _ns(std::move(ns)),
      _rollbackIdAtInit(repl::ReplicationProcess::get(opCtx)->getRollbackID()) {
    // Exclude entries for transaction.
    Query query;
    // Sort is not needed for correctness. This is just for making it easier to write deterministic
    // tests.
    query.sort(BSON("_id" << 1));
    DBDirectClient client(opCtx);
    auto cursor = client.query(NamespaceString::kSessionTransactionsTableNamespace, query);
    while (cursor->more()) {
        auto nextSession = SessionTxnRecord::parse(
            IDLParserErrorContext("Session migration cloning"), cursor->next());
        // Sessions with a null lastWriteOpTime have nothing to migrate.
        if (!nextSession.getLastWriteOpTime().isNull()) {
            _sessionOplogIterators.push_back(
                stdx::make_unique<SessionOplogIterator>(std::move(nextSession), _rollbackIdAtInit));
        }
    }

    {
        // Write a marker oplog entry that acts as a majority-commit barrier
        // for the snapshot taken above.
        AutoGetCollection autoColl(opCtx, NamespaceString::kRsOplogNamespace, MODE_IX);
        writeConflictRetry(
            opCtx,
            "session migration initialization majority commit barrier",
            NamespaceString::kRsOplogNamespace.ns(),
            [&] {
                const auto message = BSON("sessionMigrateCloneStart" << _ns.ns());
                WriteUnitOfWork wuow(opCtx);
                opCtx->getClient()->getServiceContext()->getOpObserver()->onInternalOpMessage(
                    opCtx, _ns, {}, {}, message);
                wuow.commit();
            });
    }

    // Block until the barrier entry replicates to a majority.
    auto opTimeToWait = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp();
    WriteConcernResult result;
    WriteConcernOptions majority(
        WriteConcernOptions::kMajority, WriteConcernOptions::SyncMode::UNSET, 0);
    uassertStatusOK(waitForWriteConcern(opCtx, opTimeToWait, majority, &result));
}
// Persists 'lastVote' to the local last-vote collection, but only if it is
// newer (higher term) than the stored document, or if no document exists.
// The read-compare-write sequence runs under the database X lock to make it
// atomic; the write is then made durable before returning success.
Status ReplicationCoordinatorExternalStateImpl::storeLocalLastVoteDocument(
    OperationContext* opCtx, const LastVote& lastVote) {
    BSONObj lastVoteObj = lastVote.toBSON();
    try {
        Status status =
            writeConflictRetry(opCtx, "save replica set lastVote", lastVoteCollectionName, [&] {
                Lock::DBLock dbWriteLock(opCtx, lastVoteDatabaseName, MODE_X);

                // If there is no last vote document, we want to store one. Otherwise, we only want
                // to replace it if the new last vote document would have a higher term. We both
                // check the term of the current last vote document and insert the new document
                // under the DBLock to synchronize the two operations.
                BSONObj result;
                bool exists = Helpers::getSingleton(opCtx, lastVoteCollectionName, result);
                if (!exists) {
                    Helpers::putSingleton(opCtx, lastVoteCollectionName, lastVoteObj);
                } else {
                    StatusWith<LastVote> oldLastVoteDoc = LastVote::readFromLastVote(result);
                    if (!oldLastVoteDoc.isOK()) {
                        return oldLastVoteDoc.getStatus();
                    }
                    if (lastVote.getTerm() > oldLastVoteDoc.getValue().getTerm()) {
                        Helpers::putSingleton(opCtx, lastVoteCollectionName, lastVoteObj);
                    }
                }
                return Status::OK();
            });

        if (!status.isOK()) {
            return status;
        }

        // The vote must be durable before we respond in the election protocol.
        opCtx->recoveryUnit()->waitUntilDurable();

        return Status::OK();
    } catch (const DBException& ex) {
        return ex.toStatus();
    }
}
// Deletes up to 'maxToDelete' documents belonging to the orphaned chunk
// 'range', scanning a shard-key-prefixed index. Returns the number of
// documents deleted, or an error if no suitable index exists (or it was
// dropped mid-operation). Each delete runs in its own WriteUnitOfWork with
// the executor saved/restored around it.
StatusWith<int> CollectionRangeDeleter::_doDeletion(OperationContext* opCtx,
                                                    Collection* collection,
                                                    BSONObj const& keyPattern,
                                                    ChunkRange const& range,
                                                    int maxToDelete) {
    invariant(collection != nullptr);
    invariant(!isEmpty());
    auto const& nss = collection->ns();

    // The IndexChunk has a keyPattern that may apply to more than one index - we need to
    // select the index and get the full index keyPattern here.
    auto catalog = collection->getIndexCatalog();
    const IndexDescriptor* idx = catalog->findShardKeyPrefixedIndex(opCtx, keyPattern, false);
    if (!idx) {
        std::string msg = str::stream() << "Unable to find shard key index for "
                                        << keyPattern.toString() << " in " << nss.ns();
        LOG(0) << msg;
        return {ErrorCodes::InternalError, msg};
    }

    // Extend bounds to match the index we found.
    const KeyPattern indexKeyPattern(idx->keyPattern());
    const auto extend = [&](const auto& key) {
        return Helpers::toKeyFormat(indexKeyPattern.extendRangeBound(key, false));
    };
    const auto min = extend(range.getMin());
    const auto max = extend(range.getMax());

    LOG(1) << "begin removal of " << min << " to " << max << " in " << nss.ns();

    // Re-look up the index by name to detect a concurrent drop.
    const auto indexName = idx->indexName();
    IndexDescriptor* descriptor = collection->getIndexCatalog()->findIndexByName(opCtx, indexName);
    if (!descriptor) {
        std::string msg = str::stream() << "shard key index with name " << indexName << " on '"
                                        << nss.ns() << "' was dropped";
        LOG(0) << msg;
        return {ErrorCodes::InternalError, msg};
    }

    // Optionally archive deleted documents (moveChunk paranoia mode).
    boost::optional<Helpers::RemoveSaver> saver;
    if (serverGlobalParams.moveParanoia) {
        saver.emplace("moveChunk", nss.ns(), "cleaning");
    }

    auto halfOpen = BoundInclusion::kIncludeStartKeyOnly;
    auto manual = PlanExecutor::YIELD_MANUAL;
    auto forward = InternalPlanner::FORWARD;
    auto fetch = InternalPlanner::IXSCAN_FETCH;
    auto exec = InternalPlanner::indexScan(
        opCtx, collection, descriptor, min, max, halfOpen, manual, forward, fetch);

    int numDeleted = 0;
    do {
        RecordId rloc;
        BSONObj obj;
        PlanExecutor::ExecState state = exec->getNext(&obj, &rloc);
        if (state == PlanExecutor::IS_EOF) {
            break;
        }
        if (state == PlanExecutor::FAILURE || state == PlanExecutor::DEAD) {
            warning() << PlanExecutor::statestr(state) << " - cursor error while trying to delete "
                      << redact(min) << " to " << redact(max) << " in " << nss << ": "
                      << redact(WorkingSetCommon::toStatusString(obj))
                      << ", stats: " << Explain::getWinningPlanStats(exec.get());
            break;
        }
        invariant(PlanExecutor::ADVANCED == state);

        // Save the executor across the write so the delete cannot invalidate
        // its position.
        exec->saveState();
        writeConflictRetry(opCtx, "delete range", nss.ns(), [&] {
            WriteUnitOfWork wuow(opCtx);
            if (saver) {
                uassertStatusOK(saver->goingToDelete(obj));
            }
            collection->deleteDocument(opCtx, kUninitializedStmtId, rloc, nullptr, true);
            wuow.commit();
        });
        try {
            exec->restoreState();
        } catch (const DBException& ex) {
            warning() << "error restoring cursor state while trying to delete " << redact(min)
                      << " to " << redact(max) << " in " << nss
                      << ", stats: " << Explain::getWinningPlanStats(exec.get()) << ": "
                      << redact(ex.toStatus());
            break;
        }
        ShardingStatistics::get(opCtx).countDocsDeletedOnDonor.addAndFetch(1);
    } while (++numDeleted < maxToDelete);
    return numDeleted;
}
// Applies a "create" entry during oplog replay (applyOps). When a UUID is
// supplied, first reconciles the catalog: a same-named collection with a
// different UUID is renamed out of the way to a temporary name, and an
// existing collection with the target UUID is renamed into place; otherwise
// the UUID is injected into the create command and a fresh collection is
// created. Caller must hold the database X lock.
Status createCollectionForApplyOps(OperationContext* opCtx,
                                   const std::string& dbName,
                                   const BSONElement& ui,
                                   const BSONObj& cmdObj,
                                   const BSONObj& idIndex) {
    invariant(opCtx->lockState()->isDbLockedForMode(dbName, MODE_X));
    auto db = dbHolder().get(opCtx, dbName);
    const NamespaceString newCollName(Command::parseNsCollectionRequired(dbName, cmdObj));
    auto newCmd = cmdObj;

    // If a UUID is given, see if we need to rename a collection out of the way, and whether the
    // collection already exists under a different name. If so, rename it into place. As this is
    // done during replay of the oplog, the operations do not need to be atomic, just idempotent.
    // We need to do the renaming part in a separate transaction, as we cannot transactionally
    // create a database on MMAPv1, which could result in createCollection failing if the database
    // does not yet exist.
    if (ui.ok()) {
        // Return an optional, indicating whether we need to early return (if the collection
        // already exists, or in case of an error).
        using Result = boost::optional<Status>;
        auto result =
            writeConflictRetry(opCtx, "createCollectionForApplyOps", newCollName.ns(), [&] {
                WriteUnitOfWork wunit(opCtx);
                // Options need the field to be named "uuid", so parse/recreate.
                auto uuid = uassertStatusOK(UUID::parse(ui));
                uassert(ErrorCodes::InvalidUUID,
                        "Invalid UUID in applyOps create command: " + uuid.toString(),
                        uuid.isRFC4122v4());

                auto& catalog = UUIDCatalog::get(opCtx);
                auto currentName = catalog.lookupNSSByUUID(uuid);
                OpObserver* opObserver = getGlobalServiceContext()->getOpObserver();
                // Idempotency: the collection already exists with the right
                // name and UUID.
                if (currentName == newCollName)
                    return Result(Status::OK());

                // In the case of oplog replay, a future command may have created or renamed a
                // collection with that same name. In that case, renaming this future collection
                // to a random temporary name is correct: once all entries are replayed no
                // temporary names will remain. On MMAPv1 the rename can result in index names
                // that are too long. However this should only happen for initial sync and
                // "resync collection" for rollback, so we can let the error propagate resulting
                // in an abort and restart of the initial sync or result in rollback to fassert,
                // requiring a resync of that node.
                const bool stayTemp = true;
                if (auto futureColl = db ? db->getCollection(opCtx, newCollName) : nullptr) {
                    auto tmpNameResult = db->makeUniqueCollectionNamespace(opCtx, "tmp%%%%%");
                    if (!tmpNameResult.isOK()) {
                        return Result(Status(tmpNameResult.getStatus().code(),
                                             str::stream() << "Cannot generate temporary "
                                                              "collection namespace for applyOps "
                                                              "create command: collection: "
                                                           << newCollName.ns()
                                                           << ". error: "
                                                           << tmpNameResult.getStatus().reason()));
                    }
                    const auto& tmpName = tmpNameResult.getValue();
                    Status status =
                        db->renameCollection(opCtx, newCollName.ns(), tmpName.ns(), stayTemp);
                    if (!status.isOK())
                        return Result(status);
                    opObserver->onRenameCollection(opCtx,
                                                   newCollName,
                                                   tmpName,
                                                   futureColl->uuid(),
                                                   /*dropTarget*/ false,
                                                   /*dropTargetUUID*/ {},
                                                   stayTemp);
                }

                // If the collection with the requested UUID already exists, but with a different
                // name, just rename it to 'newCollName'.
                if (catalog.lookupCollectionByUUID(uuid)) {
                    Status status =
                        db->renameCollection(opCtx, currentName.ns(), newCollName.ns(), stayTemp);
                    if (!status.isOK())
                        return Result(status);
                    opObserver->onRenameCollection(opCtx,
                                                   currentName,
                                                   newCollName,
                                                   uuid,
                                                   /*dropTarget*/ false,
                                                   /*dropTargetUUID*/ {},
                                                   stayTemp);
                    wunit.commit();
                    return Result(Status::OK());
                }

                // A new collection with the specific UUID must be created, so add the UUID to the
                // creation options. Regular user collection creation commands cannot do this.
                auto uuidObj = uuid.toBSON();
                newCmd = cmdObj.addField(uuidObj.firstElement());
                wunit.commit();
                return Result(boost::none);
            });
        if (result) {
            return *result;
        }
    }
    return createCollection(
        opCtx, newCollName, newCmd, idIndex, CollectionOptions::parseForStorage);
}
// Drops a collection or a view named 'collectionName'. Appends "ns" and (for
// collections) "nIndexesWas" to 'result'. 'systemCollectionMode' selects
// whether system collections may be dropped; 'dropOpTime' is forwarded to the
// drop for oplog-replay use.
Status dropCollection(OperationContext* opCtx,
                      const NamespaceString& collectionName,
                      BSONObjBuilder& result,
                      const repl::OpTime& dropOpTime,
                      DropCollectionSystemCollectionMode systemCollectionMode) {
    if (!serverGlobalParams.quiet.load()) {
        log() << "CMD: drop " << collectionName;
    }
    return writeConflictRetry(opCtx, "drop", collectionName.ns(), [&] {
        AutoGetDb autoDb(opCtx, collectionName.db(), MODE_X);
        Database* const db = autoDb.getDb();
        Collection* coll = db ? db->getCollection(opCtx, collectionName) : nullptr;
        // The name may refer to a view rather than a collection.
        auto view =
            db && !coll ? db->getViewCatalog()->lookup(opCtx, collectionName.ns()) : nullptr;

        if (MONGO_FAIL_POINT(hangDuringDropCollection)) {
            log() << "hangDuringDropCollection fail point enabled. Blocking until fail point is "
                     "disabled.";
            MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangDuringDropCollection);
        }

        if (!db || (!coll && !view)) {
            return Status(ErrorCodes::NamespaceNotFound, "ns not found");
        }

        const bool shardVersionCheck = true;
        OldClientContext context(opCtx, collectionName.ns(), shardVersionCheck);

        // Replicated writes are only legal on a primary.
        bool userInitiatedWritesAndNotPrimary = opCtx->writesAreReplicated() &&
            !repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, collectionName);
        if (userInitiatedWritesAndNotPrimary) {
            return Status(ErrorCodes::NotMaster,
                          str::stream() << "Not primary while dropping collection "
                                        << collectionName);
        }

        WriteUnitOfWork wunit(opCtx);
        if (!result.hasField("ns")) {
            result.append("ns", collectionName.ns());
        }

        if (coll) {
            invariant(!view);
            // Capture the index count before the drop destroys the catalog entry.
            int numIndexes = coll->getIndexCatalog()->numIndexesTotal(opCtx);
            BackgroundOperation::assertNoBgOpInProgForNs(collectionName.ns());
            Status s = systemCollectionMode ==
                    DropCollectionSystemCollectionMode::kDisallowSystemCollectionDrops
                ? db->dropCollection(opCtx, collectionName.ns(), dropOpTime)
                : db->dropCollectionEvenIfSystem(opCtx, collectionName, dropOpTime);
            if (!s.isOK()) {
                return s;
            }
            result.append("nIndexesWas", numIndexes);
        } else {
            invariant(view);
            Status status = db->dropView(opCtx, collectionName.ns());
            if (!status.isOK()) {
                return status;
            }
        }
        wunit.commit();
        return Status::OK();
    });
}
// Drops the database 'dbName' in two phases: (1) under the global lock, drop
// every collection (recording drop-pending collections' optimes), then (2)
// after waiting — without any locks held — for those drops to be majority
// replicated, drop the database itself. Drop-pending state is reset by guards
// on any failure path.
Status dropDatabase(OperationContext* opCtx, const std::string& dbName) {
    uassert(ErrorCodes::IllegalOperation,
            "Cannot drop a database in read-only mode",
            !storageGlobalParams.readOnly);

    // TODO (Kal): OldClientContext legacy, needs to be removed
    {
        CurOp::get(opCtx)->ensureStarted();
        stdx::lock_guard<Client> lk(*opCtx->getClient());
        CurOp::get(opCtx)->setNS_inlock(dbName);
    }

    auto replCoord = repl::ReplicationCoordinator::get(opCtx);
    std::size_t numCollectionsToDrop = 0;

    // We have to wait for the last drop-pending collection to be removed if there are no
    // collections to drop.
    repl::OpTime latestDropPendingOpTime;

    using Result = boost::optional<Status>;
    // Get an optional result--if it's there, early return; otherwise, wait for collections to
    // drop.
    auto result = writeConflictRetry(opCtx, "dropDatabase_collection", dbName, [&] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        Database* const db = autoDB.getDb();
        if (!db) {
            return Result(Status(ErrorCodes::NamespaceNotFound,
                                 str::stream() << "Could not drop database " << dbName
                                               << " because it does not exist"));
        }

        bool userInitiatedWritesAndNotPrimary =
            opCtx->writesAreReplicated() && !replCoord->canAcceptWritesForDatabase(opCtx, dbName);
        if (userInitiatedWritesAndNotPrimary) {
            return Result(
                Status(ErrorCodes::NotMaster,
                       str::stream() << "Not primary while dropping database " << dbName));
        }

        log() << "dropDatabase " << dbName << " - starting";
        db->setDropPending(opCtx, true);

        // If Database::dropCollectionEvenIfSystem() fails, we should reset the drop-pending
        // state on Database.
        auto dropPendingGuard = MakeGuard([&db, opCtx] { db->setDropPending(opCtx, false); });

        for (auto collection : *db) {
            const auto& nss = collection->ns();
            if (nss.isDropPendingNamespace() && replCoord->isReplEnabled() &&
                opCtx->writesAreReplicated()) {
                // Already dropped but awaiting replication; just track its optime.
                log() << "dropDatabase " << dbName << " - found drop-pending collection: " << nss;
                latestDropPendingOpTime = std::max(
                    latestDropPendingOpTime, uassertStatusOK(nss.getDropPendingNamespaceOpTime()));
                continue;
            }
            if (replCoord->isOplogDisabledFor(opCtx, nss) || nss.isSystemDotIndexes()) {
                continue;
            }
            log() << "dropDatabase " << dbName << " - dropping collection: " << nss;
            WriteUnitOfWork wunit(opCtx);
            fassertStatusOK(40476, db->dropCollectionEvenIfSystem(opCtx, nss));
            wunit.commit();
            numCollectionsToDrop++;
        }
        dropPendingGuard.Dismiss();

        // If there are no collection drops to wait for, we complete the drop database operation.
        if (numCollectionsToDrop == 0U && latestDropPendingOpTime.isNull()) {
            return Result(_finishDropDatabase(opCtx, dbName, db));
        }

        return Result(boost::none);
    });

    if (result) {
        return *result;
    }

    // If waitForWriteConcern() returns an error or throws an exception, we should reset the
    // drop-pending state on Database.
    auto dropPendingGuardWhileAwaitingReplication = MakeGuard([dbName, opCtx] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        if (auto db = autoDB.getDb()) {
            db->setDropPending(opCtx, false);
        }
    });

    {
        // Holding of any locks is disallowed while awaiting replication because this can
        // potentially block for long time while doing network activity.
        //
        // Even though dropDatabase() does not explicitly acquire any locks before awaiting
        // replication, it is possible that the caller of this function may already have acquired
        // a lock. The applyOps command is an example of a dropDatabase() caller that does this.
        // Therefore, we have to release any locks using a TempRelease RAII object.
        //
        // TODO: Remove the use of this TempRelease object when SERVER-29802 is completed.
        // The work in SERVER-29802 will adjust the locking rules around applyOps operations and
        // dropDatabase is expected to be one of the operations where we expect to no longer
        // acquire the global lock.
        Lock::TempRelease release(opCtx->lockState());

        if (numCollectionsToDrop > 0U) {
            auto status =
                replCoord->awaitReplicationOfLastOpForClient(opCtx, kDropDatabaseWriteConcern)
                    .status;
            if (!status.isOK()) {
                return Status(status.code(),
                              str::stream() << "dropDatabase " << dbName << " failed waiting for "
                                            << numCollectionsToDrop
                                            << " collection drops to replicate: "
                                            << status.reason());
            }

            log() << "dropDatabase " << dbName << " - successfully dropped "
                  << numCollectionsToDrop << " collections. dropping database";
        } else {
            // Only pre-existing drop-pending collections remain; wait on the
            // newest of their drop optimes.
            invariant(!latestDropPendingOpTime.isNull());
            auto status =
                replCoord
                    ->awaitReplication(opCtx, latestDropPendingOpTime, kDropDatabaseWriteConcern)
                    .status;
            if (!status.isOK()) {
                return Status(
                    status.code(),
                    str::stream()
                        << "dropDatabase " << dbName
                        << " failed waiting for pending collection drops (most recent drop optime: "
                        << latestDropPendingOpTime.toString() << ") to replicate: "
                        << status.reason());
            }

            log() << "dropDatabase " << dbName
                  << " - pending collection drops completed. dropping database";
        }
    }
    dropPendingGuardWhileAwaitingReplication.Dismiss();

    // Phase 2: reacquire locks and drop the (possibly already-gone) database.
    return writeConflictRetry(opCtx, "dropDatabase_database", dbName, [&] {
        Lock::GlobalWrite lk(opCtx);
        AutoGetDb autoDB(opCtx, dbName, MODE_X);
        if (auto db = autoDB.getDb()) {
            return _finishDropDatabase(opCtx, dbName, db);
        }
        return Status(ErrorCodes::NamespaceNotFound,
                      str::stream() << "Could not drop database " << dbName
                                    << " because it does not exist after dropping "
                                    << numCollectionsToDrop << " collection(s).");
    });
}
// Phase 2/3 of a bulk index build: drains the external sorter into the
// storage engine's bulk builder, one key per WriteUnitOfWork, optionally
// timestamping each write with the cluster time, then commits the builder.
// Overlong keys may be skipped and duplicate keys may be collected into
// 'dupsToDrop' instead of failing the build.
Status IndexAccessMethod::commitBulk(OperationContext* opCtx,
                                     std::unique_ptr<BulkBuilder> bulk,
                                     bool mayInterrupt,
                                     bool dupsAllowed,
                                     set<RecordId>* dupsToDrop,
                                     bool assignTimestamp) {
    // Do not track multikey path info for index builds; restore tracking on
    // exit only if it was enabled when we entered.
    ScopeGuard restartTracker =
        MakeGuard([opCtx] { MultikeyPathTracker::get(opCtx).startTrackingMultikeyPathInfo(); });
    if (!MultikeyPathTracker::get(opCtx).isTrackingMultikeyPathInfo()) {
        restartTracker.Dismiss();
    }
    MultikeyPathTracker::get(opCtx).stopTrackingMultikeyPathInfo();

    Timer timer;
    std::unique_ptr<BulkBuilder::Sorter::Iterator> i(bulk->_sorter->done());

    stdx::unique_lock<Client> lk(*opCtx->getClient());
    ProgressMeterHolder pm(
        CurOp::get(opCtx)->setMessage_inlock("Index Bulk Build: (2/3) btree bottom up",
                                             "Index: (2/3) BTree Bottom Up Progress",
                                             bulk->_keysInserted,
                                             10));
    lk.unlock();

    std::unique_ptr<SortedDataBuilderInterface> builder;

    // Set the multikey flag (if warranted) and create the builder in their own
    // retried transaction before streaming keys.
    writeConflictRetry(opCtx, "setting index multikey flag", "", [&] {
        WriteUnitOfWork wunit(opCtx);
        if (bulk->_everGeneratedMultipleKeys || isMultikeyFromPaths(bulk->_indexMultikeyPaths)) {
            _btreeState->setMultikey(opCtx, bulk->_indexMultikeyPaths);
        }
        builder.reset(_newInterface->getBulkBuilder(opCtx, dupsAllowed));
        if (assignTimestamp) {
            fassertStatusOK(50705,
                            opCtx->recoveryUnit()->setTimestamp(
                                LogicalClock::get(opCtx)->getClusterTime().asTimestamp()));
        }
        wunit.commit();
    });

    while (i->more()) {
        if (mayInterrupt) {
            opCtx->checkForInterrupt();
        }
        WriteUnitOfWork wunit(opCtx);
        // Improve performance in the btree-building phase by disabling rollback tracking.
        // This avoids copying all the written bytes to a buffer that is only used to roll back.
        // Note that this is safe to do, as this entire index-build-in-progress will be cleaned
        // up by the index system.
        opCtx->recoveryUnit()->setRollbackWritesDisabled();

        // Get the next datum and add it to the builder.
        BulkBuilder::Sorter::Data d = i->next();
        Status status = builder->addKey(d.first, d.second);

        if (!status.isOK()) {
            // Overlong key that's OK to skip?
            if (status.code() == ErrorCodes::KeyTooLong && ignoreKeyTooLong(opCtx)) {
                continue;
            }

            // Check if this is a duplicate that's OK to skip
            if (status.code() == ErrorCodes::DuplicateKey) {
                invariant(!dupsAllowed);  // shouldn't be getting DupKey errors if dupsAllowed.
                if (dupsToDrop) {
                    dupsToDrop->insert(d.second);
                    continue;
                }
            }
            return status;
        }

        // If we're here either it's a dup and we're cool with it or the addKey went just fine.
        pm.hit();
        if (assignTimestamp) {
            fassertStatusOK(50704,
                            opCtx->recoveryUnit()->setTimestamp(
                                LogicalClock::get(opCtx)->getClusterTime().asTimestamp()));
        }
        wunit.commit();
    }
    pm.finished();

    {
        stdx::lock_guard<Client> lk(*opCtx->getClient());
        CurOp::get(opCtx)->setMessage_inlock("Index Bulk Build: (3/3) btree-middle",
                                             "Index: (3/3) BTree Middle Progress");
    }

    LOG(timer.seconds() > 10 ? 0 : 1) << "\t done building bottom layer, going to commit";

    // Timestamp the final commit of the builder when requested.
    std::unique_ptr<TimestampBlock> tsBlock;
    if (assignTimestamp) {
        tsBlock = stdx::make_unique<TimestampBlock>(
            opCtx, LogicalClock::get(opCtx)->getClusterTime().asTimestamp());
    }
    builder->commit(mayInterrupt);
    return Status::OK();
}
/**
 * Applies, in batches, the writes that were intercepted into the side-writes table while this
 * index was being built, inserting them into the index and deleting them from the table.
 *
 * 'readSource' optionally pins the snapshot timestamp so that no drained write is read at a
 * timestamp earlier than its original write; kUnset means "use the current read source".
 * Returns the first non-OK status from applying a write, or Status::OK() once the table has
 * been drained to EOF. May yield locks between batches (only if holding intent locks).
 */
Status IndexBuildInterceptor::drainWritesIntoIndex(OperationContext* opCtx,
                                                   const InsertDeleteOptions& options,
                                                   RecoveryUnit::ReadSource readSource) {
    // Each batch opens its own WriteUnitOfWork below, so we must not already be in one.
    invariant(!opCtx->lockState()->inAWriteUnitOfWork());

    // Callers may request to read at a specific timestamp so that no drained writes are
    // timestamped earlier than their original write timestamp. Also ensure that leaving this
    // function resets the ReadSource to its original value.
    auto resetReadSourceGuard =
        makeGuard([ opCtx, prevReadSource = opCtx->recoveryUnit()->getTimestampReadSource() ] {
            opCtx->recoveryUnit()->abandonSnapshot();
            opCtx->recoveryUnit()->setTimestampReadSource(prevReadSource);
        });

    if (readSource != RecoveryUnit::ReadSource::kUnset) {
        // Switch to the requested read source; the guard restores the previous one on exit.
        opCtx->recoveryUnit()->abandonSnapshot();
        opCtx->recoveryUnit()->setTimestampReadSource(readSource);
    } else {
        // Read source unchanged; nothing to restore on exit.
        resetReadSourceGuard.dismiss();
    }

    // These are used for logging only.
    int64_t totalDeleted = 0;
    int64_t totalInserted = 0;
    Timer timer;

    const int64_t appliedAtStart = _numApplied;

    // Set up the progress meter. This will never be completely accurate, because more writes can
    // be read from the side writes table than are observed before draining.
    static const char* curopMessage = "Index Build: draining writes received during build";
    ProgressMeterHolder progress;
    {
        stdx::unique_lock<Client> lk(*opCtx->getClient());
        progress.set(CurOp::get(opCtx)->setProgress_inlock(curopMessage));
    }

    // Force the progress meter to log at the end of every batch. By default, the progress meter
    // only logs after a large number of calls to hit(), but since we batch inserts by up to
    // 1000 records, progress would rarely be displayed.
    progress->reset(_sideWritesCounter.load() - appliedAtStart /* total */,
                    3 /* secondsBetween */,
                    1 /* checkInterval */);

    // Buffer operations into batches to insert per WriteUnitOfWork. Impose an upper limit on the
    // number of documents and the total size of the batch.
    const int32_t kBatchMaxSize = 1000;
    const int64_t kBatchMaxBytes = BSONObjMaxInternalSize;

    int64_t batchSizeBytes = 0;

    std::vector<SideWriteRecord> batch;
    batch.reserve(kBatchMaxSize);

    // Hold on to documents that would exceed the per-batch memory limit. Always insert this first
    // into the next batch.
    boost::optional<SideWriteRecord> stashed;

    auto cursor = _sideWritesTable->rs()->getCursor(opCtx);

    bool atEof = false;
    while (!atEof) {
        opCtx->checkForInterrupt();

        // Stashed records should be inserted into a batch first.
        if (stashed) {
            // A stashed record only exists when the previous batch flushed, so the batch is empty.
            invariant(batch.empty());

            batch.push_back(std::move(stashed.get()));
            stashed.reset();
        }

        auto record = cursor->next();

        if (record) {
            RecordId currentRecordId = record->id;
            BSONObj docOut = record->data.toBson().getOwned();

            // If the total batch size in bytes would be too large, stash this document and let
            // the current batch insert.
            int objSize = docOut.objsize();
            if (batchSizeBytes + objSize > kBatchMaxBytes) {
                invariant(!stashed);

                // Stash this document to be inserted in the next batch.
                stashed.emplace(currentRecordId, std::move(docOut));
            } else {
                batchSizeBytes += objSize;
                batch.emplace_back(currentRecordId, std::move(docOut));

                // Continue if there is more room in the batch.
                if (batch.size() < kBatchMaxSize) {
                    continue;
                }
            }
        } else {
            atEof = true;
            if (batch.empty())
                break;
        }

        invariant(!batch.empty());

        // Save the cursor state before the write (and the possible lock yield below).
        cursor->save();

        // If we are here, either we have reached the end of the table or the batch is full, so
        // insert everything in one WriteUnitOfWork, and delete each inserted document from the
        // side writes table.
        auto status =
            writeConflictRetry(opCtx, "index build drain", _indexCatalogEntry->ns(), [&] {
                WriteUnitOfWork wuow(opCtx);
                for (auto& operation : batch) {
                    auto status = _applyWrite(
                        opCtx, operation.second, options, &totalInserted, &totalDeleted);
                    if (!status.isOK()) {
                        return status;
                    }

                    // Delete the document from the table as soon as it has been inserted into
                    // the index. This ensures that no key is ever inserted twice and no keys
                    // are skipped.
                    _sideWritesTable->rs()->deleteRecord(opCtx, operation.first);
                }

                // For rollback to work correctly, these writes need to be timestamped. The
                // actual time is not important, as long as it not older than the most recent
                // visible side write.
                IndexTimestampHelper::setGhostCommitTimestampForWrite(
                    opCtx, NamespaceString(_indexCatalogEntry->ns()));

                wuow.commit();
                return Status::OK();
            });
        if (!status.isOK()) {
            return status;
        }

        progress->hit(batch.size());

        // Lock yielding will only happen if we are holding intent locks.
        _tryYield(opCtx);

        // Restore the cursor; the batch write (and possible yield) invalidated its position.
        cursor->restore();

        // Account for more writes coming in during a batch.
        progress->setTotalWhileRunning(_sideWritesCounter.loadRelaxed() - appliedAtStart);

        _numApplied += batch.size();
        batch.clear();
        batchSizeBytes = 0;
    }

    progress->finished();

    // Log at a higher level (0) only when this drain actually applied something.
    int logLevel = (_numApplied - appliedAtStart > 0) ? 0 : 1;
    LOG(logLevel) << "index build: drain applied " << (_numApplied - appliedAtStart)
                  << " side writes (inserted: " << totalInserted << ", deleted: " << totalDeleted
                  << ") for '" << _indexCatalogEntry->descriptor()->indexName() << "' in "
                  << timer.millis() << " ms";

    return Status::OK();
}
/**
 * Applies the operations in 'applyOpCmd' against database 'dbName', writing the outcome into
 * 'result'.
 *
 * Runs atomically (a single WriteUnitOfWork under the global write lock) only when the command
 * allows it ("allowAtomic", default true) and every op is CRUD-only; otherwise ops are applied
 * one by one. Returns NotMaster if user writes are not accepted for the database, a precondition
 * failure status when the "preCondition" check fails, or UnknownError with per-op results when
 * the atomic application throws.
 */
Status applyOps(OperationContext* opCtx,
                const std::string& dbName,
                const BSONObj& applyOpCmd,
                BSONObjBuilder* result) {
    bool atomicAllowed = false;
    uassertStatusOK(
        bsonExtractBooleanFieldWithDefault(applyOpCmd, "allowAtomic", true, &atomicAllowed));
    const auto crudOnly = _areOpsCrudOnly(applyOpCmd);
    const auto preconditionPresent = _hasPrecondition(applyOpCmd);
    const auto runAtomically = atomicAllowed && crudOnly;

    boost::optional<Lock::GlobalWrite> globalLock;
    boost::optional<Lock::DBLock> dbLock;
    // There's only one case where we are allowed to take the database lock instead of the
    // global lock - no preconditions; only CRUD ops; and non-atomic mode. Anything else takes
    // the global write lock.
    if (preconditionPresent || !crudOnly || atomicAllowed) {
        globalLock.emplace(opCtx);
    } else {
        dbLock.emplace(opCtx, dbName, MODE_X);
    }

    // Reject user-initiated writes when this node cannot accept writes for the database.
    if (opCtx->writesAreReplicated() &&
        !repl::getGlobalReplicationCoordinator()->canAcceptWritesForDatabase(opCtx, dbName)) {
        return Status(ErrorCodes::NotMaster,
                      str::stream() << "Not primary while applying ops to database " << dbName);
    }

    if (preconditionPresent) {
        auto preconditionStatus = _checkPrecondition(opCtx, applyOpCmd, result);
        if (!preconditionStatus.isOK()) {
            return preconditionStatus;
        }
    }

    int numApplied = 0;
    if (!runAtomically) {
        return _applyOps(opCtx, dbName, applyOpCmd, result, &numApplied);
    }

    // Perform write ops atomically inside a single WriteUnitOfWork.
    invariant(globalLock);
    try {
        writeConflictRetry(opCtx, "applyOps", dbName, [&] {
            BSONObjBuilder innerResult;
            WriteUnitOfWork wunit(opCtx);
            numApplied = 0;
            {
                // Suppress replication for atomic operations until end of applyOps.
                repl::UnreplicatedWritesBlock uwb(opCtx);
                uassertStatusOK(
                    _applyOps(opCtx, dbName, applyOpCmd, &innerResult, &numApplied));
            }

            // Generate oplog entry for all atomic ops collectively.
            if (opCtx->writesAreReplicated()) {
                // We want this applied atomically on slaves so we rewrite the oplog entry
                // without the pre-condition for speed.
                BSONObjBuilder rewrittenCmdBob;
                for (const auto& field : applyOpCmd) {
                    auto fieldName = field.fieldNameStringData();
                    if (fieldName == kPreconditionFieldName ||
                        fieldName == "bypassDocumentValidation") {
                        continue;
                    }
                    rewrittenCmdBob.append(field);
                }
                const BSONObj rewrittenCmd = rewrittenCmdBob.done();

                auto opObserver = getGlobalServiceContext()->getOpObserver();
                invariant(opObserver);
                opObserver->onApplyOps(opCtx, dbName, rewrittenCmd);
            }
            wunit.commit();
            result->appendElements(innerResult.obj());
        });
    } catch (const DBException& ex) {
        if (ex.getCode() == ErrorCodes::NamespaceNotFound) {
            // Retry in non-atomic mode, since MMAP cannot implicitly create a new database
            // within an active WriteUnitOfWork.
            return _applyOps(opCtx, dbName, applyOpCmd, result, &numApplied);
        }
        // The op that threw counts toward the applied total; report every op as failed.
        ++numApplied;
        BSONArrayBuilder resultsArray;
        for (int i = 0; i < numApplied; i++) {
            resultsArray.append(false);
        }
        result->append("applied", numApplied);
        result->append("code", ex.getCode());
        result->append("codeName", ErrorCodes::errorString(ErrorCodes::fromInt(ex.getCode())));
        result->append("errmsg", ex.what());
        result->append("results", resultsArray.arr());
        return Status(ErrorCodes::UnknownError, ex.what());
    }

    return Status::OK();
}