OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationContext* txn,
                                                                      bool isV1ElectionProtocol) {
    invariant(txn->lockState()->isW());

    // Clear the appliedThrough marker so on startup we'll use the top of the oplog. This must be
    // done before we add anything to our oplog.
    invariant(_storageInterface->getOplogDeleteFromPoint(txn).isNull());
    _storageInterface->setAppliedThrough(txn, {});

    if (isV1ElectionProtocol) {
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            ScopedTransaction scopedXact(txn, MODE_X);

            WriteUnitOfWork wuow(txn);
            txn->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
                txn, BSON("msg" << "new primary"));
            wuow.commit();
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(
            txn, "logging transition to primary to oplog", "local.oplog.rs");
    }

    const auto opTimeToReturn = fassertStatusOK(28665, loadLastOpTime(txn));

    _shardingOnTransitionToPrimaryHook(txn);
    _dropAllTempCollections(txn);

    return opTimeToReturn;
}
OpTime ReplicationCoordinatorExternalStateImpl::onTransitionToPrimary(OperationContext* opCtx,
                                                                      bool isV1ElectionProtocol) {
    invariant(opCtx->lockState()->isW());

    // Clear the appliedThrough marker so on startup we'll use the top of the oplog. This must be
    // done before we add anything to our oplog.
    // We record this update at the 'lastAppliedOpTime'. If there are any outstanding
    // checkpoints being taken, they should only reflect this write if they see all writes up
    // to our 'lastAppliedOpTime'.
    invariant(
        _replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull());
    auto lastAppliedOpTime = repl::ReplicationCoordinator::get(opCtx)->getMyLastAppliedOpTime();
    _replicationProcess->getConsistencyMarkers()->clearAppliedThrough(
        opCtx, lastAppliedOpTime.getTimestamp());

    if (isV1ElectionProtocol) {
        writeConflictRetry(opCtx, "logging transition to primary to oplog", "local.oplog.rs", [&] {
            WriteUnitOfWork wuow(opCtx);
            opCtx->getClient()->getServiceContext()->getOpObserver()->onOpMessage(
                opCtx, BSON("msg" << "new primary"));
            wuow.commit();
        });
    }

    const auto opTimeToReturn = fassertStatusOK(28665, loadLastOpTime(opCtx));

    _shardingOnTransitionToPrimaryHook(opCtx);
    _dropAllTempCollections(opCtx);

    serverGlobalParams.validateFeaturesAsMaster.store(true);

    return opTimeToReturn;
}
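One notable change between the two versions of onTransitionToPrimary: the MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN/END macro pair is replaced by the writeConflictRetry() helper, which takes the retried write as a lambda. Below is a minimal, self-contained sketch of that retry pattern; WriteConflictException and writeConflictRetrySketch are hypothetical stand-ins illustrating only the control flow (the real helper also logs attempts and backs off between retries).

#include <iostream>
#include <stdexcept>
#include <string>

// Hypothetical stand-in for MongoDB's real WriteConflictException type.
struct WriteConflictException : std::runtime_error {
    WriteConflictException() : std::runtime_error("write conflict") {}
};

// Sketch only: run the callable until it completes without throwing a write conflict.
template <typename F>
auto writeConflictRetrySketch(const std::string& opStr, const std::string& ns, F&& f) {
    for (int attempt = 0;; ++attempt) {
        try {
            return f();
        } catch (const WriteConflictException&) {
            std::cerr << "write conflict in \"" << opStr << "\" on " << ns << " (attempt "
                      << attempt << "), retrying\n";
        }
    }
}

int main() {
    int conflictsLeft = 2;
    writeConflictRetrySketch("logging transition to primary to oplog", "local.oplog.rs", [&] {
        if (conflictsLeft-- > 0)
            throw WriteConflictException();  // simulate a storage-level conflict
        std::cout << "oplog entry committed\n";
    });
}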
void ReplicationCoordinatorExternalStateImpl::shutdown(OperationContext* txn) {
    UniqueLock lk(_threadMutex);
    if (_startedThreads) {
        _stopDataReplication_inlock(txn, &lk);

        if (_snapshotThread) {
            log() << "Stopping replication snapshot thread";
            _snapshotThread->shutdown();
        }

        if (_storageInterface->getOplogDeleteFromPoint(txn).isNull() &&
            loadLastOpTime(txn) == _storageInterface->getAppliedThrough(txn)) {
            // Clear the appliedThrough marker to indicate we are consistent with the top of the
            // oplog.
            _storageInterface->setAppliedThrough(txn, {});
        }

        if (_noopWriter) {
            LOG(1) << "Stopping noop writer";
            _noopWriter->stopWritingPeriodicNoops();
        }

        log() << "Stopping replication storage threads";
        _taskExecutor->shutdown();
        _taskExecutor->join();
        _storageInterface->shutdown();
    }
}
void ReplicationCoordinatorExternalStateImpl::shutdown(OperationContext* opCtx) {
    UniqueLock lk(_threadMutex);
    if (!_startedThreads) {
        return;
    }

    _inShutdown = true;
    _stopDataReplication_inlock(opCtx, &lk);

    if (_noopWriter) {
        LOG(1) << "Stopping noop writer";
        _noopWriter->stopWritingPeriodicNoops();
    }

    log() << "Stopping replication storage threads";
    _taskExecutor->shutdown();
    _taskExecutor->join();
    lk.unlock();

    // Perform additional shutdown steps below that must be done outside _threadMutex.

    if (_replicationProcess->getConsistencyMarkers()->getOplogTruncateAfterPoint(opCtx).isNull() &&
        loadLastOpTime(opCtx) ==
            _replicationProcess->getConsistencyMarkers()->getAppliedThrough(opCtx)) {
        // Clear the appliedThrough marker to indicate we are consistent with the top of the
        // oplog. We record this update at the 'lastAppliedOpTime'. If there are any outstanding
        // checkpoints being taken, they should only reflect this write if they see all writes up
        // to our 'lastAppliedOpTime'.
        auto lastAppliedOpTime = repl::ReplicationCoordinator::get(opCtx)->getMyLastAppliedOpTime();
        _replicationProcess->getConsistencyMarkers()->clearAppliedThrough(
            opCtx, lastAppliedOpTime.getTimestamp());
    }
}
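In both shutdown variants, the appliedThrough marker is cleared only when the node is provably consistent with the top of its oplog: no pending truncate point and an appliedThrough equal to the last oplog entry. A minimal sketch of that decision follows, with Timestamp as a hypothetical stand-in for the real OpTime/Timestamp types.

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the real Timestamp/OpTime types.
struct Timestamp {
    std::uint64_t t;
    bool isNull() const { return t == 0; }
    bool operator==(const Timestamp& other) const { return t == other.t; }
};

// Clear appliedThrough (i.e. mark "consistent at the top of the oplog") only when
// there is no pending oplog truncation and every oplog entry has been applied.
bool shouldClearAppliedThroughAtShutdown(Timestamp oplogTruncateAfterPoint,
                                         Timestamp topOfOplog,
                                         Timestamp appliedThrough) {
    return oplogTruncateAfterPoint.isNull() && topOfOplog == appliedThrough;
}

int main() {
    // Everything applied and nothing marked for truncation: safe to clear the marker.
    assert(shouldClearAppliedThroughAtShutdown({0}, {42}, {42}));
    // Oplog is ahead of appliedThrough: startup recovery must replay from the marker.
    assert(!shouldClearAppliedThroughAtShutdown({0}, {43}, {42}));
    // A pending truncate-after point also forbids clearing the marker.
    assert(!shouldClearAppliedThroughAtShutdown({41}, {42}, {42}));
}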
void ReplicationCoordinatorExternalStateImpl::cleanUpLastApplyBatch(OperationContext* txn) {
    if (_storageInterface->getInitialSyncFlag(txn)) {
        return;  // Initial Sync will take over so no cleanup is needed.
    }

    const auto deleteFromPoint = _storageInterface->getOplogDeleteFromPoint(txn);
    const auto appliedThrough = _storageInterface->getAppliedThrough(txn);

    const bool needToDeleteEndOfOplog = !deleteFromPoint.isNull() &&
        // This version should never have a non-null deleteFromPoint with a null appliedThrough.
        // This scenario means that we downgraded after unclean shutdown, then the downgraded node
        // deleted the ragged end of our oplog, then did a clean shutdown.
        !appliedThrough.isNull() &&
        // Similarly we should never have an appliedThrough higher than the deleteFromPoint. This
        // means that the downgraded node deleted our ragged end then applied ahead of our
        // deleteFromPoint and then had an unclean shutdown before upgrading. We are ok with
        // applying these ops because older versions wrote to the oplog from a single thread so we
        // know they are in order.
        !(appliedThrough.getTimestamp() >= deleteFromPoint);
    if (needToDeleteEndOfOplog) {
        log() << "Removing unapplied entries starting at: " << deleteFromPoint;
        truncateOplogTo(txn, deleteFromPoint);
    }
    _storageInterface->setOplogDeleteFromPoint(txn, {});  // clear the deleteFromPoint

    if (appliedThrough.isNull()) {
        // No follow-up work to do.
        return;
    }

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    const auto topOfOplog = fassertStatusOK(40290, loadLastOpTime(txn));
    if (appliedThrough == topOfOplog) {
        return;  // We've applied all the valid oplog we have.
    }

    log() << "Replaying stored operations from " << appliedThrough << " (exclusive) to "
          << topOfOplog << " (inclusive).";

    DBDirectClient db(txn);
    auto cursor = db.query(rsOplogName,
                           QUERY("ts" << BSON("$gte" << appliedThrough.getTimestamp())),
                           /*batchSize*/ 0,
                           /*skip*/ 0,
                           /*projection*/ nullptr,
                           QueryOption_OplogReplay);

    // Check that the first document matches our appliedThrough point then skip it since it's
    // already been applied.
    if (!cursor->more()) {
        // This should really be impossible because we check above that the top of the oplog is
        // strictly > appliedThrough. If this fails it represents a serious bug in either the
        // storage engine or query's implementation of OplogReplay.
        severe() << "Couldn't find any entries in the oplog >= " << appliedThrough
                 << " which should be impossible.";
        fassertFailedNoTrace(40293);
    }
    auto firstOpTimeFound = fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe()));
    if (firstOpTimeFound != appliedThrough) {
        severe() << "Oplog entry at " << appliedThrough << " is missing; actual entry found is "
                 << firstOpTimeFound;
        fassertFailedNoTrace(40292);
    }

    // Apply remaining ops one at a time, but don't log them because they are already logged.
    const bool wereWritesReplicated = txn->writesAreReplicated();
    ON_BLOCK_EXIT([&] { txn->setReplicatedWrites(wereWritesReplicated); });
    txn->setReplicatedWrites(false);

    while (cursor->more()) {
        auto entry = cursor->nextSafe();
        fassertStatusOK(40294, SyncTail::syncApply(txn, entry, true));
        _storageInterface->setAppliedThrough(
            txn, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry)));
    }
}
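The three-part needToDeleteEndOfOplog condition above is the crux of this recovery path. A standalone sketch of the same predicate, with Timestamp as a hypothetical stand-in for the real types, makes the truth table explicit.

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for the real Timestamp type (the source compares
// appliedThrough.getTimestamp() against the deleteFromPoint timestamp).
struct Timestamp {
    std::uint64_t t;
    bool isNull() const { return t == 0; }
    bool operator>=(const Timestamp& other) const { return t >= other.t; }
};

bool needToDeleteEndOfOplog(Timestamp deleteFromPoint, Timestamp appliedThrough) {
    return !deleteFromPoint.isNull() &&
        // A null appliedThrough alongside a non-null deleteFromPoint means a downgraded
        // node already removed the ragged end, so there is nothing left to truncate.
        !appliedThrough.isNull() &&
        // appliedThrough at or past deleteFromPoint means a downgraded node truncated
        // and then applied further; those entries were written in order and are kept.
        !(appliedThrough >= deleteFromPoint);
}

int main() {
    assert(needToDeleteEndOfOplog({10}, {5}));    // unapplied ragged end: truncate it
    assert(!needToDeleteEndOfOplog({0}, {5}));    // nothing marked for deletion
    assert(!needToDeleteEndOfOplog({10}, {0}));   // downgrade already truncated
    assert(!needToDeleteEndOfOplog({10}, {12}));  // applied past the point: keep entries
}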