void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx) try {
    if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
        log() << "No recovery needed. Initial sync flag set.";
        return;  // Initial Sync will take over so no cleanup is needed.
    }

    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);
    if (!truncateAfterPoint.isNull()) {
        log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
        _truncateOplogTo(opCtx, truncateAfterPoint);
    }

    // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
    // erroneously.
    _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});

    // TODO (SERVER-30556): Delete this line since the old oplog delete from point cannot exist.
    _consistencyMarkers->removeOldOplogDeleteFromPointField(opCtx);

    auto topOfOplogSW = _getLastAppliedOpTime(opCtx);
    boost::optional<OpTime> topOfOplog = boost::none;
    if (topOfOplogSW.getStatus() != ErrorCodes::CollectionIsEmpty &&
        topOfOplogSW.getStatus() != ErrorCodes::NamespaceNotFound) {
        fassertStatusOK(40290, topOfOplogSW);
        topOfOplog = topOfOplogSW.getValue();
    }

    // If we have a checkpoint timestamp, then we recovered to a timestamp and should set the
    // initial data timestamp to that. Otherwise, we simply recovered the data on disk so we should
    // set the initial data timestamp to the top OpTime in the oplog once the data is consistent
    // there. If there is nothing in the oplog, then we do not set the initial data timestamp.
    auto checkpointTimestamp = _consistencyMarkers->getCheckpointTimestamp(opCtx);
    if (!checkpointTimestamp.isNull()) {
        // If we have a checkpoint timestamp, we set the initial data timestamp now so that
        // the operations we apply below can be given the proper timestamps.
        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(),
                                                   SnapshotName(checkpointTimestamp));
    }

    // Oplog is empty. There are no oplog entries to apply, so we exit recovery. If there was a
    // checkpointTimestamp then we already set the initial data timestamp. Otherwise, there is
    // nothing to set it to.
    if (!topOfOplog) {
        log() << "No oplog entries to apply for recovery. Oplog is empty.";
        return;
    }

    if (auto startPoint = _getOplogApplicationStartPoint(checkpointTimestamp, appliedThrough)) {
        _applyToEndOfOplog(opCtx, startPoint.get(), topOfOplog->getTimestamp());
    }

    // If we don't have a checkpoint timestamp, then we are either not running a storage engine
    // that supports "recover to stable timestamp" or we just upgraded from a version that didn't.
    // In both cases, the data on disk is not consistent until we have applied all oplog entries to
    // the end of the oplog, since we do not know which ones actually got applied before shutdown.
    // As a result, we do not set the initial data timestamp until after we have applied to the end
    // of the oplog.
    if (checkpointTimestamp.isNull()) {
        _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(),
                                                   SnapshotName(topOfOplog->getTimestamp()));
    }
} catch (...) {
    severe() << "Caught exception during replication recovery: " << exceptionToStatus();
    std::terminate();
}
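// A minimal sketch (not the actual helper from this file) of the decision that
// _getOplogApplicationStartPoint, called above, has to make, as implied by the
// caller and its comments: prefer the checkpoint timestamp, fall back to the
// appliedThrough marker, and return none when neither is set. The name below is
// illustrative; the real helper additionally logs its choice and rejects
// inconsistent marker states.
boost::optional<Timestamp> getOplogApplicationStartPointSketch(Timestamp checkpointTimestamp,
                                                               OpTime appliedThrough) {
    if (!checkpointTimestamp.isNull()) {
        // We recovered to a checkpoint; oplog application resumes at that timestamp.
        return checkpointTimestamp;
    }
    if (!appliedThrough.isNull()) {
        // No checkpoint, but the boundary of the last completely applied batch is known.
        return appliedThrough.getTimestamp();
    }
    // Neither marker is set, so there is nothing to replay.
    return boost::none;
}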
void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx,
                                               boost::optional<Timestamp> stableTimestamp) try {
    if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
        log() << "No recovery needed. Initial sync flag set.";
        return;  // Initial Sync will take over so no cleanup is needed.
    }

    const auto serviceCtx = getGlobalServiceContext();
    inReplicationRecovery(serviceCtx) = true;
    ON_BLOCK_EXIT([serviceCtx] {
        invariant(
            inReplicationRecovery(serviceCtx),
            "replication recovery flag is unexpectedly unset when exiting recoverFromOplog()");
        inReplicationRecovery(serviceCtx) = false;
    });

    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
    if (!truncateAfterPoint.isNull()) {
        log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
        _truncateOplogTo(opCtx, truncateAfterPoint);

        // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
        // erroneously.
        _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});
        opCtx->recoveryUnit()->waitUntilDurable();
    }

    auto topOfOplogSW = _getTopOfOplog(opCtx);
    if (topOfOplogSW.getStatus() == ErrorCodes::CollectionIsEmpty ||
        topOfOplogSW.getStatus() == ErrorCodes::NamespaceNotFound) {
        // Oplog is empty. There are no oplog entries to apply, so we exit recovery and go into
        // initial sync.
        log() << "No oplog entries to apply for recovery. Oplog is empty.";
        return;
    }
    fassert(40290, topOfOplogSW);
    const auto topOfOplog = topOfOplogSW.getValue();

    // If we were passed in a stable timestamp, we are in rollback recovery and should recover from
    // that stable timestamp. Otherwise, we're recovering at startup. If this storage engine
    // supports recover to stable timestamp or enableMajorityReadConcern=false, we ask it for the
    // recovery timestamp. If the storage engine returns a timestamp, we recover from that point.
    // However, if the storage engine returns "none", the storage engine does not have a stable
    // checkpoint and we must recover from an unstable checkpoint instead.
    const bool supportsRecoveryTimestamp =
        _storageInterface->supportsRecoveryTimestamp(opCtx->getServiceContext());
    if (!stableTimestamp && supportsRecoveryTimestamp) {
        stableTimestamp = _storageInterface->getRecoveryTimestamp(opCtx->getServiceContext());
    }

    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);
    invariant(!stableTimestamp || stableTimestamp->isNull() || appliedThrough.isNull() ||
                  *stableTimestamp == appliedThrough.getTimestamp(),
              str::stream() << "Stable timestamp " << stableTimestamp->toString()
                            << " does not equal appliedThrough timestamp "
                            << appliedThrough.toString());

    if (stableTimestamp) {
        invariant(supportsRecoveryTimestamp);
        _recoverFromStableTimestamp(opCtx, *stableTimestamp, appliedThrough, topOfOplog);
    } else {
        _recoverFromUnstableCheckpoint(opCtx, appliedThrough, topOfOplog);
    }

    _reconstructPreparedTransactions(opCtx);
} catch (...) {
    severe() << "Caught exception during replication recovery: " << exceptionToStatus();
    std::terminate();
}
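// A condensed sketch of the recovery-path selection performed by recoverFromOplog
// above. RecoveryPath and chooseRecoveryPath are illustrative names introduced
// here, not part of the MongoDB source; the sketch only restates the branching
// already visible in the function.
enum class RecoveryPath { kStableRollback, kStableStartup, kUnstableCheckpoint };

RecoveryPath chooseRecoveryPath(boost::optional<Timestamp> passedInStableTimestamp,
                                bool supportsRecoveryTimestamp,
                                boost::optional<Timestamp> engineRecoveryTimestamp) {
    if (passedInStableTimestamp) {
        // Rollback recovery: the caller supplied the stable timestamp to recover from.
        return RecoveryPath::kStableRollback;
    }
    if (supportsRecoveryTimestamp && engineRecoveryTimestamp) {
        // Startup recovery from the storage engine's stable checkpoint.
        return RecoveryPath::kStableStartup;
    }
    // The storage engine has no stable checkpoint (or cannot report one), so we
    // must replay the oplog from an unstable checkpoint.
    return RecoveryPath::kUnstableCheckpoint;
}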