Ejemplo n.º 1
// Performs replication recovery when starting from an unstable checkpoint.
//
// If `appliedThrough` is null there is nothing to replay; otherwise oplog entries are applied
// from the `appliedThrough` timestamp up to the timestamp of `topOfOplog`. Afterwards the
// `appliedThrough` consistency marker is set to `topOfOplog`.
//
// Parameters:
//   opCtx          - operation context forwarded to oplog application and to the consistency
//                    markers.
//   appliedThrough - the persisted appliedThrough OpTime; may be null.
//   topOfOplog     - the OpTime of the newest oplog entry; used as the end point of application.
//
// NOTE(review): this excerpt appears truncated -- the closing braces of the `else` branch and of
// the function itself are not visible here.
void ReplicationRecoveryImpl::_recoverFromUnstableCheckpoint(OperationContext* opCtx,
                                                             OpTime appliedThrough,
                                                             OpTime topOfOplog) {
    log() << "Recovering from an unstable checkpoint (top of oplog: " << topOfOplog
          << ", appliedThrough: " << appliedThrough << ")";

    if (appliedThrough.isNull()) {
        // The appliedThrough would be null if we shut down cleanly or crashed as a primary. Either
        // way we are consistent at the top of the oplog.
        log() << "No oplog entries to apply for recovery. appliedThrough is null.";
    } else {
        // If the appliedThrough is not null, then we shut down uncleanly during secondary oplog
        // application and must apply from the appliedThrough to the top of the oplog.
        log() << "Starting recovery oplog application at the appliedThrough: " << appliedThrough
              << ", through the top of the oplog: " << topOfOplog;

        // When `recoverFromOplog` truncates the oplog, that also happens to set the "oldest
        // timestamp" to the truncation point[1]. `_applyToEndOfOplog` will then perform writes
        // before the truncation point. Doing so violates the constraint that all updates must be
        // timestamped newer than the "oldest timestamp". This call will move the "oldest
        // timestamp" back to the `startPoint`.
        // [1] This is arguably incorrect. On rollback for nodes that are not keeping history to
        // the "majority point", the "oldest timestamp" likely needs to go back in time. The
        // oplog's `cappedTruncateAfter` method was a convenient location for this logic, which,
        // unfortunately, conflicts with the usage above.
        // NOTE(review): the call that moves the "oldest timestamp" (referred to above as "this
        // call") is not visible in this excerpt -- confirm against the full source.

        // Replay the oplog from the appliedThrough timestamp to the top-of-oplog timestamp.
        _applyToEndOfOplog(opCtx, appliedThrough.getTimestamp(), topOfOplog.getTimestamp());

    // `_recoverFromUnstableCheckpoint` is only expected to be called on startup.
    // NOTE(review): no check enforcing this expectation is visible in this excerpt.

    // Ensure the `appliedThrough` is set to the top of oplog, specifically if the node was
    // previously running as a primary. If a crash happens before the first stable checkpoint on
    // upgrade, replication recovery will know it must apply from this point and not assume the
    // datafiles contain any writes that were taken before the crash.
    _consistencyMarkers->setAppliedThrough(opCtx, topOfOplog);

    // Force the set `appliedThrough` to become durable on disk in a checkpoint. This method would
    // typically take a stable checkpoint, but because we're starting up from a checkpoint that
    // has no checkpoint timestamp, the stable checkpoint "degrades" into an unstable checkpoint.
    // Not waiting for checkpoint durability here can result in a scenario where the node takes
    // writes and persists them to the oplog, but crashes before a stable checkpoint persists a
    // "recovery timestamp". The typical startup path for data-bearing nodes with 4.0 is to use
    // the recovery timestamp to determine where to play oplog forward from. As this method shows,
    // when a recovery timestamp does not exist, the applied through is used to determine where to
    // start playing oplog entries from.
    // NOTE(review): the durability call described above is not visible in this excerpt.
Ejemplo n.º 2
// Performs replication recovery starting from a stable timestamp: applies oplog entries from
// `stableTimestamp` up to the timestamp of `topOfOplog`.
//
// Parameters:
//   opCtx           - operation context forwarded to the consistency markers and to oplog
//                     application.
//   stableTimestamp - the timestamp at which oplog application starts.
//   appliedThrough  - in the visible code, used only in the diagnostic log line.
//   topOfOplog      - the OpTime of the newest oplog entry; its timestamp is the end point of
//                     application.
//
// NOTE(review): the closing brace of this function is not visible in this excerpt.
void ReplicationRecoveryImpl::_recoverFromStableTimestamp(OperationContext* opCtx,
                                                          Timestamp stableTimestamp,
                                                          OpTime appliedThrough,
                                                          OpTime topOfOplog) {
    // The truncate-after point is read here only so it can be included in the log line below.
    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
    log() << "Recovering from stable timestamp: " << stableTimestamp
          << " (top of oplog: " << topOfOplog << ", appliedThrough: " << appliedThrough
          << ", TruncateAfter: " << truncateAfterPoint << ")";

    log() << "Starting recovery oplog application at the stable timestamp: " << stableTimestamp;
    _applyToEndOfOplog(opCtx, stableTimestamp, topOfOplog.getTimestamp());
Ejemplo n.º 3
// Entry point for replication recovery from the oplog.
//
// Visible steps, in order:
//   1. If the initial sync flag is set, log and return without doing any recovery.
//   2. If the oplog truncate-after point is non-null, truncate the oplog to it; then clear the
//      truncate-after point.
//   3. Look up the top of the oplog; an empty or missing oplog collection leaves it unset, any
//      other failure is fatal (fassert 40290).
//   4. Read the checkpoint timestamp and, when a start point can be determined, apply oplog
//      entries from that start point to the top of the oplog.
//
// The whole body is a function-try-block; any exception is logged via `severe()`.
//
// NOTE(review): this excerpt appears truncated -- several `if` bodies and closing braces, and
// whatever follows the `severe()` log in the catch handler, are not visible here.
void ReplicationRecoveryImpl::recoverFromOplog(OperationContext* opCtx) try {
    if (_consistencyMarkers->getInitialSyncFlag(opCtx)) {
        log() << "No recovery needed. Initial sync flag set.";
        return;  // Initial Sync will take over so no cleanup is needed.

    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
    const auto appliedThrough = _consistencyMarkers->getAppliedThrough(opCtx);

    if (!truncateAfterPoint.isNull()) {
        log() << "Removing unapplied entries starting at: " << truncateAfterPoint.toBSON();
        _truncateOplogTo(opCtx, truncateAfterPoint);

    // Clear the truncateAfterPoint so that we don't truncate the next batch of oplog entries
    // erroneously.
    _consistencyMarkers->setOplogTruncateAfterPoint(opCtx, {});

    // TODO (SERVER-30556): Delete this line since the old oplog delete from point cannot exist.
    // NOTE(review): the line this TODO refers to is not visible in this excerpt.

    // Determine the top of the oplog. `CollectionIsEmpty` and `NamespaceNotFound` are tolerated
    // and leave `topOfOplog` as `boost::none`; any other non-OK status triggers fassert 40290.
    auto topOfOplogSW = _getLastAppliedOpTime(opCtx);
    boost::optional<OpTime> topOfOplog = boost::none;
    if (topOfOplogSW.getStatus() != ErrorCodes::CollectionIsEmpty &&
        topOfOplogSW.getStatus() != ErrorCodes::NamespaceNotFound) {
        fassertStatusOK(40290, topOfOplogSW);
        topOfOplog = topOfOplogSW.getValue();

    // If we have a checkpoint timestamp, then we recovered to a timestamp and should set the
    // initial data timestamp to that. Otherwise, we simply recovered the data on disk so we should
    // set the initial data timestamp to the top OpTime in the oplog once the data is consistent
    // there. If there is nothing in the oplog, then we do not set the initial data timestamp.
    auto checkpointTimestamp = _consistencyMarkers->getCheckpointTimestamp(opCtx);
    if (!checkpointTimestamp.isNull()) {

        // If we have a checkpoint timestamp, we set the initial data timestamp now so that
        // the operations we apply below can be given the proper timestamps.
        // NOTE(review): the statement that sets the initial data timestamp is not visible in
        // this excerpt.

    // Oplog is empty. There are no oplog entries to apply, so we exit recovery. If there was a
    // checkpointTimestamp then we already set the initial data timestamp. Otherwise, there is
    // nothing to set it to.
    if (!topOfOplog) {
        log() << "No oplog entries to apply for recovery. Oplog is empty.";

    // Apply oplog entries only when a start point is available; the result is used like an
    // optional (tested for truthiness, then unwrapped with `get()`).
    if (auto startPoint = _getOplogApplicationStartPoint(checkpointTimestamp, appliedThrough)) {
        _applyToEndOfOplog(opCtx, startPoint.get(), topOfOplog->getTimestamp());

    // If we don't have a checkpoint timestamp, then we are either not running a storage engine
    // that supports "recover to stable timestamp" or we just upgraded from a version that didn't.
    // In both cases, the data on disk is not consistent until we have applied all oplog entries to
    // the end of the oplog, since we do not know which ones actually got applied before shutdown.
    // As a result, we do not set the initial data timestamp until after we have applied to the end
    // of the oplog.
    if (checkpointTimestamp.isNull()) {

} catch (...) {
    // Log the in-flight exception, converted to a Status, at severe level.
    severe() << "Caught exception during replication recovery: " << exceptionToStatus();