Example #1
    // This free function is used by the initial sync writer threads to apply each op
    void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
        initializeWriterThread();
        for (std::vector<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            try {
                OperationContextImpl txn;

                if (!st->syncApply(&txn, *it)) {
                    bool status;
                    {
                        Lock::GlobalWrite lk(txn.lockState());
                        status = st->shouldRetry(&txn, *it);
                    }

                    if (status) {
                        // retry
                        if (!st->syncApply(&txn, *it)) {
                            fassertFailedNoTrace(15915);
                        }
                    }
                    // If shouldRetry() returns false, fall through.
                    // This can happen if the document that was moved and missed by Cloner
                    // subsequently got deleted and no longer exists on the Sync Target at all
                }
            }
            catch (const DBException& e) {
                error() << "exception: " << causedBy(e) << " on: " << it->toString() << endl;
                fassertFailedNoTrace(16361);
            }
        }
    }
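Every example on this page funnels an unrecoverable condition into fassertFailedNoTrace, MongoDB's fatal assertion that logs the failure and aborts the process without printing a stack trace. A minimal sketch of the calling pattern, assuming mongo/util/assert_util.h; the assertion code 99999 is a placeholder, since real codes must be unique across the codebase:

// Hedged sketch, not code from any of the projects above.
#include "mongo/util/assert_util.h"

void applyOrDie(bool applied) {
    if (!applied) {
        // Logs and aborts without a stack trace; 99999 is a placeholder id.
        fassertFailedNoTrace(99999);
    }
}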
Example #2
File: sync_tail.cpp Project: Jaryli/mongo
// This free function is used by the writer threads to apply each op
void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
    initializeWriterThread();

    OperationContextImpl txn;
    txn.setReplicatedWrites(false);
    DisableDocumentValidation validationDisabler(&txn);

    // allow us to get through the magic barrier
    txn.lockState()->setIsBatchWriter(true);

    bool convertUpdatesToUpserts = true;

    for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) {
        try {
            const Status s = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts);
            if (!s.isOK()) {
                severe() << "Error applying operation (" << it->toString() << "): " << s;
                fassertFailedNoTrace(16359);
            }
        } catch (const DBException& e) {
            severe() << "writer worker caught exception: " << causedBy(e)
                     << " on: " << it->toString();

            if (inShutdown()) {
                return;
            }

            fassertFailedNoTrace(16360);
        }
    }
}
Example #3
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 Timestamp oplogApplicationStartPoint,
                                                 Timestamp topOfOplog) {
    invariant(!oplogApplicationStartPoint.isNull());
    invariant(!topOfOplog.isNull());

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    if (oplogApplicationStartPoint == topOfOplog) {
        log()
            << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog.";
        return;  // We've applied all the valid oplog we have.
    } else if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
                 << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
        fassertFailedNoTrace(40313);
    }

    log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON()
          << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive).";

    DBDirectClient db(opCtx);
    auto cursor = db.query(NamespaceString::kRsOplogNamespace.ns(),
                           QUERY("ts" << BSON("$gte" << oplogApplicationStartPoint)),
                           /*batchSize*/ 0,
                           /*skip*/ 0,
                           /*projection*/ nullptr,
                           QueryOption_OplogReplay);

    // Check that the first document matches our appliedThrough point then skip it since it's
    // already been applied.
    if (!cursor->more()) {
        // This should really be impossible because we check above that the top of the oplog is
        // strictly > appliedThrough. If this fails it represents a serious bug in either the
        // storage engine or query's implementation of OplogReplay.
        severe() << "Couldn't find any entries in the oplog >= "
                 << oplogApplicationStartPoint.toBSON() << " which should be impossible.";
        fassertFailedNoTrace(40293);
    }

    auto firstTimestampFound =
        fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe())).getTimestamp();
    if (firstTimestampFound != oplogApplicationStartPoint) {
        severe() << "Oplog entry at " << oplogApplicationStartPoint.toBSON()
                 << " is missing; actual entry found is " << firstTimestampFound.toBSON();
        fassertFailedNoTrace(40292);
    }

    // Apply remaining ops one at a time, but don't log them because they are already logged.
    UnreplicatedWritesBlock uwb(opCtx);

    while (cursor->more()) {
        auto entry = cursor->nextSafe();
        fassertStatusOK(40294,
                        SyncTail::syncApply(opCtx, entry, OplogApplication::Mode::kRecovering));
        _consistencyMarkers->setAppliedThrough(
            opCtx, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry)));
    }
}
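The boundary checks at the top of _applyToEndOfOplog are worth seeing in isolation: equal endpoints mean nothing to replay, a start point past the top is fatal, and anything else replays the (exclusive, inclusive] range. A distilled standalone sketch, with plain integers standing in for Timestamps:

#include <cstdlib>
#include <iostream>

void checkReplayBounds(int oplogApplicationStartPoint, int topOfOplog) {
    if (oplogApplicationStartPoint == topOfOplog) {
        std::cout << "No oplog entries to apply for recovery.\n";
        return;
    }
    if (oplogApplicationStartPoint > topOfOplog) {
        std::cerr << "Applied op not found; top of oplog is behind it.\n";
        std::abort();  // stands in for fassertFailedNoTrace(40313)
    }
    std::cout << "Replaying (" << oplogApplicationStartPoint << ", "
              << topOfOplog << "]\n";
}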
Example #4
    /* applies oplog from "now" until endOpTime using the applier threads for initial sync*/
    void SyncTail::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) {
        unsigned long long bytesApplied = 0;
        unsigned long long entriesApplied = 0;
        while (true) {
            OpQueue ops;
            OperationContextImpl ctx;

            while (!tryPopAndWaitForMore(&ops, getGlobalReplicationCoordinator())) {
                // nothing came back last time, so go again
                if (ops.empty()) continue;

                // Check if we reached the end
                const BSONObj currentOp = ops.back();
                const OpTime currentOpTime = currentOp["ts"]._opTime();

                // When we reach the end return this batch
                if (currentOpTime == endOpTime) {
                    break;
                }
                else if (currentOpTime > endOpTime) {
                    severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime
                            << " without seeing it. Rollback?" << rsLog;
                    fassertFailedNoTrace(18693);
                }

                // apply replication batch limits
                if (ops.getSize() > replBatchLimitBytes)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            if (ops.empty()) {
                severe() << "got no ops for batch...";
                fassertFailedNoTrace(18692);
            }

            const BSONObj lastOp = ops.back().getOwned();

            // Tally operation information
            bytesApplied += ops.getSize();
            entriesApplied += ops.getDeque().size();

            multiApply(ops.getDeque());
            OpTime lastOpTime = applyOpsToOplog(&ops.getDeque());

            // if the last op applied was our end, return
            if (lastOpTime == endOpTime) {
                LOG(1) << "SyncTail applied " << entriesApplied
                       << " entries (" << bytesApplied << " bytes)"
                       << " and finished at opTime " << endOpTime.toStringPretty();
                return;
            }
        } // end of while (true)
    }
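The two cut-offs that close a batch above (total bytes and operation count) distill to a single predicate. A hedged standalone version; the limits are placeholders, not the actual replBatchLimitBytes/replBatchLimitOperations values:

#include <cstddef>

constexpr std::size_t kBatchLimitBytes = 100 * 1024 * 1024;  // placeholder
constexpr std::size_t kBatchLimitOperations = 5000;          // placeholder

// Mirrors the checks that end a batch early in _applyOplogUntil.
bool batchIsFull(std::size_t batchBytes, std::size_t batchOps) {
    return batchBytes > kBatchLimitBytes || batchOps > kBatchLimitOperations;
}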
Example #5
    void restartInProgressIndexesFromLastShutdown(OperationContext* txn) {
        txn->getClient()->getAuthorizationSession()->grantInternalAuthorization();

        std::vector<std::string> dbNames;

        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        storageEngine->listDatabases( &dbNames );

        try {
            std::list<std::string> collNames;
            for (std::vector<std::string>::const_iterator dbName = dbNames.begin();
                 dbName != dbNames.end();
                 ++dbName) {

                ScopedTransaction scopedXact(txn, MODE_IS);
                AutoGetDb autoDb(txn, *dbName, MODE_S);

                Database* db = autoDb.getDb();
                db->getDatabaseCatalogEntry()->getCollectionNamespaces(&collNames);
            }
            checkNS(txn, collNames);
        }
        catch (const DBException& e) {
            error() << "Index verification did not complete: " << e.toString();
            fassertFailedNoTrace(18643);
        }
        LOG(1) << "checking complete" << endl;
    }
Example #6
void ServiceContextMongoD::createLockFile() {
    try {
        _lockFile.reset(new StorageEngineLockFile(storageGlobalParams.dbpath));
    } catch (const std::exception& ex) {
        uassert(28596,
                str::stream() << "Unable to determine status of lock file in the data directory "
                              << storageGlobalParams.dbpath << ": " << ex.what(),
                false);
    }
    bool wasUnclean = _lockFile->createdByUncleanShutdown();
    auto openStatus = _lockFile->open();
    if (storageGlobalParams.readOnly && openStatus == ErrorCodes::IllegalOperation) {
        _lockFile.reset();
    } else {
        uassertStatusOK(openStatus);
    }

    if (wasUnclean) {
        if (storageGlobalParams.readOnly) {
            severe() << "Attempted to open dbpath in readOnly mode, but the server was "
                        "previously not shut down cleanly.";
            fassertFailedNoTrace(34416);
        }
        warning() << "Detected unclean shutdown - " << _lockFile->getFilespec() << " is not empty.";
    }
}
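The unclean-shutdown detection rests on a simple idiom: a non-empty lock file left over at startup means the previous run never cleaned up after itself. A self-contained sketch of just that detection, with a placeholder path instead of storageGlobalParams.dbpath:

#include <filesystem>
#include <fstream>
#include <iostream>

int main() {
    const std::filesystem::path lockFile = "data/db/mongod.lock";  // placeholder
    std::filesystem::create_directories(lockFile.parent_path());

    const bool wasUnclean = std::filesystem::exists(lockFile) &&
                            std::filesystem::file_size(lockFile) > 0;
    if (wasUnclean) {
        std::cerr << "Detected unclean shutdown - " << lockFile << " is not empty.\n";
    }
    std::ofstream(lockFile) << "lock\n";  // claim the file for this run
}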
Example #7
File: sync_tail.cpp Project: Jaryli/mongo
BSONObj SyncTail::getMissingDoc(OperationContext* txn, Database* db, const BSONObj& o) {
    OplogReader missingObjReader;  // why are we using OplogReader to run a non-oplog query?
    const char* ns = o.getStringField("ns");

    // capped collections
    Collection* collection = db->getCollection(ns);
    if (collection && collection->isCapped()) {
        log() << "missing doc, but this is okay for a capped collection (" << ns << ")";
        return BSONObj();
    }

    const int retryMax = 3;
    for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
        if (retryCount != 1) {
            // if we are retrying, sleep a bit to let the network possibly recover
            sleepsecs(retryCount * retryCount);
        }
        try {
            bool ok = missingObjReader.connect(HostAndPort(_hostname));
            if (!ok) {
                warning() << "network problem detected while connecting to the "
                          << "sync source, attempt " << retryCount << " of " << retryMax << endl;
                continue;  // try again
            }
        } catch (const SocketException&) {
            warning() << "network problem detected while connecting to the "
                      << "sync source, attempt " << retryCount << " of " << retryMax << endl;
            continue;  // try again
        }

        // get _id from oplog entry to create query to fetch document.
        const BSONElement opElem = o.getField("op");
        const bool isUpdate = !opElem.eoo() && opElem.str() == "u";
        const BSONElement idElem = o.getObjectField(isUpdate ? "o2" : "o")["_id"];

        if (idElem.eoo()) {
            severe() << "cannot fetch missing document without _id field: " << o.toString();
            fassertFailedNoTrace(28742);
        }

        BSONObj query = BSONObjBuilder().append(idElem).obj();
        BSONObj missingObj;
        try {
            missingObj = missingObjReader.findOne(ns, query);
        } catch (const SocketException&) {
            warning() << "network problem detected while fetching a missing document from the "
                      << "sync source, attempt " << retryCount << " of " << retryMax << endl;
            continue;  // try again
        } catch (DBException& e) {
            error() << "assertion fetching missing object: " << e.what() << endl;
            throw;
        }

        // success!
        return missingObj;
    }
    // retry count exceeded
    msgasserted(15916,
                str::stream() << "Can no longer connect to initial sync source: " << _hostname);
}
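getMissingDoc retries transient network failures with a quadratic backoff (sleepsecs(retryCount * retryCount)) before giving up. The same skeleton as a standalone helper, with std::this_thread standing in for sleepsecs:

#include <chrono>
#include <functional>
#include <stdexcept>
#include <thread>

// Run op() up to retryMax times, sleeping retryCount^2 seconds between
// attempts; throw once every attempt has failed.
void retryWithBackoff(const std::function<bool()>& op, int retryMax = 3) {
    for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
        if (retryCount != 1) {
            std::this_thread::sleep_for(std::chrono::seconds(retryCount * retryCount));
        }
        if (op()) {
            return;  // success!
        }
    }
    throw std::runtime_error("retry count exceeded");
}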
Example #8
File: rs.cpp Project: FromPointer/mongo
    void ReplSet::haveNewConfig(OperationContext* txn, ReplSetConfig& newConfig, bool addComment) {
        bo comment;
        if( addComment )
            comment = BSON( "msg" << "Reconfig set" << "version" << newConfig.version );

        newConfig.saveConfigLocally(txn, comment);

        try {
            BSONObj oldConfForAudit = config().asBson();
            BSONObj newConfForAudit = newConfig.asBson();
            audit::logReplSetReconfig(ClientBasic::getCurrent(),
                                      &oldConfForAudit,
                                      &newConfForAudit);
            if (initFromConfig(txn, newConfig, true)) {
                log() << "replSet replSetReconfig new config saved locally" << rsLog;
            }
        }
        catch (const DBException& e) {
            log() << "replSet error unexpected exception in haveNewConfig() : " << e.toString() << rsLog;
            fassertFailedNoTrace(18755);
        }
        catch (...) {
            std::terminate();
        }
    }
Example #9
    void restartInProgressIndexesFromLastShutdown() {
        OperationContextImpl txn;

        cc().getAuthorizationSession()->grantInternalAuthorization();

        std::vector<std::string> dbNames;

        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        storageEngine->listDatabases( &dbNames );

        try {
            std::list<std::string> collNames;
            for (std::vector<std::string>::const_iterator dbName = dbNames.begin();
             dbName != dbNames.end();
                 ++dbName) {
                Client::ReadContext ctx(&txn, *dbName);

                Database* db = ctx.ctx().db();
                db->getDatabaseCatalogEntry()->getCollectionNamespaces(&collNames);
            }
            checkNS(&txn, collNames);
        }
        catch (const DBException& e) {
            error() << "Index rebuilding did not complete: " << e.toString();
            log() << "note: restart the server with --noIndexBuildRetry to skip index rebuilds";
            fassertFailedNoTrace(18643);
        }
        LOG(1) << "checking complete" << endl;
    }
Example #10
    // Copies ops out of the bgsync queue into the deque passed in as a parameter.
    // Returns true if the batch should be ended early.
    // Batch should end early if we encounter a command, or if
    // there are no further ops in the bgsync queue to read.
    // This function also blocks 1 second waiting for new ops to appear in the bgsync
    // queue.  We can't block forever because there are maintenance things we need
    // to periodically check in the loop.
    bool SyncTail::tryPopAndWaitForMore(SyncTail::OpQueue* ops, ReplicationCoordinator* replCoord) {
        BSONObj op;
        // Check to see if there are ops waiting in the bgsync queue
        bool peek_success = peek(&op);

        if (!peek_success) {
            // if we don't have anything in the queue, wait a bit for something to appear
            if (ops->empty()) {
                replCoord->signalDrainComplete();
                // block up to 1 second
                _networkQueue->waitForMore();
                return false;
            }

            // otherwise, apply what we have
            return true;
        }

        const char* ns = op["ns"].valuestrsafe();

        // check for commands
        if ((op["op"].valuestrsafe()[0] == 'c') ||
            // Index builds are achieved through the use of an insert op, not a command op.
            // The following line is the same as what the insert code uses to detect an index build.
            ( *ns != '\0' && nsToCollectionSubstring(ns) == "system.indexes" )) {

            if (ops->empty()) {
                // apply commands one-at-a-time
                ops->push_back(op);
                _networkQueue->consume();
            }

            // otherwise, apply what we have so far and come back for the command
            return true;
        }

        // check for oplog version change
        BSONElement elemVersion = op["v"];
        int curVersion = 0;
        if (elemVersion.eoo())
            // missing version means version 1
            curVersion = 1;
        else
            curVersion = elemVersion.Int();
        
        if (curVersion != OPLOG_VERSION) {
            severe() << "expected oplog version " << OPLOG_VERSION << " but found version " 
                     << curVersion << " in oplog entry: " << op;
            fassertFailedNoTrace(18820);
        }
    
        // Copy the op to the deque and remove it from the bgsync queue.
        ops->push_back(op);
        _networkQueue->consume();

        // Go back for more ops
        return false;
    }
Example #11
// Copies ops out of the bgsync queue into the deque passed in as a parameter.
// Returns true if the batch should be ended early.
// Batch should end early if we encounter a command, or if
// there are no further ops in the bgsync queue to read.
// This function also blocks 1 second waiting for new ops to appear in the bgsync
// queue.  We can't block forever because there are maintenance things we need
// to periodically check in the loop.
bool SyncTail::tryPopAndWaitForMore(OperationContext* txn, SyncTail::OpQueue* ops) {
    BSONObj op;
    // Check to see if there are ops waiting in the bgsync queue
    bool peek_success = peek(&op);

    if (!peek_success) {
        // if we don't have anything in the queue, wait a bit for something to appear
        if (ops->empty()) {
            // block up to 1 second
            _networkQueue->waitForMore();
            return false;
        }

        // otherwise, apply what we have
        return true;
    }

    auto entry = OplogEntry(op);

    // Check for ops that must be processed one at a time.
    if (entry.raw.isEmpty() ||       // sentinel that network queue is drained.
        (entry.opType[0] == 'c') ||  // commands.
        // Index builds are achieved through the use of an insert op, not a command op.
        // The following line is the same as what the insert code uses to detect an index build.
        (!entry.ns.empty() && nsToCollectionSubstring(entry.ns) == "system.indexes")) {
        if (ops->empty()) {
            // apply commands one-at-a-time
            ops->push_back(std::move(entry));
            _networkQueue->consume();
        }

        // otherwise, apply what we have so far and come back for the command
        return true;
    }

    // check for oplog version change
    int curVersion = 0;
    if (entry.version.eoo())
        // missing version means version 1
        curVersion = 1;
    else
        curVersion = entry.version.Int();

    if (curVersion != OPLOG_VERSION) {
        severe() << "expected oplog version " << OPLOG_VERSION << " but found version "
                 << curVersion << " in oplog entry: " << op;
        fassertFailedNoTrace(18820);
    }

    // Copy the op to the deque and remove it from the bgsync queue.
    ops->push_back(std::move(entry));
    _networkQueue->consume();

    // Go back for more ops
    return false;
}
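Both versions of tryPopAndWaitForMore implement the same batching contract: returning false means the batch is still filling (keep polling), returning true means close out the batch and apply it. A hypothetical driver loop built on that contract, stubbed with standard types so it stays self-contained; tryPop and applyBatch are stand-ins, not real MongoDB APIs:

#include <deque>
#include <functional>

void runBatches(const std::function<bool(std::deque<int>*)>& tryPop,
                const std::function<void(const std::deque<int>&)>& applyBatch) {
    while (true) {
        std::deque<int> ops;
        while (!tryPop(&ops)) {
            // false: nothing arrived yet, or the op joined the batch and more
            // may follow; keep polling until the batch closes.
        }
        if (ops.empty()) {
            break;  // drained with nothing left to apply
        }
        applyBatch(ops);  // true: apply what we have, then come back for more
    }
}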
Example #12
void syncDoInitialSync(ReplicationCoordinatorExternalState* replicationCoordinatorExternalState) {
    stdx::unique_lock<stdx::mutex> lk(_initialSyncMutex, stdx::defer_lock);
    if (!lk.try_lock()) {
        uasserted(34474, "Initial Sync Already Active.");
    }

    std::unique_ptr<BackgroundSync> bgsync;
    {
        log() << "Starting replication fetcher thread for initial sync";
        auto txn = cc().makeOperationContext();
        bgsync = stdx::make_unique<BackgroundSync>(
            replicationCoordinatorExternalState,
            replicationCoordinatorExternalState->makeInitialSyncOplogBuffer(txn.get()));
        bgsync->startup(txn.get());
        createOplog(txn.get());
    }
    ON_BLOCK_EXIT([&bgsync]() {
        log() << "Stopping replication fetcher thread for initial sync";
        auto txn = cc().makeOperationContext();
        bgsync->shutdown(txn.get());
        bgsync->join(txn.get());
    });

    int failedAttempts = 0;
    while (failedAttempts < kMaxFailedAttempts) {
        try {
            // leave loop when successful
            Status status = _initialSync(bgsync.get());
            if (status.isOK()) {
                break;
            } else {
                error() << status;
            }
        } catch (const DBException& e) {
            error() << e;
            // Return if in shutdown
            if (inShutdown()) {
                return;
            }
        }

        if (inShutdown()) {
            return;
        }

        error() << "initial sync attempt failed, " << (kMaxFailedAttempts - ++failedAttempts)
                << " attempts remaining";
        sleepmillis(durationCount<Milliseconds>(kInitialSyncRetrySleepDuration));
    }

    // No need to print a stack
    if (failedAttempts >= kMaxFailedAttempts) {
        severe() << "The maximum number of retries have been exhausted for initial sync.";
        fassertFailedNoTrace(16233);
    }
}
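ON_BLOCK_EXIT guarantees the fetcher thread is stopped on every path out of syncDoInitialSync, including early returns and exceptions. A minimal standalone scope guard with the same shape; MongoDB ships its own helper, so this is only the idiom (requires C++17 class template argument deduction):

#include <iostream>
#include <utility>

template <typename F>
class ScopeGuard {
public:
    explicit ScopeGuard(F f) : _f(std::move(f)) {}
    ~ScopeGuard() { _f(); }  // runs on every exit path, like ON_BLOCK_EXIT
    ScopeGuard(const ScopeGuard&) = delete;
    ScopeGuard& operator=(const ScopeGuard&) = delete;

private:
    F _f;
};

int main() {
    ScopeGuard guard([] { std::cout << "Stopping replication fetcher thread\n"; });
    std::cout << "Doing initial sync work\n";
    // An early return or a thrown exception here would still run the guard.
}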
Example #13
File: bgsync.cpp Project: gormanb/mongo
void BackgroundSync::_rollback(OperationContext* txn,
                               const HostAndPort& source,
                               stdx::function<DBClientBase*()> getConnection) {
    // Abort only when syncRollback detects we are in an unrecoverable state.
    // In other cases, we log the message contained in the error status and retry later.
    auto status = syncRollback(txn,
                               OplogInterfaceLocal(txn, rsOplogName),
                               RollbackSourceImpl(getConnection, source, rsOplogName),
                               _replCoord);
    if (status.isOK()) {
        // Signal the syncTail thread that there is no new data by adding a sentinel to the buffer.
        _signalNoNewDataForApplier(txn);
        // Wait until the buffer is empty.
        // This is an indication that syncTail has removed the sentinel marker from the buffer
        // and reset its local lastAppliedOpTime via the replCoord.
        while (!_oplogBuffer->isEmpty()) {
            sleepmillis(10);
            if (inShutdown()) {
                return;
            }
        }

        // At this point we are about to leave rollback.  Before we do, wait for any writes done
        // as part of rollback to be durable, and then do any necessary checks that we didn't
        // wind up rolling back something illegal.  We must wait for the rollback to be durable
        // so that if we wind up shutting down uncleanly in response to something we rolled back
        // we know that we won't wind up right back in the same situation when we start back up
        // because the rollback wasn't durable.
        txn->recoveryUnit()->waitUntilDurable();

        // If we detected that we rolled back the shardIdentity document as part of this rollback
        // then we must shut down to clear the in-memory ShardingState associated with the
        // shardIdentity document.
        if (ShardIdentityRollbackNotifier::get(txn)->didRollbackHappen()) {
            severe()
                << "shardIdentity document rollback detected.  Shutting down to clear "
                   "in-memory sharding state.  Restarting this process should safely return it "
                   "to a healthy state";
            fassertFailedNoTrace(40276);
        }

        // It is now safe to clear the ROLLBACK state, which may result in the applier thread
        // transitioning to SECONDARY.  This is safe because the applier thread has now reloaded
        // the new rollback minValid from the database.
        if (!_replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK)
                      << " but found self in " << _replCoord->getMemberState();
        }
        return;
    }
    if (ErrorCodes::UnrecoverableRollbackError == status.code()) {
        fassertNoTrace(28723, status);
    }
    warning() << "rollback cannot proceed at this time (retrying later): " << redact(status);
}
Example #14
File: db.cpp Project: judahschvimer/mongo
/**
 * Check that the oplog is capped, and abort the process if it is not.
 * Caller must lock DB before calling this function.
 */
void checkForCappedOplog(OperationContext* txn, Database* db) {
    const NamespaceString oplogNss(repl::rsOplogName);
    invariant(txn->lockState()->isDbLockedForMode(oplogNss.db(), MODE_IS));
    Collection* oplogCollection = db->getCollection(oplogNss);
    if (oplogCollection && !oplogCollection->isCapped()) {
        severe() << "The oplog collection " << oplogNss
                 << " is not capped; a capped oplog is a requirement for replication to function.";
        fassertFailedNoTrace(40115);
    }
}
Example #15
void markThreadIdle() {
    if (!threadIdleCallback) {
        return;
    }
    try {
        threadIdleCallback();
    } catch (...) {
        severe() << "Exception escaped from threadIdleCallback";
        fassertFailedNoTrace(28603);
    }
}
Example #16
File: sync_tail.cpp Project: Jaryli/mongo
// This free function is used by the initial sync writer threads to apply each op
void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
    initializeWriterThread();

    OperationContextImpl txn;
    txn.setReplicatedWrites(false);
    DisableDocumentValidation validationDisabler(&txn);

    // allow us to get through the magic barrier
    txn.lockState()->setIsBatchWriter(true);

    bool convertUpdatesToUpserts = false;

    for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) {
        try {
            const Status s = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts);
            if (!s.isOK()) {
                if (st->shouldRetry(&txn, *it)) {
                    const Status s2 = SyncTail::syncApply(&txn, *it, convertUpdatesToUpserts);
                    if (!s2.isOK()) {
                        severe() << "Error applying operation (" << it->toString() << "): " << s2;
                        fassertFailedNoTrace(15915);
                    }
                }

                // If shouldRetry() returns false, fall through.
                // This can happen if the document that was moved and missed by Cloner
                // subsequently got deleted and no longer exists on the Sync Target at all
            }
        } catch (const DBException& e) {
            severe() << "writer worker caught exception: " << causedBy(e)
                     << " on: " << it->toString();

            if (inShutdown()) {
                return;
            }

            fassertFailedNoTrace(16361);
        }
    }
}
Example #17
    // This free function is used by the writer threads to apply each op
    void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
        initializeWriterThread();

        // convert update operations only for 2.2.1 or greater, because we need guaranteed
        // idempotent operations for this to work.  See SERVER-6825
        bool convertUpdatesToUpserts = theReplSet->oplogVersion > 1;

        for (std::vector<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            try {
                OperationContextImpl txn;
                if (!st->syncApply(&txn, *it, convertUpdatesToUpserts)) {
                    fassertFailedNoTrace(16359);
                }
            } catch (const DBException& e) {
                error() << "writer worker caught exception: " << causedBy(e)
                        << " on: " << it->toString() << endl;
                fassertFailedNoTrace(16360);
            }
        }
    }
Example #18
void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx,
                                               Timestamp truncateTimestamp) {
    Timer timer;
    const NamespaceString oplogNss(NamespaceString::kRsOplogNamespace);
    AutoGetDb autoDb(opCtx, oplogNss.db(), MODE_IX);
    Lock::CollectionLock oplogCollectionLoc(opCtx->lockState(), oplogNss.ns(), MODE_X);
    Collection* oplogCollection = autoDb.getDb()->getCollection(opCtx, oplogNss);
    if (!oplogCollection) {
        fassertFailedWithStatusNoTrace(
            34418,
            Status(ErrorCodes::NamespaceNotFound,
                   str::stream() << "Can't find " << NamespaceString::kRsOplogNamespace.ns()));
    }

    // Scan through oplog in reverse, from latest entry to first, to find the truncateTimestamp.
    RecordId oldestIDToDelete;  // Non-null if there is something to delete.
    auto oplogRs = oplogCollection->getRecordStore();
    auto oplogReverseCursor = oplogRs->getCursor(opCtx, /*forward=*/false);
    size_t count = 0;
    while (auto next = oplogReverseCursor->next()) {
        const BSONObj entry = next->data.releaseToBson();
        const RecordId id = next->id;
        count++;

        const auto tsElem = entry["ts"];
        if (count == 1) {
            if (tsElem.eoo())
                LOG(2) << "Oplog tail entry: " << redact(entry);
            else
                LOG(2) << "Oplog tail entry ts field: " << tsElem;
        }

        if (tsElem.timestamp() < truncateTimestamp) {
            // If count == 1, that means that we have nothing to delete because everything in the
            // oplog is < truncateTimestamp.
            if (count != 1) {
                invariant(!oldestIDToDelete.isNull());
                oplogCollection->cappedTruncateAfter(opCtx, oldestIDToDelete, /*inclusive=*/true);
            }
            log() << "Replication recovery oplog truncation finished in: " << timer.millis()
                  << "ms";
            return;
        }

        oldestIDToDelete = id;
    }

    severe() << "Reached end of oplog looking for oplog entry before " << truncateTimestamp.toBSON()
             << " but couldn't find any after looking through " << count << " entries.";
    fassertFailedNoTrace(40296);
}
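The reverse cursor walk finds the newest entry strictly older than truncateTimestamp and removes everything from that point on, inclusive. The same logic distilled onto a sorted std::vector:

#include <cassert>
#include <cstddef>
#include <vector>

// Entries are in ascending timestamp order; drop every entry whose
// timestamp is >= truncateTimestamp, scanning from the back as above.
void truncateOplogTo(std::vector<int>& oplog, int truncateTimestamp) {
    std::size_t keep = oplog.size();
    while (keep > 0 && oplog[keep - 1] >= truncateTimestamp) {
        --keep;
    }
    oplog.resize(keep);
}

int main() {
    std::vector<int> oplog{1, 3, 5, 7, 9};
    truncateOplogTo(oplog, 5);
    assert((oplog == std::vector<int>{1, 3}));
}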
Example #19
    // This free function is used by the writer threads to apply each op
    void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
        initializeWriterThread();

        OperationContextImpl txn;

        // allow us to get through the magic barrier
        Lock::ParallelBatchWriterMode::iAmABatchParticipant(txn.lockState());

        bool convertUpdatesToUpserts = true;

        for (std::vector<BSONObj>::const_iterator it = ops.begin();
             it != ops.end();
             ++it) {
            try {
                if (!st->syncApply(&txn, *it, convertUpdatesToUpserts)) {
                    fassertFailedNoTrace(16359);
                }
            } catch (const DBException& e) {
                error() << "writer worker caught exception: " << causedBy(e)
                        << " on: " << it->toString() << endl;
                fassertFailedNoTrace(16360);
            }
        }
    }
Example #20
File: oplog.cpp Project: TylerBrock/mongo
    /** write an op to the oplog that is already built.
        todo : make _logOpRS() call this so we don't repeat ourselves?
        */
    OpTime _logOpObjRS(OperationContext* txn, const BSONObj& op) {
        Lock::DBLock lk(txn->lockState(), "local", newlm::MODE_X);
        // XXX soon this needs to be part of an outer WUOW not its own.
        // We can't do this yet due to locking limitations.
        WriteUnitOfWork wunit(txn);

        const OpTime ts = op["ts"]._opTime();
        long long hash = op["h"].numberLong();

        {
            if ( localOplogRSCollection == 0 ) {
                Client::Context ctx(txn, rsoplog);

                localDB = ctx.db();
                verify( localDB );
                localOplogRSCollection = localDB->getCollection(txn, rsoplog);
                massert(13389,
                        "local.oplog.rs missing. did you drop it? if so restart server",
                        localOplogRSCollection);
            }
            Client::Context ctx(txn, rsoplog, localDB);
            checkOplogInsert(localOplogRSCollection->insertDocument(txn, op, false));

            ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
            OpTime myLastOptime = replCoord->getMyLastOptime();
            if (!(myLastOptime < ts)) {
                severe() << "replication oplog stream went back in time. previous timestamp: "
                         << myLastOptime << " newest timestamp: " << ts;
                fassertFailedNoTrace(18905);
            }
            
            BackgroundSync* bgsync = BackgroundSync::get();
            // Keep this up-to-date, in case we step up to primary.
            bgsync->setLastAppliedHash(hash);

            ctx.getClient()->setLastOp( ts );
            
            replCoord->setMyLastOptime(txn, ts);
            bgsync->notify();
        }

        setNewOptime(ts);
        wunit.commit();
        return ts;
    }
Example #21
void syncDoInitialSync(BackgroundSync* bgsync) {
    stdx::unique_lock<stdx::mutex> lk(_initialSyncMutex, stdx::defer_lock);
    if (!lk.try_lock()) {
        uasserted(34474, "Initial Sync Already Active.");
    }

    {
        const ServiceContext::UniqueOperationContext txnPtr = cc().makeOperationContext();
        OperationContext& txn = *txnPtr;
        createOplog(&txn);
    }

    int failedAttempts = 0;
    while (failedAttempts < kMaxFailedAttempts) {
        try {
            // leave loop when successful
            Status status = _initialSync(bgsync);
            if (status.isOK()) {
                break;
            } else {
                error() << status;
            }
        } catch (const DBException& e) {
            error() << e;
            // Return if in shutdown
            if (inShutdown()) {
                return;
            }
        }

        if (inShutdown()) {
            return;
        }

        error() << "initial sync attempt failed, " << (kMaxFailedAttempts - ++failedAttempts)
                << " attempts remaining";
        sleepmillis(durationCount<Milliseconds>(kInitialSyncRetrySleepDuration));
    }

    // No need to print a stack
    if (failedAttempts >= kMaxFailedAttempts) {
        severe() << "The maximum number of retries have been exhausted for initial sync.";
        fassertFailedNoTrace(16233);
    }
}
Example #22
void DropPendingCollectionReaper::addDropPendingNamespace(
    const OpTime& dropOpTime, const NamespaceString& dropPendingNamespace) {
    invariant(dropPendingNamespace.isDropPendingNamespace());
    stdx::lock_guard<stdx::mutex> lock(_mutex);
    const auto equalRange = _dropPendingNamespaces.equal_range(dropOpTime);
    const auto& lowerBound = equalRange.first;
    const auto& upperBound = equalRange.second;
    auto matcher = [&dropPendingNamespace](const auto& pair) {
        return pair.second == dropPendingNamespace;
    };
    if (std::find_if(lowerBound, upperBound, matcher) == upperBound) {
        _dropPendingNamespaces.insert(std::make_pair(dropOpTime, dropPendingNamespace));
    } else {
        severe() << "Failed to add drop-pending collection " << dropPendingNamespace
                 << " with drop optime " << dropOpTime << ": duplicate optime and namespace pair.";
        fassertFailedNoTrace(40448);
    }
}
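The duplicate check in addDropPendingNamespace is the standard multimap idiom: equal_range narrows to the entries sharing the key, then find_if scans that slice for a matching value. The same check standalone:

#include <algorithm>
#include <cassert>
#include <map>
#include <string>

// Insert (opTime, ns) only if that exact pair is not already present.
bool addUnique(std::multimap<int, std::string>& m, int opTime, const std::string& ns) {
    const auto range = m.equal_range(opTime);
    const auto matcher = [&ns](const auto& pair) { return pair.second == ns; };
    if (std::find_if(range.first, range.second, matcher) != range.second) {
        return false;  // duplicate optime and namespace pair
    }
    m.emplace(opTime, ns);
    return true;
}

int main() {
    std::multimap<int, std::string> m;
    assert(addUnique(m, 1, "db.coll"));
    assert(!addUnique(m, 1, "db.coll"));  // rejected as a duplicate
}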
Example #23
    void syncDoInitialSync() {
        static const int maxFailedAttempts = 10;

        {
            OperationContextImpl txn;
            createOplog(&txn);
        }

        int failedAttempts = 0;
        while ( failedAttempts < maxFailedAttempts ) {
            try {
                // leave loop when successful
                Status status = _initialSync();
                if (status.isOK()) {
                    break;
                }
                if (status == ErrorCodes::InitialSyncOplogSourceMissing) {
                    sleepsecs(1);
                    return;
                }
            }
            catch(const DBException& e) {
                error() << e ;
                // Return if in shutdown
                if (inShutdown()) {
                    return;
                }
            }

            if (inShutdown()) {
                return;
            }

            error() << "initial sync attempt failed, "
                    << (maxFailedAttempts - ++failedAttempts) << " attempts remaining";
            sleepsecs(5);
        }

        // No need to print a stack
        if (failedAttempts >= maxFailedAttempts) {
            severe() << "The maximum number of retries have been exhausted for initial sync.";
            fassertFailedNoTrace(16233);
        }
    }
Example #24
Status WiredTigerSnapshotManager::setTransactionReadTimestamp(Timestamp pointInTime,
                                                              WT_SESSION* session,
                                                              bool roundToOldest) const {
    char readTSConfigString[15 /* read_timestamp= */ + 16 /* 16 hexadecimal digits */ +
                            17 /* ,round_to_oldest= */ + 5 /* false */ + 1 /* trailing null */];
    auto size = std::snprintf(readTSConfigString,
                              sizeof(readTSConfigString),
                              "read_timestamp=%llx,round_to_oldest=%s",
                              pointInTime.asULL(),
                              (roundToOldest) ? "true" : "false");
    if (size < 0) {
        int e = errno;
        error() << "error snprintf " << errnoWithDescription(e);
        fassertFailedNoTrace(40664);
    }
    invariant(static_cast<std::size_t>(size) < sizeof(readTSConfigString));

    return wtRCToStatus(session->timestamp_transaction(session, readTSConfigString));
}
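The buffer here is sized by summing worst-case field widths, and the snprintf result is then checked two ways: negative means failure, and a value >= the buffer size would mean truncation (which the invariant rules out). The same checks standalone:

#include <cerrno>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <iostream>

int main() {
    char buf[15 + 16 + 17 + 5 + 1];  // field widths summed, as above
    const int size = std::snprintf(buf, sizeof(buf),
                                   "read_timestamp=%llx,round_to_oldest=%s",
                                   0xdeadbeefULL, "false");
    if (size < 0) {
        std::cerr << "error snprintf " << std::strerror(errno) << '\n';
        return 1;
    }
    if (static_cast<std::size_t>(size) >= sizeof(buf)) {
        return 1;  // would indicate truncation
    }
    std::cout << buf << '\n';
}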
Example #25
File: rs.cpp Project: FromPointer/mongo
    /* forked as a thread during startup
       it can run quite a while looking for config.  but once found,
       a separate thread takes over as ReplSetImpl::Manager, and this thread
       terminates.
    */
    void startReplSets(ReplSetSeedList *replSetSeedList) {
        Client::initThread("rsStart");
        OperationContextImpl txn;

        try {
            verify( theReplSet == 0 );
            if( replSetSeedList == 0 ) {
                return;
            }
            replLocalAuth();
            (theReplSet = ReplSet::make(&txn, *replSetSeedList))->go();
        }
        catch(std::exception& e) {
            log() << "replSet caught exception in startReplSets thread: " << e.what() << rsLog;
            if( theReplSet )
                fassertFailedNoTrace(18756);
        }
        cc().shutdown();
    }
Example #26
// Uses the collator factory to convert the BSON representation of a collator to a
// CollatorInterface. Returns null if the BSONObj is empty. We expect the stored collation to be
// valid, since it gets validated on collection create.
std::unique_ptr<CollatorInterface> parseCollation(OperationContext* opCtx,
                                                  const NamespaceString& nss,
                                                  BSONObj collationSpec) {
    if (collationSpec.isEmpty()) {
        return {nullptr};
    }

    auto collator =
        CollatorFactoryInterface::get(opCtx->getServiceContext())->makeFromBSON(collationSpec);

    // If the collection's default collator has a version not currently supported by our ICU
    // integration, shut down the server. Errors other than IncompatibleCollationVersion should not
    // be possible, so these are an invariant rather than fassert.
    if (collator == ErrorCodes::IncompatibleCollationVersion) {
        log() << "Collection " << nss
              << " has a default collation which is incompatible with this version: "
              << collationSpec;
        fassertFailedNoTrace(40144);
    }
    invariantOK(collator.getStatus());

    return std::move(collator.getValue());
}
Example #27
File: db.cpp Project: judahschvimer/mongo
void repairDatabasesAndCheckVersion(OperationContext* txn) {
    LOG(1) << "enter repairDatabases (to check pdfile version #)";

    ScopedTransaction transaction(txn, MODE_X);
    Lock::GlobalWrite lk(txn->lockState());

    vector<string> dbNames;

    StorageEngine* storageEngine = txn->getServiceContext()->getGlobalStorageEngine();
    storageEngine->listDatabases(&dbNames);

    // Repair all databases first, so that we do not try to open them if they are in bad shape
    if (storageGlobalParams.repair) {
        invariant(!storageGlobalParams.readOnly);
        for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
            const string dbName = *i;
            LOG(1) << "    Repairing database: " << dbName;

            fassert(18506, repairDatabase(txn, storageEngine, dbName));
        }
    }

    const repl::ReplSettings& replSettings = repl::getGlobalReplicationCoordinator()->getSettings();

    // On replica set members we only clear temp collections on DBs other than "local" during
    // promotion to primary. On pure slaves, they are only cleared when the oplog tells them
    // to. The local DB is special because it is not replicated.  See SERVER-10927 for more
    // details.
    const bool shouldClearNonLocalTmpCollections =
        !(checkIfReplMissingFromCommandLine(txn) || replSettings.usingReplSets() ||
          replSettings.isSlave());

    const bool shouldDoCleanupForSERVER23299 = isSubjectToSERVER23299(txn);

    for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) {
        const string dbName = *i;
        LOG(1) << "    Recovering database: " << dbName;

        Database* db = dbHolder().openDb(txn, dbName);
        invariant(db);

        // First thing after opening the database is to check for file compatibility,
        // otherwise we might crash if this is a deprecated format.
        auto status = db->getDatabaseCatalogEntry()->currentFilesCompatible(txn);
        if (!status.isOK()) {
            if (status.code() == ErrorCodes::CanRepairToDowngrade) {
                // Convert CanRepairToDowngrade statuses to MustUpgrade statuses to avoid logging a
                // potentially confusing and inaccurate message.
                //
                // TODO SERVER-24097: Log a message informing the user that they can start the
                // current version of mongod with --repair and then proceed with normal startup.
                status = {ErrorCodes::MustUpgrade, status.reason()};
            }
            severe() << "Unable to start mongod due to an incompatibility with the data files and"
                        " this version of mongod: "
                     << redact(status);
            severe() << "Please consult our documentation when trying to downgrade to a previous"
                        " major release";
            quickExit(EXIT_NEED_UPGRADE);
            return;
        }

        // Check if admin.system.version contains an invalid featureCompatibilityVersion.
        // If a valid featureCompatibilityVersion is present, cache it as a server parameter.
        if (dbName == "admin") {
            if (Collection* versionColl =
                    db->getCollection(FeatureCompatibilityVersion::kCollection)) {
                BSONObj featureCompatibilityVersion;
                if (Helpers::findOne(txn,
                                     versionColl,
                                     BSON("_id" << FeatureCompatibilityVersion::kParameterName),
                                     featureCompatibilityVersion)) {
                    auto version = FeatureCompatibilityVersion::parse(featureCompatibilityVersion);
                    if (!version.isOK()) {
                        severe() << version.getStatus();
                        fassertFailedNoTrace(40283);
                    }
                    serverGlobalParams.featureCompatibilityVersion.store(version.getValue());
                }
            }
        }

        // Major versions match, check indexes
        const string systemIndexes = db->name() + ".system.indexes";

        Collection* coll = db->getCollection(systemIndexes);
        unique_ptr<PlanExecutor> exec(
            InternalPlanner::collectionScan(txn, systemIndexes, coll, PlanExecutor::YIELD_MANUAL));

        BSONObj index;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&index, NULL))) {
            const BSONObj key = index.getObjectField("key");
            const string plugin = IndexNames::findPluginName(key);

            if (db->getDatabaseCatalogEntry()->isOlderThan24(txn)) {
                if (IndexNames::existedBefore24(plugin)) {
                    continue;
                }

                log() << "Index " << index << " claims to be of type '" << plugin << "', "
                      << "which is either invalid or did not exist before v2.4. "
                      << "See the upgrade section: "
                      << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog;
            }

            const Status keyStatus = validateKeyPattern(key);
            if (!keyStatus.isOK()) {
                log() << "Problem with index " << index << ": " << redact(keyStatus)
                      << " This index can still be used however it cannot be rebuilt."
                      << " For more info see"
                      << " http://dochub.mongodb.org/core/index-validation" << startupWarningsLog;
            }

            if (index["v"].isNumber() && index["v"].numberInt() == 0) {
                log() << "WARNING: The index: " << index << " was created with the deprecated"
                      << " v:0 format.  This format will not be supported in a future release."
                      << startupWarningsLog;
                log() << "\t To fix this, you need to rebuild this index."
                      << " For instructions, see http://dochub.mongodb.org/core/rebuild-v0-indexes"
                      << startupWarningsLog;
            }
        }

        // Non-yielding collection scans from InternalPlanner will never error.
        invariant(PlanExecutor::IS_EOF == state);

        if (replSettings.usingReplSets()) {
            // We only care about the _id index if we are in a replset
            checkForIdIndexes(txn, db);
            // Ensure oplog is capped (mmap does not guarantee order of inserts on noncapped
            // collections)
            if (db->name() == "local") {
                checkForCappedOplog(txn, db);
            }
        }

        if (shouldDoCleanupForSERVER23299) {
            handleSERVER23299ForDb(txn, db);
        }

        if (!storageGlobalParams.readOnly &&
            (shouldClearNonLocalTmpCollections || dbName == "local")) {
            db->clearTmpCollections(txn);
        }
    }

    LOG(1) << "done repairDatabases";
}
Example #28
void ThreadPool::_consumeTasks() {
    stdx::unique_lock<stdx::mutex> lk(_mutex);
    while (_state == running) {
        if (_pendingTasks.empty()) {
            if (_threads.size() > _options.minThreads) {
                // Since there are more than minThreads threads, this thread may be eligible for
                // retirement. If it isn't now, it may be later, so it must put a time limit on how
                // long it waits on _workAvailable.
                const auto now = Date_t::now();
                const auto nextThreadRetirementDate =
                    _lastFullUtilizationDate + _options.maxIdleThreadAge;
                if (now >= nextThreadRetirementDate) {
                    _lastFullUtilizationDate = now;
                    LOG(1) << "Reaping this thread; next thread reaped no earlier than "
                           << _lastFullUtilizationDate + _options.maxIdleThreadAge;
                    break;
                }

                LOG(3) << "Not reaping because the earliest retirement date is "
                       << nextThreadRetirementDate;
                _workAvailable.wait_until(lk, nextThreadRetirementDate.toSystemTimePoint());
            } else {
                // Since the number of threads is not more than minThreads, this thread is not
                // eligible for retirement. It is OK to sleep until _workAvailable is signaled,
                // because any new threads that put the number of total threads above minThreads
                // would be eligible for retirement once they had no work left to do.
                LOG(3) << "waiting for work; I am one of " << _threads.size() << " thread(s);"
                       << " the minimum number of threads is " << _options.minThreads;
                _workAvailable.wait(lk);
            }
            continue;
        }

        _doOneTask(&lk);
    }

    // We still hold the lock, but this thread is retiring. If the whole pool is shutting down, this
    // thread lends a hand in draining the work pool and returns so it can be joined. Otherwise, it
    // falls through to the detach code, below.

    if (_state == joinRequired || _state == joining) {
        // Drain the leftover pending tasks.
        while (!_pendingTasks.empty()) {
            _doOneTask(&lk);
        }
        --_numIdleThreads;
        return;
    }
    --_numIdleThreads;

    if (_state != running) {
        severe() << "State of pool " << _options.poolName << " is " << static_cast<int32_t>(_state)
                 << ", but expected " << static_cast<int32_t>(running);
        fassertFailedNoTrace(28701);
    }

    // This thread is ending because it was idle for too long.  Find self in _threads, remove self
    // from _threads, detach self.
    for (size_t i = 0; i < _threads.size(); ++i) {
        auto& t = _threads[i];
        if (t.get_id() != stdx::this_thread::get_id()) {
            continue;
        }
        t.detach();
        t.swap(_threads.back());
        _threads.pop_back();
        return;
    }
    severe().stream() << "Could not find this thread, with id " << stdx::this_thread::get_id()
                      << " in pool " << _options.poolName;
    fassertFailedNoTrace(28703);
}
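The retirement logic hinges on condition_variable::wait_until: an idle thread waits for work but never past its retirement deadline, so surplus threads eventually wake on timeout and reap themselves. The bounded wait in isolation, with a placeholder deadline instead of _lastFullUtilizationDate + maxIdleThreadAge:

#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>

int main() {
    std::mutex m;
    std::condition_variable workAvailable;
    std::unique_lock<std::mutex> lk(m);

    const auto retirementDate =
        std::chrono::steady_clock::now() + std::chrono::milliseconds(100);
    if (workAvailable.wait_until(lk, retirementDate) == std::cv_status::timeout) {
        std::cout << "No work before the deadline; this thread would retire here\n";
    }
}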
Example #29
    void ReplSetImpl::loadConfig(OperationContext* txn) {
        startupStatus = LOADINGCONFIG;
        startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)");
        LOG(1) << "loadConfig() " << rsConfigNs << endl;

        while (1) {
            try {
                OwnedPointerVector<ReplSetConfig> configs;
                try {
                    configs.mutableVector().push_back(ReplSetConfig::makeDirect(txn));
                }
                catch (DBException& e) {
                    log() << "replSet exception loading our local replset configuration object : "
                          << e.toString() << rsLog;
                }
                for (vector<HostAndPort>::const_iterator i = _seeds->begin();
                        i != _seeds->end();
                        i++) {
                    try {
                        configs.mutableVector().push_back(ReplSetConfig::make(txn, *i));
                    }
                    catch (DBException& e) {
                        log() << "replSet exception trying to load config from " << *i
                              << " : " << e.toString() << rsLog;
                    }
                }
                ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings();
                {
                    scoped_lock lck(replSettings.discoveredSeeds_mx);
                    if (replSettings.discoveredSeeds.size() > 0) {
                        for (set<string>::iterator i = replSettings.discoveredSeeds.begin(); 
                             i != replSettings.discoveredSeeds.end(); 
                             i++) {
                            try {
                                configs.mutableVector().push_back(
                                                            ReplSetConfig::make(txn, HostAndPort(*i)));
                            }
                            catch (DBException&) {
                                LOG(1) << "replSet exception trying to load config from discovered "
                                          "seed " << *i << rsLog;
                                replSettings.discoveredSeeds.erase(*i);
                            }
                        }
                    }
                }

                if (!replSettings.reconfig.isEmpty()) {
                    try {
                        configs.mutableVector().push_back(ReplSetConfig::make(txn, replSettings.reconfig,
                                                                       true));
                    }
                    catch (DBException& re) {
                        log() << "replSet couldn't load reconfig: " << re.what() << rsLog;
                        replSettings.reconfig = BSONObj();
                    }
                }

                int nok = 0;
                int nempty = 0;
                for (vector<ReplSetConfig*>::iterator i = configs.mutableVector().begin();
                     i != configs.mutableVector().end(); i++) {
                    if ((*i)->ok())
                        nok++;
                    if ((*i)->empty())
                        nempty++;
                }
                if (nok == 0) {

                    if (nempty == (int) configs.mutableVector().size()) {
                        startupStatus = EMPTYCONFIG;
                        startupStatusMsg.set("can't get " + rsConfigNs +
                                             " config from self or any seed (EMPTYCONFIG)");
                        log() << "replSet can't get " << rsConfigNs
                              << " config from self or any seed (EMPTYCONFIG)" << rsLog;
                        static unsigned once;
                        if (++once == 1) {
                            log() << "replSet info you may need to run replSetInitiate -- rs.initia"
                                     "te() in the shell -- if that is not already done" << rsLog;
                        }
                        if (_seeds->size() == 0) {
                            LOG(1) << "replSet info no seed hosts were specified on the --replSet "
                                      "command line" << rsLog;
                        }
                    }
                    else {
                        startupStatus = EMPTYUNREACHABLE;
                        startupStatusMsg.set("can't currently get " + rsConfigNs +
                                             " config from self or any seed (EMPTYUNREACHABLE)");
                        log() << "replSet can't get " << rsConfigNs
                              << " config from self or any seed (yet)" << rsLog;
                    }

                    sleepsecs(1);
                    continue;
                }

                if (!_loadConfigFinish(txn, configs.mutableVector())) {
                    log() << "replSet info Couldn't load config yet. Sleeping 3 sec and will try "
                             "again." << rsLog;
                    sleepsecs(3);
                    continue;
                }
            }
            catch (DBException& e) {
                startupStatus = BADCONFIG;
                startupStatusMsg.set("replSet error loading set config (BADCONFIG)");
                log() << "replSet error loading configurations " << e.toString() << rsLog;
                log() << "replSet error replication will not start" << rsLog;
                sethbmsg("error loading set config");
                fassertFailedNoTrace(18754);
                throw;
            }
            break;
        }
        startupStatusMsg.set("? started");
        startupStatus = STARTED;
    }
Example #30
void ReplicationCoordinatorExternalStateImpl::cleanUpLastApplyBatch(OperationContext* txn) {
    if (_storageInterface->getInitialSyncFlag(txn)) {
        return;  // Initial Sync will take over so no cleanup is needed.
    }

    const auto deleteFromPoint = _storageInterface->getOplogDeleteFromPoint(txn);
    const auto appliedThrough = _storageInterface->getAppliedThrough(txn);

    const bool needToDeleteEndOfOplog = !deleteFromPoint.isNull() &&
        // This version should never have a non-null deleteFromPoint with a null appliedThrough.
        // This scenario means that we downgraded after unclean shutdown, then the downgraded node
        // deleted the ragged end of our oplog, then did a clean shutdown.
        !appliedThrough.isNull() &&
        // Similarly we should never have an appliedThrough higher than the deleteFromPoint. This
        // means that the downgraded node deleted our ragged end then applied ahead of our
        // deleteFromPoint and then had an unclean shutdown before upgrading. We are ok with
        // applying these ops because older versions wrote to the oplog from a single thread so we
        // know they are in order.
        !(appliedThrough.getTimestamp() >= deleteFromPoint);
    if (needToDeleteEndOfOplog) {
        log() << "Removing unapplied entries starting at: " << deleteFromPoint;
        truncateOplogTo(txn, deleteFromPoint);
    }
    _storageInterface->setOplogDeleteFromPoint(txn, {});  // clear the deleteFromPoint

    if (appliedThrough.isNull()) {
        // No follow-up work to do.
        return;
    }

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    const auto topOfOplog = fassertStatusOK(40290, loadLastOpTime(txn));
    if (appliedThrough >= topOfOplog) {
        return;  // We've applied all the valid oplog we have.
    }

    log() << "Replaying stored operations from " << appliedThrough << " (exclusive) to "
          << topOfOplog << " (inclusive).";

    DBDirectClient db(txn);
    auto cursor = db.query(rsOplogName,
                           QUERY("ts" << BSON("$gte" << appliedThrough.getTimestamp())),
                           /*batchSize*/ 0,
                           /*skip*/ 0,
                           /*projection*/ nullptr,
                           QueryOption_OplogReplay);

    // Check that the first document matches our appliedThrough point then skip it since it's
    // already been applied.
    if (!cursor->more()) {
        // This should really be impossible because we check above that the top of the oplog is
        // strictly > appliedThrough. If this fails it represents a serious bug in either the
        // storage engine or query's implementation of OplogReplay.
        severe() << "Couldn't find any entries in the oplog >= " << appliedThrough
                 << " which should be impossible.";
        fassertFailedNoTrace(40293);
    }
    auto firstOpTimeFound = fassertStatusOK(40291, OpTime::parseFromOplogEntry(cursor->nextSafe()));
    if (firstOpTimeFound != appliedThrough) {
        severe() << "Oplog entry at " << appliedThrough << " is missing; actual entry found is "
                 << firstOpTimeFound;
        fassertFailedNoTrace(40292);
    }

    // Apply remaining ops one at a time, but don't log them because they are already logged.
    const bool wereWritesReplicated = txn->writesAreReplicated();
    ON_BLOCK_EXIT([&] { txn->setReplicatedWrites(wereWritesReplicated); });
    txn->setReplicatedWrites(false);

    while (cursor->more()) {
        auto entry = cursor->nextSafe();
        fassertStatusOK(40294, SyncTail::syncApply(txn, entry, true));
        _storageInterface->setAppliedThrough(
            txn, fassertStatusOK(40295, OpTime::parseFromOplogEntry(entry)));
    }
}