Esempio n. 1
0
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    ApplyBatchFinalizer finalizer(replCoord);

    OperationContextImpl txn;
    OpTime originalEndOpTime(getMinValid(&txn).end);

    while (!inShutdown()) {
        OpQueue ops;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }
            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);
            }

            const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj lastOp = ops.back();
                const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }

            if (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                break;
            }

            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) &&  // tryPopAndWaitForMore returns
                                                                  // true when we need to end a
                                                                  // batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
            if (inShutdown())
                return;
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj lastOp = ops.back();
        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
Esempio n. 2
0
    /* tail an oplog.  ok to return, will be re-called. */
    void SyncTail::oplogApplication() {
        while( 1 ) {
            OpQueue ops;

            verify( !Lock::isLocked() );

            Timer batchTimer;
            int lastTimeChecked = 0;

            do {
                if (theReplSet->isPrimary()) {
                    massert(16620, "there are ops to sync, but I'm primary", ops.empty());
                    return;
                }

                int now = batchTimer.seconds();

                // apply replication batch limits
                if (!ops.empty()) {
                    if (now > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
                // occasionally check some things
                // (always checked in the first iteration of this do-while loop, because
                // ops is empty)
                if (ops.empty() || now > lastTimeChecked) {
                    {
                        boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
                        if (theReplSet->initialSyncRequested) {
                            // got a resync command
                            return;
                        }
                    }
                    lastTimeChecked = now;
                    // can we become secondary?
                    // we have to check this before calling mgr, as we must be a secondary to
                    // become primary
                    if (!theReplSet->isSecondary()) {
                        OpTime minvalid;

                        OperationContextImpl txn;
                        theReplSet->tryToGoLiveAsASecondary(&txn, minvalid);
                    }

                    // normally msgCheckNewState gets called periodically, but in a single node
                    // replset there are no heartbeat threads, so we do it here to be sure.  this is
                    // relevant if the singleton member has done a stepDown() and needs to come back
                    // up.
                    if (theReplSet->config().members.size() == 1 &&
                        theReplSet->myConfig().potentiallyHot()) {
                        Manager* mgr = theReplSet->mgr;
                        // When would mgr be null?  During replsettest'ing, in which case we should
                        // fall through and actually apply ops as if we were a real secondary.
                        if (mgr) { 
                            mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                            sleepsecs(1);
                            // There should never be ops to sync in a 1-member set, anyway
                            return;
                        }
                    }
                }

                const int slaveDelaySecs = theReplSet->myConfig().slaveDelay;
                if (!ops.empty() && slaveDelaySecs > 0) {
                    const BSONObj& lastOp = ops.getDeque().back();
                    const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                    // Stop the batch as the lastOp is too new to be applied. If we continue
                    // on, we can get ops that are way ahead of the delay and this will
                    // make this thread sleep longer when handleSlaveDelay is called
                    // and apply ops much sooner than we like.
                    if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                        break;
                    }
                }
                // keep fetching more ops as long as we haven't filled up a full batch yet
            } while (!tryPopAndWaitForMore(&ops) && // tryPopAndWaitForMore returns true 
                                                    // when we need to end a batch early
                   (ops.getSize() < replBatchLimitBytes));

            // For pausing replication in tests
            while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                sleepmillis(0);
            }

            const BSONObj& lastOp = ops.getDeque().back();
            setOplogVersion(lastOp);
            handleSlaveDelay(lastOp);

            // Set minValid to the last op to be applied in this next batch.
            // This will cause this node to go into RECOVERING state
            // if we should crash and restart before updating the oplog
            theReplSet->setMinValid(lastOp);

            if (BackgroundSync::get()->isAssumingPrimary()) {
                LOG(1) << "about to apply batch up to optime: "
                       << ops.getDeque().back()["ts"]._opTime().toStringPretty();
            }
            
            multiApply(ops.getDeque(), multiSyncApply);

            if (BackgroundSync::get()->isAssumingPrimary()) {
                LOG(1) << "about to update oplog to optime: "
                       << ops.getDeque().back()["ts"]._opTime().toStringPretty();
            }
            
            applyOpsToOplog(&ops.getDeque());

            // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
            if (!theReplSet->mgr) {
                BSONObj op;
                if (!peek(&op)) {
                    return;
                }
            }
        }
    }
Esempio n. 3
0
    /* tail an oplog.  ok to return, will be re-called. */
    void SyncTail::oplogApplication() {
        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

        while(!inShutdown()) {
            OpQueue ops;
            OperationContextImpl txn;

            Timer batchTimer;
            int lastTimeChecked = 0;

            do {
                int now = batchTimer.seconds();

                // apply replication batch limits
                if (!ops.empty()) {
                    if (now > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
                // occasionally check some things
                // (always checked in the first iteration of this do-while loop, because
                // ops is empty)
                if (ops.empty() || now > lastTimeChecked) {
                    BackgroundSync* bgsync = BackgroundSync::get();
                    if (bgsync->getInitialSyncRequestedFlag()) {
                        // got a resync command
                        Lock::DBLock lk(txn.lockState(), "local", MODE_X);
                        WriteUnitOfWork wunit(&txn);
                        Client::Context ctx(&txn, "local");

                        ctx.db()->dropCollection(&txn, "local.oplog.rs");

                        // Note: the following order is important.
                        // The bgsync thread uses an empty optime as a sentinel to know to wait
                        // for initial sync (done in this thread after we return); thus, we must
                        // ensure the lastAppliedOptime is empty before pausing the bgsync thread
                        // via stop().
                        // We must clear the sync source blacklist after calling stop()
                        // because the bgsync thread, while running, may update the blacklist.
                        replCoord->setMyLastOptime(&txn, OpTime());
                        bgsync->stop();
                        replCoord->clearSyncSourceBlacklist();

                        wunit.commit();

                        return;
                    }
                    lastTimeChecked = now;
                    // can we become secondary?
                    // we have to check this before calling mgr, as we must be a secondary to
                    // become primary
                    tryToGoLiveAsASecondary(&txn, replCoord);

                    // TODO(emilkie): This can be removed once we switch over from legacy;
                    // this code is what moves 1-node sets to PRIMARY state.
                    // normally msgCheckNewState gets called periodically, but in a single node
                    // replset there are no heartbeat threads, so we do it here to be sure.  this is
                    // relevant if the singleton member has done a stepDown() and needs to come back
                    // up.
                    if (theReplSet &&
                            theReplSet->config().members.size() == 1 &&
                            theReplSet->myConfig().potentiallyHot()) {
                        Manager* mgr = theReplSet->mgr;
                        // When would mgr be null?  During replsettest'ing, in which case we should
                        // fall through and actually apply ops as if we were a real secondary.
                        if (mgr) { 
                            mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                            sleepsecs(1);
                            // There should never be ops to sync in a 1-member set, anyway
                            return;
                        }
                    }
                }

                const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
                if (!ops.empty() && slaveDelaySecs > 0) {
                    const BSONObj& lastOp = ops.getDeque().back();
                    const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                    // Stop the batch as the lastOp is too new to be applied. If we continue
                    // on, we can get ops that are way ahead of the delay and this will
                    // make this thread sleep longer when handleSlaveDelay is called
                    // and apply ops much sooner than we like.
                    if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                        break;
                    }
                }
                // keep fetching more ops as long as we haven't filled up a full batch yet
            } while (!tryPopAndWaitForMore(&ops, replCoord) && // tryPopAndWaitForMore returns true 
                                                               // when we need to end a batch early
                   (ops.getSize() < replBatchLimitBytes) &&
                   !inShutdown());

            // For pausing replication in tests
            while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                sleepmillis(0);
            }

            if (ops.empty()) {
                continue;
            }

            const BSONObj& lastOp = ops.getDeque().back();
            handleSlaveDelay(lastOp);

            if (replCoord->getCurrentMemberState().primary() && 
                !replCoord->isWaitingForApplierToDrain()) {
                severe() << "attempting to replicate ops while primary";
                fassertFailed(28527);
            }

            // Set minValid to the last op to be applied in this next batch.
            // This will cause this node to go into RECOVERING state
            // if we should crash and restart before updating the oplog
            OpTime minValid = lastOp["ts"]._opTime();
            setMinValid(&txn, minValid);

            multiApply(ops.getDeque());

            applyOpsToOplog(&ops.getDeque());

            // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
            // TODO(spencer): Remove repltest.cpp dbtest or make this work with the new replication
            // coordinator
            if (theReplSet && !theReplSet->mgr) {
                BSONObj op;
                if (!peek(&op)) {
                    return;
                }
            }
        }
    }
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    OpQueueBatcher batcher(this);

    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);
    ApplyBatchFinalizer finalizer(replCoord);

    auto minValidBoundaries = getMinValid(&txn);
    OpTime originalEndOpTime(minValidBoundaries.end);
    OpTime lastWriteOpTime{replCoord->getMyLastOptime()};
    while (!inShutdown()) {
        OpQueue ops;

        do {
            if (BackgroundSync::get()->getInitialSyncRequestedFlag()) {
                // got a resync command
                return;
            }

            tryToGoLiveAsASecondary(&txn, replCoord, minValidBoundaries, lastWriteOpTime);

            // Blocks up to a second waiting for a batch to be ready to apply. If one doesn't become
            // ready in time, we'll loop again so we can do the above checks periodically.
            ops = batcher.getNextBatch(Seconds(1));
        } while (!inShutdown() && ops.empty());

        if (inShutdown())
            return;

        invariant(!ops.empty());

        const BSONObj lastOp = ops.back().raw;

        if (lastOp.isEmpty()) {
            // This means that the network thread has coalesced and we have processed all of its
            // data.
            invariant(ops.getDeque().size() == 1);
            if (replCoord->isWaitingForApplierToDrain()) {
                replCoord->signalDrainComplete(&txn);
            }
            continue;  // This wasn't a real op. Don't try to apply it.
        }

        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        lastWriteOpTime = multiApply(&txn, ops);
        setNewTimestamp(lastWriteOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        minValidBoundaries.start = {};
        minValidBoundaries.end = end;
        finalizer.record(lastWriteOpTime);
    }
}
Esempio n. 5
0
    // returns number of seconds to sleep, if any
    uint32_t BackgroundSync::produce() {

        // normally msgCheckNewState gets called periodically, but in a single node repl set
        // there are no heartbeat threads, so we do it here to be sure.  this is relevant if the
        // singleton member has done a stepDown() and needs to come back up.
        if (theReplSet->config().members.size() == 1 &&
            theReplSet->myConfig().potentiallyHot()) {
            Manager* mgr = theReplSet->mgr;
            // When would mgr be null?  During replsettest'ing, in which case we should
            // fall through and actually apply ops as if we were a real secondary.
            if (mgr) {
                mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                // There should never be ops to sync in a 1-member set, anyway
                return 1;
            }
        }

        OplogReader r(true /* doHandshake */);

        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        GTID lastGTIDFetched = theReplSet->gtidManager->getLiveState();
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                // if there is no one to sync from
                return 1; //sleep one second
            }
        }
        r.tailingQueryGTE(rsoplog, lastGTIDFetched);

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return 0;
        }

        try {
            // this method may actually run rollback, yes, the name is bad
            if (isRollbackRequired(r)) {
                // sleep 2 seconds and try again. (The 2 is arbitrary).
                // If we are not fatal, then we will keep trying to sync
                // from another machine
                return 2;
            }
        }
        catch (RollbackOplogException& re){
            // we attempted a rollback and failed, we must go fatal.
            log() << "Caught a RollbackOplogException during rollback, going fatal" << rsLog;
            theReplSet->fatal();
            return 2; // 2 is arbitrary, if we are going fatal, we are done
        }

        while (!_opSyncShouldExit) {
            while (!_opSyncShouldExit) {
                {
                    // check if we should bail out
                    boost::unique_lock<boost::mutex> lck(_mutex);
                    if (!_opSyncShouldRun) {
                        return 0;
                    }
                }
                if (!r.moreInCurrentBatch()) {
                    // check to see if we have a request to sync
                    // from a specific target. If so, get out so that
                    // we can restart the act of syncing and
                    // do so from the correct target
                    if (theReplSet->gotForceSync()) {
                        return 0;
                    }

                    verify(!theReplSet->isPrimary());

                    if (shouldChangeSyncTarget()) {
                        return 0;
                    }
                    //record time for each getmore
                    {
                        TimerHolder batchTimer(&getmoreReplStats);
                        r.more();
                    }
                    //increment
                    networkByteStats.increment(r.currentBatchMessageSize());

                }

                if (!r.more()) {
                    break;
                }

                // This is the operation we have received from the target
                // that we must put in our oplog with an applied field of false
                BSONObj o = r.nextSafe().getOwned();
                opsReadStats.increment();
                LOG(3) << "replicating " << o.toString(false, true) << " from " << _currentSyncTarget->fullName() << endl;
                uint64_t ts = o["ts"]._numberLong();

                // now that we have the element in o, let's check
                // if there a delay is required (via slaveDelay) before
                // writing it to the oplog
                if (theReplSet->myConfig().slaveDelay > 0) {
                    handleSlaveDelay(ts);
                    {
                        boost::unique_lock<boost::mutex> lck(_mutex);
                        if (!_opSyncShouldRun) {
                            break;
                        }
                    }
                }

                {
                    Timer timer;
                    bool bigTxn = false;
                    {
                        Client::Transaction transaction(DB_SERIALIZABLE);
                        replicateFullTransactionToOplog(o, r, &bigTxn);
                        // we are operating as a secondary. We don't have to fsync
                        transaction.commit(DB_TXN_NOSYNC);
                    }
                    {
                        GTID currEntry = getGTIDFromOplogEntry(o);
                        uint64_t lastHash = o["h"].numberLong();
                        boost::unique_lock<boost::mutex> lock(_mutex);
                        // update counters
                        theReplSet->gtidManager->noteGTIDAdded(currEntry, ts, lastHash);
                        // notify applier thread that data exists
                        if (_deque.size() == 0) {
                            _queueCond.notify_all();
                        }
                        _deque.push_back(o);
                        bufferCountGauge.increment();
                        bufferSizeGauge.increment(o.objsize());
                        // this is a flow control mechanism, with bad numbers
                        // hard coded for now just to get something going.
                        // If the opSync thread notices that we have over 20000
                        // transactions in the queue, it waits until we get below
                        // 10000. This is where we wait if we get too high
                        // Once we have spilling of transactions working, this
                        // logic will need to be redone
                        if (_deque.size() > 20000) {
                            _queueCond.wait(lock);
                        }
                        if (bigTxn) {
                            // if we have a large transaction, we don't want
                            // to let it pile up. We want to process it immedietely
                            // before processing anything else.
                            while (_deque.size() > 0) {
                                _queueDone.wait(lock);
                            }
                        }
                    }
                }
            } // end while

            if (shouldChangeSyncTarget()) {
                return 0;
            }

            r.tailCheck();
            if( !r.haveCursor() ) {
                LOG(1) << "replSet end opSync pass" << rsLog;
                return 0;
            }

            // looping back is ok because this is a tailable cursor
        }
        return 0;
    }
Esempio n. 6
0
    /* tail an oplog.  ok to return, will be re-called. */
    void SyncTail::oplogApplication() {
        while( 1 ) {
            OpQueue ops;

            verify( !Lock::isLocked() );

            Timer batchTimer;
            int lastTimeChecked = 0;

            // always fetch a few ops first
            // tryPopAndWaitForMore returns true when we need to end a batch early
            while (!tryPopAndWaitForMore(&ops) && 
                   (ops.getSize() < replBatchLimitBytes)) {

                if (theReplSet->isPrimary()) {
                    massert(16620, "there are ops to sync, but I'm primary", ops.empty());
                    return;
                }

                int now = batchTimer.seconds();

                // apply replication batch limits
                if (!ops.empty()) {
                    if (now > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
                // occasionally check some things
                if (ops.empty() || now > lastTimeChecked) {
                    lastTimeChecked = now;
                    // can we become secondary?
                    // we have to check this before calling mgr, as we must be a secondary to
                    // become primary
                    if (!theReplSet->isSecondary()) {
                        OpTime minvalid;
                        theReplSet->tryToGoLiveAsASecondary(minvalid);
                    }

                    // normally msgCheckNewState gets called periodically, but in a single node repl set
                    // there are no heartbeat threads, so we do it here to be sure.  this is relevant if the
                    // singleton member has done a stepDown() and needs to come back up.
                    if (theReplSet->config().members.size() == 1 &&
                        theReplSet->myConfig().potentiallyHot()) {
                        Manager* mgr = theReplSet->mgr;
                        // When would mgr be null?  During replsettest'ing.
                        if (mgr) mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                        sleepsecs(1);
                        // There should never be ops to sync in a 1-member set, anyway
                        return;
                    }
                }

                const int slaveDelaySecs = theReplSet->myConfig().slaveDelay;
                if (!ops.empty() && slaveDelaySecs > 0) {
                    const BSONObj& lastOp = ops.getDeque().back();
                    const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                    // Stop the batch as the lastOp is too new to be applied. If we continue
                    // on, we can get ops that are way ahead of the delay and this will
                    // make this thread sleep longer when handleSlaveDelay is called
                    // and apply ops much sooner than we like.
                    if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                        break;
                    }
                }
            }

            // For pausing replication in tests
            while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                sleepmillis(0);
            }

            const BSONObj& lastOp = ops.getDeque().back();
            setOplogVersion(lastOp);
            handleSlaveDelay(lastOp);

            // Set minValid to the last op to be applied in this next batch.
            // This will cause this node to go into RECOVERING state
            // if we should crash and restart before updating the oplog
            theReplSet->setMinValid(lastOp);

            multiApply(ops.getDeque(), multiSyncApply);

            applyOpsToOplog(&ops.getDeque());
        }
    }