/* tail an oplog. ok to return, will be re-called. */
// Applier main loop (ReplicationCoordinator era): repeatedly gathers a batch of
// oplog entries (bounded by seconds, op count and bytes), records the batch
// boundaries in minValid before applying so a crash mid-batch is detectable on
// restart, applies the batch via multiApply(), then records the final optime
// with the ApplyBatchFinalizer.
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    ApplyBatchFinalizer finalizer(replCoord);

    OperationContextImpl txn;
    // End optime left over from a previous run (possibly a failed batch); used
    // below so minValid.end only ever moves forward.
    OpTime originalEndOpTime(getMinValid(&txn).end);
    while (!inShutdown()) {
        OpQueue ops;
        Timer batchTimer;
        int lastTimeChecked = 0;

        // Accumulate ops until a batch limit is hit or tryPopAndWaitForMore()
        // asks us to end the batch early.
        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);
            }

            const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj lastOp = ops.back();
                const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }

            if (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                break;
            }

            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) &&  // tryPopAndWaitForMore returns
                                                                  // true when we need to end a
                                                                  // batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
            if (inShutdown())
                return;
        }

        // Nothing to apply this round (e.g. batch ended early with no ops).
        if (ops.empty()) {
            continue;
        }

        const BSONObj lastOp = ops.back();
        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        // Batch applied; record the new end without requiring durability.
        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
/* tail an oplog. ok to return, will be re-called. */
// Legacy (pre-ReplicationCoordinator) applier loop driven by the global
// theReplSet object: batches ops from the bgsync queue, writes minValid before
// applying, applies with multiSyncApply, then writes the batch to our own oplog.
void SyncTail::oplogApplication() {
    while( 1 ) {
        OpQueue ops;

        // The applier must not be holding any lock between batches.
        verify( !Lock::isLocked() );

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            // If we were elected primary, there must be nothing buffered to apply.
            if (theReplSet->isPrimary()) {
                massert(16620, "there are ops to sync, but I'm primary", ops.empty());
                return;
            }

            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                {
                    boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
                    if (theReplSet->initialSyncRequested) {
                        // got a resync command
                        return;
                    }
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                if (!theReplSet->isSecondary()) {
                    OpTime minvalid;
                    OperationContextImpl txn;
                    theReplSet->tryToGoLiveAsASecondary(&txn, minvalid);
                }

                // normally msgCheckNewState gets called periodically, but in a single node
                // replset there are no heartbeat threads, so we do it here to be sure. this is
                // relevant if the singleton member has done a stepDown() and needs to come back
                // up.
                if (theReplSet->config().members.size() == 1 &&
                    theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null? During replsettest'ing, in which case we should
                    // fall through and actually apply ops as if we were a real secondary.
                    if (mgr) {
                        mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                        sleepsecs(1);
                        // There should never be ops to sync in a 1-member set, anyway
                        return;
                    }
                }
            }

            const int slaveDelaySecs = theReplSet->myConfig().slaveDelay;
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj& lastOp = ops.getDeque().back();
                const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }
            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&ops) &&  // tryPopAndWaitForMore returns true
                                                 // when we need to end a batch early
                 (ops.getSize() < replBatchLimitBytes));

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
        }

        // NOTE(review): no empty-batch guard before back() here; this relies on the
        // fetch loop only exiting with at least one op queued -- confirm against
        // tryPopAndWaitForMore()'s contract.
        const BSONObj& lastOp = ops.getDeque().back();
        setOplogVersion(lastOp);
        handleSlaveDelay(lastOp);

        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        theReplSet->setMinValid(lastOp);

        if (BackgroundSync::get()->isAssumingPrimary()) {
            LOG(1) << "about to apply batch up to optime: "
                   << ops.getDeque().back()["ts"]._opTime().toStringPretty();
        }

        multiApply(ops.getDeque(), multiSyncApply);

        if (BackgroundSync::get()->isAssumingPrimary()) {
            LOG(1) << "about to update oplog to optime: "
                   << ops.getDeque().back()["ts"]._opTime().toStringPretty();
        }

        // Record the applied batch in our local oplog.
        applyOpsToOplog(&ops.getDeque());

        // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
        if (!theReplSet->mgr) {
            BSONObj op;
            if (!peek(&op)) {
                return;
            }
        }
    }
}
/* tail an oplog. ok to return, will be re-called. */
// Hybrid-era applier loop: uses the ReplicationCoordinator for state but still
// consults the legacy theReplSet object for single-node handling. Batches ops,
// writes minValid before applying, applies, then writes the batch to the oplog.
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    while(!inShutdown()) {
        OpQueue ops;
        OperationContextImpl txn;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command: drop the local oplog so initial sync
                    // will run afresh after this thread returns.
                    Lock::DBLock lk(txn.lockState(), "local", MODE_X);
                    WriteUnitOfWork wunit(&txn);
                    Client::Context ctx(&txn, "local");

                    ctx.db()->dropCollection(&txn, "local.oplog.rs");

                    // Note: the following order is important.
                    // The bgsync thread uses an empty optime as a sentinel to know to wait
                    // for initial sync (done in this thread after we return); thus, we must
                    // ensure the lastAppliedOptime is empty before pausing the bgsync thread
                    // via stop().
                    // We must clear the sync source blacklist after calling stop()
                    // because the bgsync thread, while running, may update the blacklist.
                    replCoord->setMyLastOptime(&txn, OpTime());
                    bgsync->stop();
                    replCoord->clearSyncSourceBlacklist();

                    wunit.commit();

                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);

                // TODO(emilkie): This can be removed once we switch over from legacy;
                // this code is what moves 1-node sets to PRIMARY state.
                // normally msgCheckNewState gets called periodically, but in a single node
                // replset there are no heartbeat threads, so we do it here to be sure. this is
                // relevant if the singleton member has done a stepDown() and needs to come back
                // up.
                if (theReplSet &&
                    theReplSet->config().members.size() == 1 &&
                    theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null? During replsettest'ing, in which case we should
                    // fall through and actually apply ops as if we were a real secondary.
                    if (mgr) {
                        mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                        sleepsecs(1);
                        // There should never be ops to sync in a 1-member set, anyway
                        return;
                    }
                }
            }

            const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj& lastOp = ops.getDeque().back();
                const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }
            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&ops, replCoord) &&  // tryPopAndWaitForMore returns true
                                                            // when we need to end a batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
        }

        // Nothing to apply this round.
        if (ops.empty()) {
            continue;
        }

        const BSONObj& lastOp = ops.getDeque().back();
        handleSlaveDelay(lastOp);

        // Applying ops while primary (outside of drain mode) is a fatal invariant violation.
        if (replCoord->getCurrentMemberState().primary() &&
            !replCoord->isWaitingForApplierToDrain()) {
            severe() << "attempting to replicate ops while primary";
            fassertFailed(28527);
        }

        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        OpTime minValid = lastOp["ts"]._opTime();
        setMinValid(&txn, minValid);

        multiApply(ops.getDeque());

        // Record the applied batch in our local oplog.
        applyOpsToOplog(&ops.getDeque());

        // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
        // TODO(spencer): Remove repltest.cpp dbtest or make this work with the new replication
        // coordinator
        if (theReplSet && !theReplSet->mgr) {
            BSONObj op;
            if (!peek(&op)) {
                return;
            }
        }
    }
}
/* tail an oplog. ok to return, will be re-called. */
// Batcher-era applier loop: batch construction is delegated to OpQueueBatcher;
// this thread waits for batches, handles the empty-sentinel op used for drain
// mode, maintains minValid around each apply, and records the last applied
// optime with the ApplyBatchFinalizer.
void SyncTail::oplogApplication() {
    OpQueueBatcher batcher(this);

    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);
    ApplyBatchFinalizer finalizer(replCoord);

    // minValid boundaries persisted by a previous run; end is carried forward
    // so we never report consistency earlier than a prior failed batch's end.
    auto minValidBoundaries = getMinValid(&txn);
    OpTime originalEndOpTime(minValidBoundaries.end);
    OpTime lastWriteOpTime{replCoord->getMyLastOptime()};
    while (!inShutdown()) {
        OpQueue ops;

        do {
            if (BackgroundSync::get()->getInitialSyncRequestedFlag()) {
                // got a resync command
                return;
            }

            tryToGoLiveAsASecondary(&txn, replCoord, minValidBoundaries, lastWriteOpTime);

            // Blocks up to a second waiting for a batch to be ready to apply. If one doesn't become
            // ready in time, we'll loop again so we can do the above checks periodically.
            ops = batcher.getNextBatch(Seconds(1));
        } while (!inShutdown() && ops.empty());

        if (inShutdown())
            return;

        invariant(!ops.empty());

        const BSONObj lastOp = ops.back().raw;

        if (lastOp.isEmpty()) {
            // This means that the network thread has coalesced and we have processed all of its
            // data.
            invariant(ops.getDeque().size() == 1);
            if (replCoord->isWaitingForApplierToDrain()) {
                replCoord->signalDrainComplete(&txn);
            }
            continue;  // This wasn't a real op. Don't try to apply it.
        }

        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        lastWriteOpTime = multiApply(&txn, ops);
        setNewTimestamp(lastWriteOpTime.getTimestamp());

        // Batch applied; clear the in-progress start marker and persist the new end.
        setMinValid(&txn, end, DurableRequirement::None);
        minValidBoundaries.start = {};
        minValidBoundaries.end = end;
        finalizer.record(lastWriteOpTime);
    }
}
// returns number of seconds to sleep, if any
// Producer loop: tails the sync target's oplog, replicates each fetched entry
// into the local oplog inside its own transaction, then enqueues it on _deque
// for the applier thread. Return value is a sleep hint for the caller
// (0 = retry immediately, 1/2 = back off that many seconds).
uint32_t BackgroundSync::produce() {
    // normally msgCheckNewState gets called periodically, but in a single node repl set
    // there are no heartbeat threads, so we do it here to be sure. this is relevant if the
    // singleton member has done a stepDown() and needs to come back up.
    if (theReplSet->config().members.size() == 1 &&
        theReplSet->myConfig().potentiallyHot()) {
        Manager* mgr = theReplSet->mgr;
        // When would mgr be null? During replsettest'ing, in which case we should
        // fall through and actually apply ops as if we were a real secondary.
        if (mgr) {
            mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr));
            // There should never be ops to sync in a 1-member set, anyway
            return 1;
        }
    }

    OplogReader r(true /* doHandshake */);

    // find a target to sync from the last op time written
    getOplogReader(r);

    // no server found
    GTID lastGTIDFetched = theReplSet->gtidManager->getLiveState();
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (_currentSyncTarget == NULL) {
            // if there is no one to sync from
            return 1; //sleep one second
        }
    }
    r.tailingQueryGTE(rsoplog, lastGTIDFetched);

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!r.haveCursor()) {
        return 0;
    }

    try {
        // this method may actually run rollback, yes, the name is bad
        if (isRollbackRequired(r)) {
            // sleep 2 seconds and try again. (The 2 is arbitrary).
            // If we are not fatal, then we will keep trying to sync
            // from another machine
            return 2;
        }
    }
    catch (RollbackOplogException& re){
        // we attempted a rollback and failed, we must go fatal.
        log() << "Caught a RollbackOplogException during rollback, going fatal" << rsLog;
        theReplSet->fatal();
        return 2; // 2 is arbitrary, if we are going fatal, we are done
    }

    while (!_opSyncShouldExit) {
        // Inner loop: drain the tailable cursor batch by batch.
        while (!_opSyncShouldExit) {
            {
                // check if we should bail out
                boost::unique_lock<boost::mutex> lck(_mutex);
                if (!_opSyncShouldRun) {
                    return 0;
                }
            }

            if (!r.moreInCurrentBatch()) {
                // check to see if we have a request to sync
                // from a specific target. If so, get out so that
                // we can restart the act of syncing and
                // do so from the correct target
                if (theReplSet->gotForceSync()) {
                    return 0;
                }

                verify(!theReplSet->isPrimary());

                if (shouldChangeSyncTarget()) {
                    return 0;
                }
                //record time for each getmore
                {
                    TimerHolder batchTimer(&getmoreReplStats);
                    r.more();
                }
                //increment
                networkByteStats.increment(r.currentBatchMessageSize());
            }

            if (!r.more()) {
                break;
            }

            // This is the operation we have received from the target
            // that we must put in our oplog with an applied field of false
            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();
            LOG(3) << "replicating " << o.toString(false, true) << " from " << _currentSyncTarget->fullName() << endl;
            uint64_t ts = o["ts"]._numberLong();

            // now that we have the element in o, let's check
            // if there a delay is required (via slaveDelay) before
            // writing it to the oplog
            if (theReplSet->myConfig().slaveDelay > 0) {
                handleSlaveDelay(ts);
                {
                    // re-check the run flag after a possibly long delay
                    boost::unique_lock<boost::mutex> lck(_mutex);
                    if (!_opSyncShouldRun) {
                        break;
                    }
                }
            }

            {
                // NOTE(review): `timer` is declared but never read here -- looks
                // like leftover instrumentation; confirm before removing.
                Timer timer;
                bool bigTxn = false;
                {
                    Client::Transaction transaction(DB_SERIALIZABLE);
                    replicateFullTransactionToOplog(o, r, &bigTxn);
                    // we are operating as a secondary. We don't have to fsync
                    transaction.commit(DB_TXN_NOSYNC);
                }
                {
                    GTID currEntry = getGTIDFromOplogEntry(o);
                    uint64_t lastHash = o["h"].numberLong();
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    // update counters
                    theReplSet->gtidManager->noteGTIDAdded(currEntry, ts, lastHash);
                    // notify applier thread that data exists
                    if (_deque.size() == 0) {
                        _queueCond.notify_all();
                    }
                    _deque.push_back(o);
                    bufferCountGauge.increment();
                    bufferSizeGauge.increment(o.objsize());
                    // this is a flow control mechanism, with bad numbers
                    // hard coded for now just to get something going.
                    // If the opSync thread notices that we have over 20000
                    // transactions in the queue, it waits until we get below
                    // 10000. This is where we wait if we get too high
                    // Once we have spilling of transactions working, this
                    // logic will need to be redone
                    if (_deque.size() > 20000) {
                        _queueCond.wait(lock);
                    }
                    if (bigTxn) {
                        // if we have a large transaction, we don't want
                        // to let it pile up. We want to process it immedietely
                        // before processing anything else.
                        while (_deque.size() > 0) {
                            _queueDone.wait(lock);
                        }
                    }
                }
            }
        } // end while

        if (shouldChangeSyncTarget()) {
            return 0;
        }

        r.tailCheck();
        if( !r.haveCursor() ) {
            LOG(1) << "replSet end opSync pass" << rsLog;
            return 0;
        }

        // looping back is ok because this is a tailable cursor
    }
    return 0;
}
/* tail an oplog. ok to return, will be re-called. */ void SyncTail::oplogApplication() { while( 1 ) { OpQueue ops; verify( !Lock::isLocked() ); Timer batchTimer; int lastTimeChecked = 0; // always fetch a few ops first // tryPopAndWaitForMore returns true when we need to end a batch early while (!tryPopAndWaitForMore(&ops) && (ops.getSize() < replBatchLimitBytes)) { if (theReplSet->isPrimary()) { massert(16620, "there are ops to sync, but I'm primary", ops.empty()); return; } int now = batchTimer.seconds(); // apply replication batch limits if (!ops.empty()) { if (now > replBatchLimitSeconds) break; if (ops.getDeque().size() > replBatchLimitOperations) break; } // occasionally check some things if (ops.empty() || now > lastTimeChecked) { lastTimeChecked = now; // can we become secondary? // we have to check this before calling mgr, as we must be a secondary to // become primary if (!theReplSet->isSecondary()) { OpTime minvalid; theReplSet->tryToGoLiveAsASecondary(minvalid); } // normally msgCheckNewState gets called periodically, but in a single node repl set // there are no heartbeat threads, so we do it here to be sure. this is relevant if the // singleton member has done a stepDown() and needs to come back up. if (theReplSet->config().members.size() == 1 && theReplSet->myConfig().potentiallyHot()) { Manager* mgr = theReplSet->mgr; // When would mgr be null? During replsettest'ing. if (mgr) mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr)); sleepsecs(1); // There should never be ops to sync in a 1-member set, anyway return; } } const int slaveDelaySecs = theReplSet->myConfig().slaveDelay; if (!ops.empty() && slaveDelaySecs > 0) { const BSONObj& lastOp = ops.getDeque().back(); const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs(); // Stop the batch as the lastOp is too new to be applied. 
If we continue // on, we can get ops that are way ahead of the delay and this will // make this thread sleep longer when handleSlaveDelay is called // and apply ops much sooner than we like. if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) { break; } } } // For pausing replication in tests while (MONGO_FAIL_POINT(rsSyncApplyStop)) { sleepmillis(0); } const BSONObj& lastOp = ops.getDeque().back(); setOplogVersion(lastOp); handleSlaveDelay(lastOp); // Set minValid to the last op to be applied in this next batch. // This will cause this node to go into RECOVERING state // if we should crash and restart before updating the oplog theReplSet->setMinValid(lastOp); multiApply(ops.getDeque(), multiSyncApply); applyOpsToOplog(&ops.getDeque()); } }