void run() {
    Client::initThread("ReplBatcher");
    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);

    while (!_inShutdown.load()) {
        Timer batchTimer;

        OpQueue ops;
        // tryPopAndWaitForMore returns true when we need to end a batch early
        while (!_syncTail->tryPopAndWaitForMore(&txn, &ops) &&
               (ops.getSize() < replBatchLimitBytes) &&
               !_inShutdown.load()) {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj lastOp = ops.back().raw;
                const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }

            if (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                break;
            }

            // keep fetching more ops as long as we haven't filled up a full batch yet
        }

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop) && !_inShutdown.load()) {
            sleepmillis(0);
        }

        stdx::unique_lock<stdx::mutex> lk(_mutex);
        while (!_ops.empty()) {
            // Block until the previous batch has been taken.
            if (_inShutdown.load())
                return;
            _cv.wait(lk);
        }
        _ops = std::move(ops);
        _cv.notify_all();
    }
}
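The batcher thread above hands each finished batch to the applier through a single-slot buffer guarded by a mutex and condition variable: the producer blocks until the previous batch has been taken before publishing the next one, and the consumer blocks until a batch is available. Below is a minimal, self-contained sketch of that handoff pattern in plain C++11; the `Batch` alias and `SingleSlotBatchBuffer` class are hypothetical stand-ins, not MongoDB's actual OpQueue/SyncTail types.

#include <atomic>
#include <condition_variable>
#include <mutex>
#include <vector>

// Hypothetical stand-in for an oplog batch. Assumes published batches are non-empty.
using Batch = std::vector<int>;

class SingleSlotBatchBuffer {
public:
    // Producer: block until the slot is empty (previous batch consumed), then publish.
    void put(Batch batch, const std::atomic<bool>& inShutdown) {
        std::unique_lock<std::mutex> lk(_mutex);
        while (!_slot.empty()) {
            if (inShutdown.load())
                return;
            _cv.wait(lk);
        }
        _slot = std::move(batch);
        _cv.notify_all();  // wake the consumer
    }

    // Consumer: block until a batch is available, then take it and free the slot.
    Batch take() {
        std::unique_lock<std::mutex> lk(_mutex);
        while (_slot.empty()) {
            _cv.wait(lk);
        }
        Batch out = std::move(_slot);
        _slot.clear();
        _cv.notify_all();  // wake the producer
        return out;
    }

private:
    std::mutex _mutex;
    std::condition_variable _cv;
    Batch _slot;
};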
/* applies oplog from "now" until endOpTime using the applier threads for initial sync*/
void SyncTail::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) {
    unsigned long long bytesApplied = 0;
    unsigned long long entriesApplied = 0;
    while (true) {
        OpQueue ops;
        OperationContextImpl ctx;

        while (!tryPopAndWaitForMore(&ops, getGlobalReplicationCoordinator())) {
            // nothing came back last time, so go again
            if (ops.empty())
                continue;

            // Check if we reached the end
            const BSONObj currentOp = ops.back();
            const OpTime currentOpTime = currentOp["ts"]._opTime();

            // When we reach the end return this batch
            if (currentOpTime == endOpTime) {
                break;
            } else if (currentOpTime > endOpTime) {
                severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime
                         << " without seeing it. Rollback?" << rsLog;
                fassertFailedNoTrace(18693);
            }

            // apply replication batch limits
            if (ops.getSize() > replBatchLimitBytes)
                break;
            if (ops.getDeque().size() > replBatchLimitOperations)
                break;
        };

        if (ops.empty()) {
            severe() << "got no ops for batch...";
            fassertFailedNoTrace(18692);
        }

        const BSONObj lastOp = ops.back().getOwned();

        // Tally operation information
        bytesApplied += ops.getSize();
        entriesApplied += ops.getDeque().size();

        multiApply(ops.getDeque());

        OpTime lastOpTime = applyOpsToOplog(&ops.getDeque());

        // if the last op applied was our end, return
        if (lastOpTime == endOpTime) {
            LOG(1) << "SyncTail applied " << entriesApplied << " entries (" << bytesApplied
                   << " bytes)"
                   << " and finished at opTime " << endOpTime.toStringPretty();
            return;
        }
    }  // end of while (true)
}
BSONObj SyncTail::oplogApplySegment(const BSONObj& applyGTEObj, const BSONObj& minValidObj,
                                    MultiSyncApplyFunc func) {
    OpTime applyGTE = applyGTEObj["ts"]._opTime();
    OpTime minValid = minValidObj["ts"]._opTime();

    // We have to keep track of the last op applied to the data, because there's no other easy
    // way of getting this data synchronously.  Batches may go past minValidObj, so we need to
    // know to bump minValid past minValidObj.
    BSONObj lastOp = applyGTEObj;
    OpTime ts = applyGTE;

    time_t start = time(0);
    time_t now = start;

    unsigned long long n = 0, lastN = 0;

    while (ts < minValid) {
        OpQueue ops;

        while (ops.getSize() < replBatchLimitBytes) {
            if (tryPopAndWaitForMore(&ops)) {
                break;
            }

            // apply replication batch limits
            now = time(0);
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }
        }
        setOplogVersion(ops.getDeque().front());

        multiApply(ops.getDeque(), func);

        n += ops.getDeque().size();

        if (n > lastN + 1000) {
            if (now - start > 10) {
                // simple progress metering
                log() << "replSet initialSyncOplogApplication applied " << n
                      << " operations, synced to " << ts.toStringPretty() << rsLog;
                start = now;
                lastN = n;
            }
        }

        // we want to keep a record of the last op applied, to compare with minvalid
        lastOp = ops.getDeque().back();
        OpTime tempTs = lastOp["ts"]._opTime();
        applyOpsToOplog(&ops.getDeque());

        ts = tempTs;
    }

    return lastOp;
}
//BOOST_AUTO_TEST_CASE(opqueue_test)
int test_main(int, char *[])
{
    OpQueue q;
    Op::Ptr p(new Op(1, std::bind(&foo, eng::Library::Ptr())));

    BOOST_CHECK(q.empty());

    tid_t old_id = p->id();
    Op* old_ptr = p.get();

    q.add(p);
    BOOST_CHECK(!q.empty());
    BOOST_CHECK(p == nullptr);

    Op::Ptr p2(q.pop());
    BOOST_CHECK(p2.get() == old_ptr);
    BOOST_CHECK(p2->id() == old_id);
    BOOST_CHECK(q.empty());

    return 0;
}
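The checks `p == nullptr` after `q.add(p)` and `p2.get() == old_ptr` after `q.pop()` assert that this queue takes ownership of the pointer on add and hands back the same object on pop. A rough sketch of a queue with those move-on-add semantics is shown below; the `OwningOpQueue` name and the use of std::unique_ptr are assumptions for illustration, not the actual `Op::Ptr` type used in the test.

#include <cassert>
#include <deque>
#include <memory>
#include <utility>

struct Op { int id; };

// Hypothetical queue that takes ownership on add() and releases it on pop().
class OwningOpQueue {
public:
    void add(std::unique_ptr<Op>& p) { _ops.push_back(std::move(p)); }  // p becomes null
    std::unique_ptr<Op> pop() {
        std::unique_ptr<Op> front = std::move(_ops.front());
        _ops.pop_front();
        return front;
    }
    bool empty() const { return _ops.empty(); }

private:
    std::deque<std::unique_ptr<Op>> _ops;
};

int main() {
    OwningOpQueue q;
    std::unique_ptr<Op> p(new Op{1});
    Op* oldPtr = p.get();

    q.add(p);
    assert(p == nullptr);        // ownership moved into the queue

    std::unique_ptr<Op> p2 = q.pop();
    assert(p2.get() == oldPtr);  // the same object comes back out
    assert(q.empty());
    return 0;
}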
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    ApplyBatchFinalizer finalizer(replCoord);

    OperationContextImpl txn;
    OpTime originalEndOpTime(getMinValid(&txn).end);

    while (!inShutdown()) {
        OpQueue ops;
        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);
            }

            const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj lastOp = ops.back();
                const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }

            if (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                break;
            }

            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) &&  // tryPopAndWaitForMore returns
                                                                  // true when we need to end a
                                                                  // batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
            if (inShutdown())
                return;
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj lastOp = ops.back();
        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    while( 1 ) {
        OpQueue ops;

        verify( !Lock::isLocked() );

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            if (theReplSet->isPrimary()) {
                massert(16620, "there are ops to sync, but I'm primary", ops.empty());
                return;
            }

            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                {
                    boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
                    if (theReplSet->initialSyncRequested) {
                        // got a resync command
                        return;
                    }
                }
                lastTimeChecked = now;

                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                if (!theReplSet->isSecondary()) {
                    OpTime minvalid;

                    OperationContextImpl txn;
                    theReplSet->tryToGoLiveAsASecondary(&txn, minvalid);
                }

                // normally msgCheckNewState gets called periodically, but in a single node
                // replset there are no heartbeat threads, so we do it here to be sure.  this is
                // relevant if the singleton member has done a stepDown() and needs to come back
                // up.
                if (theReplSet->config().members.size() == 1 &&
                    theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null?  During replsettest'ing, in which case we should
                    // fall through and actually apply ops as if we were a real secondary.
                    if (mgr) {
                        mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                        sleepsecs(1);
                        // There should never be ops to sync in a 1-member set, anyway
                        return;
                    }
                }
            }

            const int slaveDelaySecs = theReplSet->myConfig().slaveDelay;
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj& lastOp = ops.getDeque().back();
                const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }
            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&ops) &&  // tryPopAndWaitForMore returns true
                                                 // when we need to end a batch early
                 (ops.getSize() < replBatchLimitBytes));

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
        }

        const BSONObj& lastOp = ops.getDeque().back();
        setOplogVersion(lastOp);
        handleSlaveDelay(lastOp);

        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        theReplSet->setMinValid(lastOp);

        if (BackgroundSync::get()->isAssumingPrimary()) {
            LOG(1) << "about to apply batch up to optime: "
                   << ops.getDeque().back()["ts"]._opTime().toStringPretty();
        }

        multiApply(ops.getDeque(), multiSyncApply);

        if (BackgroundSync::get()->isAssumingPrimary()) {
            LOG(1) << "about to update oplog to optime: "
                   << ops.getDeque().back()["ts"]._opTime().toStringPretty();
        }

        applyOpsToOplog(&ops.getDeque());

        // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
        if (!theReplSet->mgr) {
            BSONObj op;
            if (!peek(&op)) {
                return;
            }
        }
    }
}
/* applies oplog from "now" until endOpTime using the applier threads for initial sync*/
void InitialSync::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) {
    unsigned long long bytesApplied = 0;
    unsigned long long entriesApplied = 0;
    while (true) {
        OpQueue ops;
        auto replCoord = repl::ReplicationCoordinator::get(txn);

        while (!tryPopAndWaitForMore(txn, &ops)) {
            // nothing came back last time, so go again
            if (ops.empty())
                continue;

            // Check if we reached the end
            const BSONObj currentOp = ops.back().raw;
            const OpTime currentOpTime =
                fassertStatusOK(28772, OpTime::parseFromOplogEntry(currentOp));

            // When we reach the end return this batch
            if (currentOpTime == endOpTime) {
                break;
            } else if (currentOpTime > endOpTime) {
                severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime
                         << " without seeing it. Rollback?";
                fassertFailedNoTrace(18693);
            }

            // apply replication batch limits
            if (ops.getSize() > replBatchLimitBytes)
                break;
            if (ops.getDeque().size() > replBatchLimitOperations)
                break;
        };

        if (ops.empty()) {
            severe() << "got no ops for batch...";
            fassertFailedNoTrace(18692);
        }

        const BSONObj lastOp = ops.back().raw.getOwned();

        // Tally operation information
        bytesApplied += ops.getSize();
        entriesApplied += ops.getDeque().size();

        const OpTime lastOpTime = multiApply(txn, ops);

        replCoord->setMyLastAppliedOpTime(lastOpTime);
        setNewTimestamp(lastOpTime.getTimestamp());

        if (inShutdown()) {
            return;
        }

        // if the last op applied was our end, return
        if (lastOpTime == endOpTime) {
            LOG(1) << "SyncTail applied " << entriesApplied << " entries (" << bytesApplied
                   << " bytes) and finished at opTime " << endOpTime;
            return;
        }
    }  // end of while (true)
}
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    while (!inShutdown()) {
        OpQueue ops;
        OperationContextImpl txn;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    Lock::DBLock lk(txn.lockState(), "local", MODE_X);
                    WriteUnitOfWork wunit(&txn);
                    Client::Context ctx(&txn, "local");

                    ctx.db()->dropCollection(&txn, "local.oplog.rs");

                    // Note: the following order is important.
                    // The bgsync thread uses an empty optime as a sentinel to know to wait
                    // for initial sync (done in this thread after we return); thus, we must
                    // ensure the lastAppliedOptime is empty before pausing the bgsync thread
                    // via stop().
                    // We must clear the sync source blacklist after calling stop()
                    // because the bgsync thread, while running, may update the blacklist.
                    replCoord->setMyLastOptime(&txn, OpTime());
                    bgsync->stop();
                    replCoord->clearSyncSourceBlacklist();

                    wunit.commit();

                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);

                // TODO(emilkie): This can be removed once we switch over from legacy;
                // this code is what moves 1-node sets to PRIMARY state.
                // normally msgCheckNewState gets called periodically, but in a single node
                // replset there are no heartbeat threads, so we do it here to be sure.  this is
                // relevant if the singleton member has done a stepDown() and needs to come back
                // up.
                if (theReplSet &&
                    theReplSet->config().members.size() == 1 &&
                    theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null?  During replsettest'ing, in which case we should
                    // fall through and actually apply ops as if we were a real secondary.
                    if (mgr) {
                        mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                        sleepsecs(1);
                        // There should never be ops to sync in a 1-member set, anyway
                        return;
                    }
                }
            }

            const int slaveDelaySecs = replCoord->getSlaveDelaySecs().total_seconds();
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj& lastOp = ops.getDeque().back();
                const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }
            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&ops, replCoord) &&  // tryPopAndWaitForMore returns true
                                                            // when we need to end a batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj& lastOp = ops.getDeque().back();
        handleSlaveDelay(lastOp);

        if (replCoord->getCurrentMemberState().primary() &&
            !replCoord->isWaitingForApplierToDrain()) {
            severe() << "attempting to replicate ops while primary";
            fassertFailed(28527);
        }

        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        OpTime minValid = lastOp["ts"]._opTime();
        setMinValid(&txn, minValid);

        multiApply(ops.getDeque());

        applyOpsToOplog(&ops.getDeque());

        // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
        // TODO(spencer): Remove repltest.cpp dbtest or make this work with the new replication
        // coordinator
        if (theReplSet && !theReplSet->mgr) {
            BSONObj op;
            if (!peek(&op)) {
                return;
            }
        }
    }
}
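The slaveDelay check that recurs in these batching loops cuts a batch off as soon as the newest fetched op's timestamp is more recent than `now - slaveDelaySecs`, so the applier does not pull in work it would only sleep on in handleSlaveDelay. A standalone sketch of that gate is below; the function name and plain integer timestamp are assumptions for illustration and do not reflect the real oplog entry or OpTime types.

#include <ctime>

// Returns true if the batch should be cut off before including an op whose
// "ts" seconds value is newer than the configured slave delay allows.
bool shouldStopBatchForSlaveDelay(unsigned int opTimestampSecs, int slaveDelaySecs) {
    if (slaveDelaySecs <= 0) {
        return false;  // no delay configured; never cut the batch for this reason
    }
    const std::time_t now = std::time(nullptr);
    // The op is too new to be applied yet: stop the batch here so the apply
    // thread does not sleep through the delay with even newer ops queued up.
    return opTimestampSecs > static_cast<unsigned int>(now - slaveDelaySecs);
}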
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    OpQueueBatcher batcher(this);

    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);
    ApplyBatchFinalizer finalizer(replCoord);

    auto minValidBoundaries = getMinValid(&txn);
    OpTime originalEndOpTime(minValidBoundaries.end);
    OpTime lastWriteOpTime{replCoord->getMyLastOptime()};
    while (!inShutdown()) {
        OpQueue ops;

        do {
            if (BackgroundSync::get()->getInitialSyncRequestedFlag()) {
                // got a resync command
                return;
            }

            tryToGoLiveAsASecondary(&txn, replCoord, minValidBoundaries, lastWriteOpTime);

            // Blocks up to a second waiting for a batch to be ready to apply. If one doesn't
            // become ready in time, we'll loop again so we can do the above checks periodically.
            ops = batcher.getNextBatch(Seconds(1));
        } while (!inShutdown() && ops.empty());

        if (inShutdown())
            return;

        invariant(!ops.empty());

        const BSONObj lastOp = ops.back().raw;

        if (lastOp.isEmpty()) {
            // This means that the network thread has coalesced and we have processed all of its
            // data.
            invariant(ops.getDeque().size() == 1);
            if (replCoord->isWaitingForApplierToDrain()) {
                replCoord->signalDrainComplete(&txn);
            }
            continue;  // This wasn't a real op. Don't try to apply it.
        }

        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        lastWriteOpTime = multiApply(&txn, ops);
        setNewTimestamp(lastWriteOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        minValidBoundaries.start = {};
        minValidBoundaries.end = end;
        finalizer.record(lastWriteOpTime);
    }
}
/* tail an oplog.  ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    while( 1 ) {
        OpQueue ops;

        verify( !Lock::isLocked() );

        Timer batchTimer;
        int lastTimeChecked = 0;

        // always fetch a few ops first
        // tryPopAndWaitForMore returns true when we need to end a batch early
        while (!tryPopAndWaitForMore(&ops) &&
               (ops.getSize() < replBatchLimitBytes)) {

            if (theReplSet->isPrimary()) {
                massert(16620, "there are ops to sync, but I'm primary", ops.empty());
                return;
            }

            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            if (ops.empty() || now > lastTimeChecked) {
                lastTimeChecked = now;

                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                if (!theReplSet->isSecondary()) {
                    OpTime minvalid;
                    theReplSet->tryToGoLiveAsASecondary(minvalid);
                }

                // normally msgCheckNewState gets called periodically, but in a single node
                // repl set there are no heartbeat threads, so we do it here to be sure.  this
                // is relevant if the singleton member has done a stepDown() and needs to come
                // back up.
                if (theReplSet->config().members.size() == 1 &&
                    theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null?  During replsettest'ing.
                    if (mgr)
                        mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                    sleepsecs(1);
                    // There should never be ops to sync in a 1-member set, anyway
                    return;
                }
            }

            const int slaveDelaySecs = theReplSet->myConfig().slaveDelay;
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj& lastOp = ops.getDeque().back();
                const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }
        }

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
        }

        const BSONObj& lastOp = ops.getDeque().back();
        setOplogVersion(lastOp);
        handleSlaveDelay(lastOp);

        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        theReplSet->setMinValid(lastOp);

        multiApply(ops.getDeque(), multiSyncApply);

        applyOpsToOplog(&ops.getDeque());
    }
}