/* tail an oplog. ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    OpQueueBatcher batcher(this);

    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);
    ApplyBatchFinalizer finalizer(replCoord);

    OpTime originalEndOpTime(getMinValid(&txn).end);
    while (!inShutdown()) {
        OpQueue ops;

        do {
            if (BackgroundSync::get()->getInitialSyncRequestedFlag()) {
                // got a resync command
                return;
            }

            tryToGoLiveAsASecondary(&txn, replCoord);

            // Blocks up to a second waiting for a batch to be ready to apply. If one doesn't
            // become ready in time, we'll loop again so we can do the above checks periodically.
            ops = batcher.getNextBatch(Seconds(1));
        } while (!inShutdown() && ops.empty());

        if (inShutdown())
            return;

        invariant(!ops.empty());

        const BSONObj lastOp = ops.back().raw;

        if (lastOp.isEmpty()) {
            // This means that the network thread has coalesced and we have processed all of its
            // data.
            invariant(ops.getDeque().size() == 1);
            if (replCoord->isWaitingForApplierToDrain()) {
                replCoord->signalDrainComplete(&txn);
            }
            continue;  // This wasn't a real op. Don't try to apply it.
        }

        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before we finish applying the batch and updating
        // minValid.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOpTime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);

        finalizer.record(finalOpTime);
    }
}
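// Illustrative sketch (not MongoDB source): a simplified, self-contained model of how the
// minValid "end" boundary is chosen above. The real code uses the OpTime class and
// setMinValid(); here a plain integer timestamp stands in for OpTime, purely to trace the
// worked example from the comment (batch 20-40 fails, restart, batch 20-25 keeps end = 40,
// batch 25-45 advances end to 45).
#include <algorithm>
#include <iostream>

int main() {
    const int originalEnd = 40;    // end of the batch that failed before the restart
    const int batchEnds[] = {25, 45};  // ends of the smaller batches applied after recovery

    for (int batchEnd : batchEnds) {
        // Keep pushing the consistency point forward: a smaller recovery batch must never
        // pull the minValid end back behind the end recorded before the crash.
        const int end = std::max(originalEnd, batchEnd);
        std::cout << "batch end " << batchEnd << " -> minValid end " << end << "\n";
    }
    return 0;
}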
/* tail an oplog. ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    while( 1 ) {
        OpQueue ops;

        verify( !Lock::isLocked() );

        Timer batchTimer;
        int lastTimeChecked = 0;

        // always fetch a few ops first
        // tryPopAndWaitForMore returns true when we need to end a batch early
        while (!tryPopAndWaitForMore(&ops) && (ops.getSize() < replBatchLimitBytes)) {

            if (theReplSet->isPrimary()) {
                massert(16620, "there are ops to sync, but I'm primary", ops.empty());
                return;
            }

            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            if (ops.empty() || now > lastTimeChecked) {
                lastTimeChecked = now;

                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                if (!theReplSet->isSecondary()) {
                    OpTime minvalid;
                    theReplSet->tryToGoLiveAsASecondary(minvalid);
                }

                // normally msgCheckNewState gets called periodically, but in a single node
                // repl set there are no heartbeat threads, so we do it here to be sure.
                // this is relevant if the singleton member has done a stepDown() and needs
                // to come back up.
                if (theReplSet->config().members.size() == 1 &&
                    theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null?  During replsettest'ing.
                    if (mgr)
                        mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                    sleepsecs(1);
                    // There should never be ops to sync in a 1-member set, anyway
                    return;
                }
            }

            const int slaveDelaySecs = theReplSet->myConfig().slaveDelay;
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj& lastOp = ops.getDeque().back();
                const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }
        }

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
        }

        const BSONObj& lastOp = ops.getDeque().back();
        setOplogVersion(lastOp);
        handleSlaveDelay(lastOp);

        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        theReplSet->setMinValid(lastOp);

        multiApply(ops.getDeque(), multiSyncApply);

        applyOpsToOplog(&ops.getDeque());
    }
}
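// Illustrative sketch (not MongoDB source): a simplified, standalone version of the slaveDelay
// cutoff used while batching above. The real code reads the timestamp out of the oplog entry's
// "ts" field; here a plain time_t stands in for it, and shouldEndBatchForSlaveDelay() is a
// hypothetical helper name. The idea it demonstrates: the batch is cut as soon as the newest
// queued op is more recent than (now - slaveDelaySecs), so the apply thread never has to sleep
// on ops that are still inside the delay window.
#include <ctime>
#include <iostream>

// Hypothetical helper: should we stop adding ops to the current batch?
bool shouldEndBatchForSlaveDelay(std::time_t lastOpTimestampSecs, int slaveDelaySecs) {
    if (slaveDelaySecs <= 0)
        return false;  // no delay configured; keep batching
    return lastOpTimestampSecs > std::time(nullptr) - slaveDelaySecs;
}

int main() {
    const int slaveDelaySecs = 3600;  // one hour of intentional replication lag
    const std::time_t now = std::time(nullptr);

    // An op from two hours ago is already outside the delay window: keep batching (prints 0).
    std::cout << shouldEndBatchForSlaveDelay(now - 7200, slaveDelaySecs) << "\n";
    // An op from ten minutes ago is still inside the delay window: cut the batch (prints 1).
    std::cout << shouldEndBatchForSlaveDelay(now - 600, slaveDelaySecs) << "\n";
    return 0;
}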