BSONObj OplogFetcher::_makeFindCommandObject(const NamespaceString& nss,
                                             OpTime lastOpTimeFetched,
                                             Milliseconds findMaxTime) const {
    auto lastCommittedWithCurrentTerm =
        _dataReplicatorExternalState->getCurrentTermAndLastCommittedOpTime();
    auto term = lastCommittedWithCurrentTerm.value;
    BSONObjBuilder cmdBob;
    cmdBob.append("find", nss.coll());
    cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())));
    cmdBob.append("tailable", true);
    cmdBob.append("oplogReplay", true);
    cmdBob.append("awaitData", true);
    cmdBob.append("maxTimeMS", durationCount<Milliseconds>(findMaxTime));
    cmdBob.append("batchSize", _batchSize);
    if (term != OpTime::kUninitializedTerm) {
        cmdBob.append("term", term);
    }

    // This ensures that the sync source never returns an empty batch of documents for the first set
    // of results.
    cmdBob.append("readConcern", BSON("afterClusterTime" << lastOpTimeFetched.getTimestamp()));
    return cmdBob.obj();
}
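// Illustrative sketch, not MongoDB source: prints the approximate shape of the find command
// assembled by _makeFindCommandObject above. Only the field names and nesting mirror the
// cmdBob.append() calls; the collection name "oplog.rs" and the concrete values (timestamp,
// term, maxTimeMS, batchSize) are hypothetical examples.
#include <iostream>

int main() {
    std::cout << "{\n"
                 "  find: \"oplog.rs\",\n"
                 "  filter: { ts: { $gte: Timestamp(1571234567, 5) } },\n"
                 "  tailable: true,\n"
                 "  oplogReplay: true,\n"
                 "  awaitData: true,\n"
                 "  maxTimeMS: 2000,\n"
                 "  batchSize: 13981010,\n"
                 "  term: 3,\n"
                 "  readConcern: { afterClusterTime: Timestamp(1571234567, 5) }\n"
                 "}\n";
    return 0;
}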
void ReplicationRecoveryImpl::_recoverFromUnstableCheckpoint(OperationContext* opCtx,
                                                             OpTime appliedThrough,
                                                             OpTime topOfOplog) {
    invariant(!topOfOplog.isNull());
    log() << "Recovering from an unstable checkpoint (top of oplog: " << topOfOplog
          << ", appliedThrough: " << appliedThrough << ")";

    if (appliedThrough.isNull()) {
        // The appliedThrough would be null if we shut down cleanly or crashed as a primary. Either
        // way we are consistent at the top of the oplog.
        log() << "No oplog entries to apply for recovery. appliedThrough is null.";
    } else {
        // If the appliedThrough is not null, then we shut down uncleanly during secondary oplog
        // application and must apply from the appliedThrough to the top of the oplog.
        log() << "Starting recovery oplog application at the appliedThrough: " << appliedThrough
              << ", through the top of the oplog: " << topOfOplog;

        // When `recoverFromOplog` truncates the oplog, that also happens to set the "oldest
        // timestamp" to the truncation point[1]. `_applyToEndOfOplog` will then perform writes
        // before the truncation point. Doing so violates the constraint that all updates must be
        // timestamped newer than the "oldest timestamp". This call will move the "oldest
        // timestamp" back to the `startPoint`.
        //
        // [1] This is arguably incorrect. On rollback for nodes that are not keeping history to
        // the "majority point", the "oldest timestamp" likely needs to go back in time. The
        // oplog's `cappedTruncateAfter` method was a convenient location for this logic, which,
        // unfortunately, conflicts with the usage above.
        opCtx->getServiceContext()->getStorageEngine()->setOldestTimestamp(
            appliedThrough.getTimestamp());

        _applyToEndOfOplog(opCtx, appliedThrough.getTimestamp(), topOfOplog.getTimestamp());
    }

    // `_recoverFromUnstableCheckpoint` is only expected to be called on startup.
    _storageInterface->setInitialDataTimestamp(opCtx->getServiceContext(),
                                               topOfOplog.getTimestamp());

    // Ensure the `appliedThrough` is set to the top of oplog, specifically if the node was
    // previously running as a primary. If a crash happens before the first stable checkpoint on
    // upgrade, replication recovery will know it must apply from this point and not assume the
    // datafiles contain any writes that were taken before the crash.
    _consistencyMarkers->setAppliedThrough(opCtx, topOfOplog);

    // Force the set `appliedThrough` to become durable on disk in a checkpoint. This method would
    // typically take a stable checkpoint, but because we're starting up from a checkpoint that
    // has no checkpoint timestamp, the stable checkpoint "degrades" into an unstable checkpoint.
    //
    // Not waiting for checkpoint durability here can result in a scenario where the node takes
    // writes and persists them to the oplog, but crashes before a stable checkpoint persists a
    // "recovery timestamp". The typical startup path for data-bearing nodes with 4.0 is to use
    // the recovery timestamp to determine where to play oplog forward from. As this method shows,
    // when a recovery timestamp does not exist, the applied through is used to determine where to
    // start playing oplog entries from.
    opCtx->recoveryUnit()->waitUntilUnjournaledWritesDurable();
}
void ReplicationCoordinatorExternalStateImpl::updateCommittedSnapshot(
    const OpTime& newCommitPoint) {
    auto manager = _service->getGlobalStorageEngine()->getSnapshotManager();
    if (manager) {
        manager->setCommittedSnapshot(newCommitPoint.getTimestamp());
    }
    notifyOplogMetadataWaiters(newCommitPoint);
}
void ReplicationRecoveryImpl::_recoverFromStableTimestamp(OperationContext* opCtx,
                                                          Timestamp stableTimestamp,
                                                          OpTime appliedThrough,
                                                          OpTime topOfOplog) {
    invariant(!stableTimestamp.isNull());
    invariant(!topOfOplog.isNull());
    const auto truncateAfterPoint = _consistencyMarkers->getOplogTruncateAfterPoint(opCtx);
    log() << "Recovering from stable timestamp: " << stableTimestamp
          << " (top of oplog: " << topOfOplog << ", appliedThrough: " << appliedThrough
          << ", TruncateAfter: " << truncateAfterPoint << ")";

    log() << "Starting recovery oplog application at the stable timestamp: " << stableTimestamp;
    _applyToEndOfOplog(opCtx, stableTimestamp, topOfOplog.getTimestamp());
}
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);  // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // no server found
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);  // if there is no one to sync from
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout);

    // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared
    // if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    syncSourceReader.resetConnection();  // no more references to oplog reader from here on.

    // If this status is not OK after the fetcher returns from wait(),
    // proceed to execute rollback
    Status remoteOplogStartStatus = Status::OK();

    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      &remoteOplogStartStatus);

    auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName) << "filter"
                              << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))
                              << "tailable" << true << "oplogReplay" << true << "awaitData" << true
                              << "maxTimeMS" << int(fetcherMaxTimeMS.count()));
    Fetcher fetcher(taskExecutor,
                    source,
                    nsToDatabase(rsOplogName),
                    cmdObj,
                    fetcherCallback,
                    rpc::makeEmptyMetadata());
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    // Execute rollback if necessary.
    // Rollback is a synchronous operation that uses the task executor and may not be
    // executed inside the fetcher callback.
    if (!remoteOplogStartStatus.isOK()) {
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection =
            [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), oplogSocketTimeout));
            };
            return connection->get();
        };

        log() << "starting rollback: " << remoteOplogStartStatus;
        _rollback(txn, source, getConnection);
        stop();
    }
}
void SyncSourceResolver::_firstOplogEntryFetcherCallback(
    const StatusWith<Fetcher::QueryResponse>& queryResult,
    HostAndPort candidate,
    OpTime earliestOpTimeSeen) {
    if (_isShuttingDown()) {
        _finishCallback(Status(ErrorCodes::CallbackCanceled,
                               str::stream()
                                   << "sync source resolver shut down while probing candidate: "
                                   << candidate));
        return;
    }

    if (ErrorCodes::CallbackCanceled == queryResult.getStatus()) {
        _finishCallback(queryResult.getStatus());
        return;
    }

    if (!queryResult.isOK()) {
        // We got an error.
        const auto until = _taskExecutor->now() + kFetcherErrorBlacklistDuration;
        log() << "Blacklisting " << candidate << " due to error: '" << queryResult.getStatus()
              << "' for " << kFetcherErrorBlacklistDuration << " until: " << until;
        _syncSourceSelector->blacklistSyncSource(candidate, until);

        _chooseAndProbeNextSyncSource(earliestOpTimeSeen);
        return;
    }

    const auto& queryResponse = queryResult.getValue();
    const auto remoteEarliestOpTime = _parseRemoteEarliestOpTime(candidate, queryResponse);
    if (remoteEarliestOpTime.isNull()) {
        _chooseAndProbeNextSyncSource(earliestOpTimeSeen);
        return;
    }

    // remoteEarliestOpTime may come from a very old config, so we cannot compare their terms.
    if (_lastOpTimeFetched.getTimestamp() < remoteEarliestOpTime.getTimestamp()) {
        // We're too stale to use this sync source.
        const auto blacklistDuration = kTooStaleBlacklistDuration;
        const auto until = _taskExecutor->now() + Minutes(1);

        log() << "We are too stale to use " << candidate << " as a sync source. "
              << "Blacklisting this sync source"
              << " because our last fetched timestamp: " << _lastOpTimeFetched.getTimestamp()
              << " is before their earliest timestamp: " << remoteEarliestOpTime.getTimestamp()
              << " for " << blacklistDuration << " until: " << until;

        _syncSourceSelector->blacklistSyncSource(candidate, until);

        // If all the viable sync sources are too far ahead of us (i.e. we are "too stale" relative
        // each sync source), we will want to return the starting timestamp of the sync source
        // candidate that is closest to us. See SyncSourceResolverResponse::earliestOpTimeSeen.
        // We use "earliestOpTimeSeen" to keep track of the current minimum starting timestamp.
        if (earliestOpTimeSeen.isNull() ||
            earliestOpTimeSeen.getTimestamp() > remoteEarliestOpTime.getTimestamp()) {
            earliestOpTimeSeen = remoteEarliestOpTime;
        }

        _chooseAndProbeNextSyncSource(earliestOpTimeSeen);
        return;
    }

    // Schedules fetcher to look for '_requiredOpTime' in the remote oplog.
    if (!_requiredOpTime.isNull()) {
        auto status = _scheduleFetcher(_makeRequiredOpTimeFetcher(candidate, earliestOpTimeSeen));
        if (!status.isOK()) {
            _finishCallback(status);
        }
        return;
    }

    _finishCallback(candidate);
}
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      const OpTime& lastOpTimeFetched,
                                      ReplicationCoordinator* replCoord) {
    const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0);
    const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched.getTimestamp());

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            error() << "too stale to catch up";
            log() << "our last optime : " << lastOpTimeFetched;
            log() << "oldest available is " << oldestOpTimeSeen;
            log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            setMinValid(txn, oldestOpTimeSeen);
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
            continue;
        }
        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
        OpTime remoteOldOpTime = fassertStatusOK(28776, OpTime::parseFromBSON(remoteOldestOp));

        // remoteOldOpTime may come from a very old config, so we cannot compare their terms.
        if (!lastOpTimeFetched.isNull() &&
            lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1));
            if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Got a valid sync source.
        return;
    }  // while (true)
}
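// Illustrative sketch, not MongoDB source: the sentinel bookkeeping used above reduced to plain
// long longs with made-up names. Start from an unreachable maximum, lower it to the smallest
// "earliest optime" reported by candidates that were ahead of us, and a value still equal to the
// sentinel afterwards means no candidate got far enough to report one.
#include <cassert>
#include <limits>
#include <vector>

int main() {
    const long long sentinel = std::numeric_limits<long long>::max();
    long long oldestOpTimeSeen = sentinel;

    // Earliest optimes reported by candidates that rejected us as too stale.
    const std::vector<long long> tooStaleCandidates = {1200, 900, 1500};
    for (long long remoteOldest : tooStaleCandidates) {
        if (oldestOpTimeSeen > remoteOldest) {
            oldestOpTimeSeen = remoteOldest;
        }
    }

    assert(oldestOpTimeSeen == 900);       // the closest candidate's oldest available op
    assert(oldestOpTimeSeen != sentinel);  // we reached the too-stale check at least once
    return 0;
}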
void ReplicationCoordinatorExternalStateImpl::updateCommittedSnapshot(OpTime newCommitPoint) {
    auto manager = getGlobalServiceContext()->getGlobalStorageEngine()->getSnapshotManager();
    invariant(manager);  // This should never be called if there is no SnapshotManager.
    manager->setCommittedSnapshot(SnapshotName(newCommitPoint.getTimestamp()));
}
/* tail an oplog. ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    ApplyBatchFinalizer finalizer(replCoord);

    OperationContextImpl txn;
    OpTime originalEndOpTime(getMinValid(&txn).end);

    while (!inShutdown()) {
        OpQueue ops;

        Timer batchTimer;
        int lastTimeChecked = 0;

        do {
            int now = batchTimer.seconds();

            // apply replication batch limits
            if (!ops.empty()) {
                if (now > replBatchLimitSeconds)
                    break;
                if (ops.getDeque().size() > replBatchLimitOperations)
                    break;
            }

            // occasionally check some things
            // (always checked in the first iteration of this do-while loop, because
            // ops is empty)
            if (ops.empty() || now > lastTimeChecked) {
                BackgroundSync* bgsync = BackgroundSync::get();
                if (bgsync->getInitialSyncRequestedFlag()) {
                    // got a resync command
                    return;
                }
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                tryToGoLiveAsASecondary(&txn, replCoord);
            }

            const int slaveDelaySecs = durationCount<Seconds>(replCoord->getSlaveDelaySecs());
            if (!ops.empty() && slaveDelaySecs > 0) {
                const BSONObj lastOp = ops.back();
                const unsigned int opTimestampSecs = lastOp["ts"].timestamp().getSecs();

                // Stop the batch as the lastOp is too new to be applied. If we continue
                // on, we can get ops that are way ahead of the delay and this will
                // make this thread sleep longer when handleSlaveDelay is called
                // and apply ops much sooner than we like.
                if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                    break;
                }
            }

            if (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                break;
            }

            // keep fetching more ops as long as we haven't filled up a full batch yet
        } while (!tryPopAndWaitForMore(&txn, &ops, replCoord) &&  // tryPopAndWaitForMore returns
                                                                  // true when we need to end a
                                                                  // batch early
                 (ops.getSize() < replBatchLimitBytes) &&
                 !inShutdown());

        // For pausing replication in tests
        while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
            sleepmillis(0);
            if (inShutdown())
                return;
        }

        if (ops.empty()) {
            continue;
        }

        const BSONObj lastOp = ops.back();
        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
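// Illustrative sketch, not MongoDB source: the worked example in the comments above (batches
// 20-40, then 20-25 and 25-45 after a restart), reduced to plain ints with made-up names. The
// point is that the recorded minValid "end" only ever moves forward, so re-applying smaller
// batches after a restart cannot shrink the point at which the node is again consistent.
#include <algorithm>
#include <cassert>

int main() {
    const int originalEnd = 40;  // end recovered from the previously failed 20-40 batch

    // batch apply 20-25 after restart: the consistency target stays at 40
    int end = std::max(originalEnd, 25);
    assert(end == 40);

    // batch apply 25-45: the target advances to 45
    end = std::max(originalEnd, 45);
    assert(end == 45);

    return 0;
}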
void BackgroundSync::_produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);  // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    SyncSourceResolverResponse syncSourceResp =
        _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched);

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen});
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode.";
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _replCoord->signalUpstreamUpdater();
    }

    const auto isV1ElectionProtocol = _replCoord->isV1ElectionProtocol();
    // Under protocol version 1, make the awaitData timeout (maxTimeMS) dependent on the election
    // timeout. This enables the sync source to communicate liveness of the primary to secondaries.
    // Under protocol version 0, use a default timeout of 2 seconds for awaitData.
    const Milliseconds fetcherMaxTimeMS(
        isV1ElectionProtocol ? _replCoord->getConfig().getElectionTimeoutPeriod() / 2 : Seconds(2));

    Status fetcherReturnStatus = Status::OK();
    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      fetcherMaxTimeMS,
                                      &fetcherReturnStatus);

    BSONObjBuilder cmdBob;
    cmdBob.append("find", nsToCollectionSubstring(rsOplogName));
    cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())));
    cmdBob.append("tailable", true);
    cmdBob.append("oplogReplay", true);
    cmdBob.append("awaitData", true);
    cmdBob.append("maxTimeMS", durationCount<Milliseconds>(Minutes(1)));  // 1 min initial find.

    BSONObjBuilder metadataBob;
    if (isV1ElectionProtocol) {
        cmdBob.append("term", _replCoord->getTerm());
        metadataBob.append(rpc::kReplSetMetadataFieldName, 1);
    }

    auto dbName = nsToDatabase(rsOplogName);
    auto cmdObj = cmdBob.obj();
    auto metadataObj = metadataBob.obj();
    Fetcher fetcher(&_threadPoolTaskExecutor,
                    source,
                    dbName,
                    cmdObj,
                    fetcherCallback,
                    metadataObj,
                    _replCoord->getConfig().getElectionTimeoutPeriod());

    LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at "
           << cmdObj["filter"];
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.
        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), oplogSocketTimeout));
            };
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << fetcherReturnStatus;

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        BatchBoundaries boundaries = getMinValid(txn);
        if (!boundaries.start.isNull() || boundaries.end > lastApplied) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream()
                                      << "need to rollback, but in inconsistent state. "
                                      << "minvalid: " << boundaries.end.toString()
                                      << " > our last optime: " << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
/* applies oplog from "now" until endOpTime using the applier threads for initial sync*/
void InitialSync::_applyOplogUntil(OperationContext* txn, const OpTime& endOpTime) {
    unsigned long long bytesApplied = 0;
    unsigned long long entriesApplied = 0;
    while (true) {
        OpQueue ops;

        auto replCoord = repl::ReplicationCoordinator::get(txn);
        while (!tryPopAndWaitForMore(txn, &ops)) {
            // nothing came back last time, so go again
            if (ops.empty())
                continue;

            // Check if we reached the end
            const BSONObj currentOp = ops.back().raw;
            const OpTime currentOpTime =
                fassertStatusOK(28772, OpTime::parseFromOplogEntry(currentOp));

            // When we reach the end return this batch
            if (currentOpTime == endOpTime) {
                break;
            } else if (currentOpTime > endOpTime) {
                severe() << "Applied past expected end " << endOpTime << " to " << currentOpTime
                         << " without seeing it. Rollback?";
                fassertFailedNoTrace(18693);
            }

            // apply replication batch limits
            if (ops.getSize() > replBatchLimitBytes)
                break;
            if (ops.getDeque().size() > replBatchLimitOperations)
                break;
        };

        if (ops.empty()) {
            severe() << "got no ops for batch...";
            fassertFailedNoTrace(18692);
        }

        const BSONObj lastOp = ops.back().raw.getOwned();

        // Tally operation information
        bytesApplied += ops.getSize();
        entriesApplied += ops.getDeque().size();

        const OpTime lastOpTime = multiApply(txn, ops);

        replCoord->setMyLastAppliedOpTime(lastOpTime);
        setNewTimestamp(lastOpTime.getTimestamp());

        if (inShutdown()) {
            return;
        }

        // if the last op applied was our end, return
        if (lastOpTime == endOpTime) {
            LOG(1) << "SyncTail applied " << entriesApplied << " entries (" << bytesApplied
                   << " bytes) and finished at opTime " << endOpTime;
            return;
        }
    }  // end of while (true)
}
void BackgroundSync::produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);  // if there is no one to sync from
            return;
        }

        // Wait until we've applied the ops we have before we choose a sync target
        while (!_appliedBuffer && !inShutdownStrict()) {
            _appliedBufferCondition.wait(lock);
        }
        if (inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    _syncSourceReader.resetConnection();
    _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        // no server found
        if (_syncSourceReader.getHost().empty()) {
            lock.unlock();
            sleepsecs(1);  // if there is no one to sync from
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = _syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    _syncSourceReader.tailingQueryGTE(rsOplogName.c_str(), lastOpTimeFetched.getTimestamp());

    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!_syncSourceReader.haveCursor()) {
        return;
    }

    if (_rollbackIfNeeded(txn, _syncSourceReader)) {
        stop();
        return;
    }

    while (!inShutdown()) {
        if (!_syncSourceReader.moreInCurrentBatch()) {
            // Check some things periodically
            // (whenever we run out of items in the
            // current cursor batch)

            int bs = _syncSourceReader.currentBatchMessageSize();
            if (bs > 0 && bs < BatchIsSmallish) {
                // on a very low latency network, if we don't wait a little, we'll be
                // getting ops to write almost one at a time. this will both be expensive
                // for the upstream server as well as potentially defeating our parallel
                // application of batches on the secondary.
                //
                // the inference here is basically if the batch is really small, we are
                // "caught up".
                //
                sleepmillis(SleepToAllowBatchingMillis);
            }

            // If we are transitioning to primary state, we need to leave
            // this loop in order to go into bgsync-pause mode.
            if (_replCoord->isWaitingForApplierToDrain() ||
                _replCoord->getMemberState().primary()) {
                return;
            }

            // re-evaluate quality of sync target
            if (shouldChangeSyncSource()) {
                return;
            }

            {
                // record time for each getmore
                TimerHolder batchTimer(&getmoreReplStats);

                // This calls receiveMore() on the oplogreader cursor.
                // It can wait up to five seconds for more data.
                _syncSourceReader.more();
            }
            networkByteStats.increment(_syncSourceReader.currentBatchMessageSize());

            if (!_syncSourceReader.moreInCurrentBatch()) {
                // If there is still no data from upstream, check a few more things
                // and then loop back for another pass at getting more data
                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    if (_pause) {
                        return;
                    }
                }

                _syncSourceReader.tailCheck();
                if (!_syncSourceReader.haveCursor()) {
                    LOG(1) << "replSet end syncTail pass";
                    return;
                }

                continue;
            }
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the oplogreader cursor.
        BSONObj o = _syncSourceReader.nextSafe().getOwned();
        opsReadStats.increment();

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = extractOpTime(o);
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }
}
void BackgroundSync::_produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);  // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // no server found
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);  // if there is no one to sync from
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout);

    const auto isV1ElectionProtocol = _replCoord->isV1ElectionProtocol();
    // Under protocol version 1, make the awaitData timeout (maxTimeMS) dependent on the election
    // timeout. This enables the sync source to communicate liveness of the primary to secondaries.
    // Under protocol version 0, use a default timeout of 2 seconds for awaitData.
    const Milliseconds fetcherMaxTimeMS(
        isV1ElectionProtocol ? _replCoord->getConfig().getElectionTimeoutPeriod() / 2 : Seconds(2));

    // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared
    // if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    syncSourceReader.resetConnection();  // no more references to oplog reader from here on.

    Status fetcherReturnStatus = Status::OK();
    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      fetcherMaxTimeMS,
                                      &fetcherReturnStatus);

    BSONObjBuilder cmdBob;
    cmdBob.append("find", nsToCollectionSubstring(rsOplogName));
    cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())));
    cmdBob.append("tailable", true);
    cmdBob.append("oplogReplay", true);
    cmdBob.append("awaitData", true);
    cmdBob.append("maxTimeMS", durationCount<Milliseconds>(fetcherMaxTimeMS));

    BSONObjBuilder metadataBob;
    if (isV1ElectionProtocol) {
        cmdBob.append("term", _replCoord->getTerm());
        metadataBob.append(rpc::kReplSetMetadataFieldName, 1);
    }

    auto dbName = nsToDatabase(rsOplogName);
    auto cmdObj = cmdBob.obj();
    auto metadataObj = metadataBob.obj();
    Fetcher fetcher(&_threadPoolTaskExecutor,
                    source,
                    dbName,
                    cmdObj,
                    fetcherCallback,
                    metadataObj,
                    _replCoord->getConfig().getElectionTimeoutPeriod());

    LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at "
           << cmdObj["filter"];
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.
        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection =
            [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), oplogSocketTimeout));
            };
            return connection->get();
        };

        log() << "starting rollback: " << fetcherReturnStatus;
        _rollback(txn, source, getConnection);
        stop();
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 const Timestamp& oplogApplicationStartPoint,
                                                 const Timestamp& topOfOplog) {
    invariant(!oplogApplicationStartPoint.isNull());
    invariant(!topOfOplog.isNull());

    // Check if we have any unapplied ops in our oplog. It is important that this is done after
    // deleting the ragged end of the oplog.
    if (oplogApplicationStartPoint == topOfOplog) {
        log() << "No oplog entries to apply for recovery. Start point is at the top of the oplog.";
        return;  // We've applied all the valid oplog we have.
    } else if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
                 << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
        fassertFailedNoTrace(40313);
    }

    log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON()
          << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive).";

    OplogBufferLocalOplog oplogBuffer(oplogApplicationStartPoint);
    oplogBuffer.startup(opCtx);

    RecoveryOplogApplierStats stats;

    auto writerPool = OplogApplier::makeWriterPool();
    OplogApplier::Options options;
    options.allowNamespaceNotFoundErrorsOnCrudOps = true;
    options.skipWritesToOplog = true;
    // During replication recovery, the stableTimestampForRecovery refers to the stable timestamp
    // from which we replay the oplog.
    // For startup recovery, this will be the recovery timestamp, which is the stable timestamp
    // that the storage engine recovered to on startup. For rollback recovery, this will be the
    // last stable timestamp, returned when we call recoverToStableTimestamp.
    // We keep track of this for prepared transactions so that when we apply a commitTransaction
    // oplog entry, we can check if it occurs before or after the stable timestamp and decide
    // whether the operations would have already been reflected in the data.
    options.stableTimestampForRecovery = oplogApplicationStartPoint;
    OplogApplierImpl oplogApplier(nullptr,
                                  &oplogBuffer,
                                  &stats,
                                  nullptr,
                                  _consistencyMarkers,
                                  _storageInterface,
                                  options,
                                  writerPool.get());

    OplogApplier::BatchLimits batchLimits;
    batchLimits.bytes = OplogApplier::calculateBatchLimitBytes(opCtx, _storageInterface);
    batchLimits.ops = OplogApplier::getBatchLimitOperations();

    OpTime applyThroughOpTime;
    OplogApplier::Operations batch;
    while (
        !(batch = fassert(50763, oplogApplier.getNextApplierBatch(opCtx, batchLimits))).empty()) {
        applyThroughOpTime = uassertStatusOK(oplogApplier.multiApply(opCtx, std::move(batch)));
    }
    stats.complete(applyThroughOpTime);
    invariant(oplogBuffer.isEmpty(),
              str::stream() << "Oplog buffer not empty after applying operations. Last operation "
                               "applied with optime: "
                            << applyThroughOpTime.toBSON());
    invariant(applyThroughOpTime.getTimestamp() == topOfOplog,
              str::stream() << "Did not apply to top of oplog. Applied through: "
                            << applyThroughOpTime.toString()
                            << ". Top of oplog: " << topOfOplog.toString());
    oplogBuffer.shutdown(opCtx);

    // We may crash before setting appliedThrough. If we have a stable checkpoint, we will recover
    // to that checkpoint at a replication consistent point, and applying the oplog is safe.
    // If we don't have a stable checkpoint, then we must be in startup recovery, and not rollback
    // recovery, because we only roll back to a stable timestamp when we have a stable checkpoint.
    // Startup recovery from an unstable checkpoint only ever applies a single batch and it is safe
    // to replay the batch from any point.
    _consistencyMarkers->setAppliedThrough(opCtx, applyThroughOpTime);
}
/* tail an oplog. ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    OpQueueBatcher batcher(this);

    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);
    ApplyBatchFinalizer finalizer(replCoord);

    OpTime originalEndOpTime(getMinValid(&txn).end);
    while (!inShutdown()) {
        OpQueue ops;

        do {
            if (BackgroundSync::get()->getInitialSyncRequestedFlag()) {
                // got a resync command
                return;
            }

            tryToGoLiveAsASecondary(&txn, replCoord);

            // Blocks up to a second waiting for a batch to be ready to apply. If one doesn't
            // become ready in time, we'll loop again so we can do the above checks periodically.
            ops = batcher.getNextBatch(Seconds(1));
        } while (!inShutdown() && ops.empty());

        if (inShutdown())
            return;

        invariant(!ops.empty());

        const BSONObj lastOp = ops.back().raw;

        if (lastOp.isEmpty()) {
            // This means that the network thread has coalesced and we have processed all of its
            // data.
            invariant(ops.getDeque().size() == 1);
            if (replCoord->isWaitingForApplierToDrain()) {
                replCoord->signalDrainComplete(&txn);
            }
            continue;  // This wasn't a real op. Don't try to apply it.
        }

        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating finishing.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        OpTime finalOpTime = multiApply(&txn, ops);
        setNewTimestamp(finalOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        finalizer.record(finalOpTime);
    }
}
void OplogReader::connectToSyncSource(OperationContext* txn,
                                      const OpTime& lastOpTimeFetched,
                                      const OpTime& requiredOpTime,
                                      ReplicationCoordinator* replCoord) {
    const Timestamp sentinelTimestamp(duration_cast<Seconds>(Date_t::now().toDurationSinceEpoch()),
                                      0);
    const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max());
    OpTime oldestOpTimeSeen = sentinel;

    invariant(conn() == NULL);

    while (true) {
        HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched);

        if (candidate.empty()) {
            if (oldestOpTimeSeen == sentinel) {
                // If, in this invocation of connectToSyncSource(), we did not successfully
                // connect to any node ahead of us,
                // we apparently have no sync sources to connect to.
                // This situation is common; e.g. if there are no writes to the primary at
                // the moment.
                return;
            }

            // Connected to at least one member, but in all cases we were too stale to use them
            // as a sync source.
            error() << "too stale to catch up -- entering maintenance mode";
            log() << "our last optime : " << lastOpTimeFetched;
            log() << "oldest available is " << oldestOpTimeSeen;
            log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
            auto status = replCoord->setMaintenanceMode(true);
            if (!status.isOK()) {
                warning() << "Failed to transition into maintenance mode: " << status;
            }
            bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING);
            if (!worked) {
                warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                          << ". Current state: " << replCoord->getMemberState();
            }
            return;
        }

        if (!connect(candidate)) {
            LOG(2) << "can't connect to " << candidate.toString() << " to read operations";
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10));
            continue;
        }

        // Read the first (oldest) op and confirm that it's not newer than our last
        // fetched op. Otherwise, we have fallen off the back of that source's oplog.
        BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query()));
        OpTime remoteOldOpTime =
            fassertStatusOK(28776, OpTime::parseFromOplogEntry(remoteOldestOp));

        // remoteOldOpTime may come from a very old config, so we cannot compare their terms.
        if (!lastOpTimeFetched.isNull() &&
            lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) {
            // We're too stale to use this sync source.
            resetConnection();
            replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1));
            if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) {
                warning() << "we are too stale to use " << candidate.toString()
                          << " as a sync source";
                oldestOpTimeSeen = remoteOldOpTime;
            }
            continue;
        }

        // Check if sync source contains required optime.
        if (!requiredOpTime.isNull()) {
            // This query is structured so that it is executed on the sync source using the oplog
            // start hack (oplogReplay=true and $gt/$gte predicate over "ts").
            auto ts = requiredOpTime.getTimestamp();
            tailingQuery(rsOplogName.c_str(), BSON("ts" << BSON("$gte" << ts << "$lte" << ts)));
            auto status = _compareRequiredOpTimeWithQueryResponse(requiredOpTime);
            if (!status.isOK()) {
                const auto blacklistDuration = Seconds(60);
                const auto until = Date_t::now() + blacklistDuration;
                warning() << "We cannot use " << candidate.toString()
                          << " as a sync source because it does not contain the necessary "
                             "operations for us to reach a consistent state: " << status
                          << " last fetched optime: " << lastOpTimeFetched
                          << ". required optime: " << requiredOpTime
                          << ". Blacklisting this sync source for " << blacklistDuration
                          << " until: " << until;
                resetConnection();
                replCoord->blacklistSyncSource(candidate, until);
                continue;
            }
            resetCursor();
        }

        // TODO: If we were too stale (recovering with maintenance mode on), then turn it off, to
        // allow becoming secondary/etc.

        // Got a valid sync source.
        return;
    }  // while (true)
}