void BackgroundSync::_produce(OperationContext* txn) { while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) { sleepmillis(0); } // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (!_replCoord->isCatchingUp() && (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary())) { return; } if (_inShutdown_inlock()) { return; } } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; HostAndPort source; SyncSourceResolverResponse syncSourceResp; SyncSourceResolver* syncSourceResolver; OpTime minValid; if (_replCoord->getMemberState().recovering()) { auto minValidSaved = StorageInterface::get(txn)->getMinValid(txn); if (minValidSaved > lastOpTimeFetched) { minValid = minValidSaved; } } { stdx::unique_lock<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); _syncSourceResolver = stdx::make_unique<SyncSourceResolver>( _replicationCoordinatorExternalState->getTaskExecutor(), _replCoord, lastOpTimeFetched, minValid, [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; }); syncSourceResolver = _syncSourceResolver.get(); } // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to // acquired before BackgroundSync's. // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because // we do not destroy this instance outside of this function. auto status = _syncSourceResolver->startup(); if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) { return; } fassertStatusOK(40349, status); syncSourceResolver->join(); syncSourceResolver = nullptr; { stdx::unique_lock<stdx::mutex> lock(_mutex); _syncSourceResolver.reset(); } if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) { // All (accessible) sync sources were too stale. if (_replCoord->isCatchingUp()) { warning() << "Too stale to catch up."; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen << " from " << syncSourceResp.getSyncSource(); sleepsecs(1); return; } error() << "too stale to catch up -- entering maintenance mode"; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; auto status = _replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode: " << status; } bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << _replCoord->getMemberState(); } return; } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceHost = syncSourceResp.getSyncSource(); source = _syncSourceHost; } else { if (!syncSourceResp.isOK()) { log() << "failed to find sync source, received error " << syncSourceResp.syncSourceStatus.getStatus(); } // No sync source found. sleepsecs(1); return; } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_stopped) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; if (!_replCoord->isCatchingUp()) { _replCoord->signalUpstreamUpdater(); } } // Set the applied point if unset. This is most likely the first time we've established a sync // source since stepping down or otherwise clearing the applied point. We need to set this here, // before the OplogWriter gets a chance to append to the oplog. if (StorageInterface::get(txn)->getAppliedThrough(txn).isNull()) { StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime()); } // "lastFetched" not used. Already set in _enqueueDocuments. Status fetcherReturnStatus = Status::OK(); DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState( _replCoord, _replicationCoordinatorExternalState, this); OplogFetcher* oplogFetcher; try { auto executor = _replicationCoordinatorExternalState->getTaskExecutor(); auto config = _replCoord->getConfig(); auto onOplogFetcherShutdownCallbackFn = [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) { fetcherReturnStatus = status; }; stdx::lock_guard<stdx::mutex> lock(_mutex); _oplogFetcher = stdx::make_unique<OplogFetcher>( executor, OpTimeWithHash(lastHashFetched, lastOpTimeFetched), source, NamespaceString(rsOplogName), config, _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(), &dataReplicatorExternalState, stdx::bind(&BackgroundSync::_enqueueDocuments, this, stdx::placeholders::_1, stdx::placeholders::_2, stdx::placeholders::_3), onOplogFetcherShutdownCallbackFn); oplogFetcher = _oplogFetcher.get(); } catch (const mongo::DBException& ex) { fassertFailedWithStatus(34440, exceptionToStatus()); } LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at " << oplogFetcher->getCommandObject_forTest()["filter"]; auto scheduleStatus = oplogFetcher->startup(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } oplogFetcher->join(); LOG(1) << "fetcher stopped reading remote oplog on " << source; // If the background sync is stopped after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (isStopped()) { return; } if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) { // This is bad because it means that our source // has not returned oplog entries in ascending ts order, and they need to be. warning() << redact(fetcherReturnStatus); // Do not blacklist the server here, it will be blacklisted when we try to reuse it, // if it can't return a matching oplog start from the last fetch oplog ts field. return; } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing || fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) { if (_replCoord->isCatchingUp()) { warning() << "Rollback situation detected in catch-up mode; catch-up mode will end."; sleepsecs(1); return; } // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. const int messagingPortTags = 0; ConnectionPool connectionPool(messagingPortTags); std::unique_ptr<ConnectionPool::ConnectionPtr> connection; auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* { if (!connection.get()) { connection.reset(new ConnectionPool::ConnectionPtr( &connectionPool, source, Date_t::now(), kRollbackOplogSocketTimeout)); }; return connection->get(); }; { stdx::lock_guard<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; } log() << "Starting rollback due to " << redact(fetcherReturnStatus); // Wait till all buffered oplog entries have drained and been applied. auto lastApplied = _replCoord->getMyLastAppliedOpTime(); if (lastApplied != lastOpTimeFetched) { log() << "Waiting for all operations from " << lastApplied << " until " << lastOpTimeFetched << " to be applied before starting rollback."; while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) { sleepmillis(10); if (isStopped() || inShutdown()) { return; } } } // check that we are at minvalid, otherwise we cannot roll back as we may be in an // inconsistent state const auto minValid = StorageInterface::get(txn)->getMinValid(txn); if (lastApplied < minValid) { fassertNoTrace(18750, Status(ErrorCodes::UnrecoverableRollbackError, str::stream() << "need to rollback, but in inconsistent state. " << "minvalid: " << minValid.toString() << " > our last optime: " << lastApplied.toString())); } _rollback(txn, source, getConnection); stop(); } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) { Seconds blacklistDuration(60); warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source " << source << " for " << blacklistDuration << "."; _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration); } else if (!fetcherReturnStatus.isOK()) { warning() << "Fetcher stopped querying remote oplog with error: " << redact(fetcherReturnStatus); } }
void BackgroundSync::_produce(OperationContext* opCtx) { if (MONGO_FAIL_POINT(stopReplProducer)) { // This log output is used in js tests so please leave it. log() << "bgsync - stopReplProducer fail point " "enabled. Blocking until fail point is disabled."; // TODO(SERVER-27120): Remove the return statement and uncomment the while loop. // Currently we cannot block here or we prevent primaries from being fully elected since // we'll never call _signalNoNewDataForApplier. // while (MONGO_FAIL_POINT(stopReplProducer) && !inShutdown()) { // mongo::sleepsecs(1); // } mongo::sleepsecs(1); return; } // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (_state != ProducerState::Running) { return; } } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; HostAndPort source; HostAndPort oldSource = _syncSourceHost; SyncSourceResolverResponse syncSourceResp; { const OpTime minValidSaved = _replicationProcess->getConsistencyMarkers()->getMinValid(opCtx); stdx::lock_guard<stdx::mutex> lock(_mutex); if (_state != ProducerState::Running) { return; } const auto requiredOpTime = (minValidSaved > _lastOpTimeFetched) ? minValidSaved : OpTime(); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); _syncSourceResolver = stdx::make_unique<SyncSourceResolver>( _replicationCoordinatorExternalState->getTaskExecutor(), _replCoord, lastOpTimeFetched, requiredOpTime, [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; }); } // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to // acquired before BackgroundSync's. // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because // we do not destroy this instance outside of this function which is only called from a single // thread. auto status = _syncSourceResolver->startup(); if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) { return; } fassertStatusOK(40349, status); _syncSourceResolver->join(); { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceResolver.reset(); } if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) { // All (accessible) sync sources were too stale. if (_replCoord->getMemberState().primary()) { warning() << "Too stale to catch up."; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen << " from " << syncSourceResp.getSyncSource(); _replCoord->abortCatchupIfNeeded().transitional_ignore(); return; } // We only need to mark ourselves as too stale once. if (_tooStale) { return; } // Mark yourself as too stale. _tooStale = true; error() << "too stale to catch up -- entering maintenance mode"; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; // Activate maintenance mode and transition to RECOVERING. auto status = _replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode: " << status; } status = _replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!status.isOK()) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << _replCoord->getMemberState() << causedBy(status); } return; } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) { { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceHost = syncSourceResp.getSyncSource(); source = _syncSourceHost; } // If our sync source has not changed, it is likely caused by our heartbeat data map being // out of date. In that case we sleep for 1 second to reduce the amount we spin waiting // for our map to update. if (oldSource == source) { log() << "Chose same sync source candidate as last time, " << source << ". Sleeping for 1 second to avoid immediately choosing a new sync source for " "the same reason as last time."; sleepsecs(1); } } else { if (!syncSourceResp.isOK()) { log() << "failed to find sync source, received error " << syncSourceResp.syncSourceStatus.getStatus(); } // No sync source found. sleepsecs(1); return; } // If we find a good sync source after having gone too stale, disable maintenance mode so we can // transition to SECONDARY. if (_tooStale) { _tooStale = false; log() << "No longer too stale. Able to sync from " << _syncSourceHost; auto status = _replCoord->setMaintenanceMode(false); if (!status.isOK()) { warning() << "Failed to leave maintenance mode: " << status; } } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_state != ProducerState::Running) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; } if (!_replCoord->getMemberState().primary()) { _replCoord->signalUpstreamUpdater(); } // Set the applied point if unset. This is most likely the first time we've established a sync // source since stepping down or otherwise clearing the applied point. We need to set this here, // before the OplogWriter gets a chance to append to the oplog. if (_replicationProcess->getConsistencyMarkers()->getAppliedThrough(opCtx).isNull()) { _replicationProcess->getConsistencyMarkers()->setAppliedThrough( opCtx, _replCoord->getMyLastAppliedOpTime()); } // "lastFetched" not used. Already set in _enqueueDocuments. Status fetcherReturnStatus = Status::OK(); DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState( _replCoord, _replicationCoordinatorExternalState, this); OplogFetcher* oplogFetcher; try { auto onOplogFetcherShutdownCallbackFn = [&fetcherReturnStatus](const Status& status) { fetcherReturnStatus = status; }; // The construction of OplogFetcher has to be outside bgsync mutex, because it calls // replication coordinator. auto oplogFetcherPtr = stdx::make_unique<OplogFetcher>( _replicationCoordinatorExternalState->getTaskExecutor(), OpTimeWithHash(lastHashFetched, lastOpTimeFetched), source, NamespaceString::kRsOplogNamespace, _replCoord->getConfig(), _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(), syncSourceResp.rbid, true /* requireFresherSyncSource */, &dataReplicatorExternalState, stdx::bind(&BackgroundSync::_enqueueDocuments, this, stdx::placeholders::_1, stdx::placeholders::_2, stdx::placeholders::_3), onOplogFetcherShutdownCallbackFn, bgSyncOplogFetcherBatchSize); stdx::lock_guard<stdx::mutex> lock(_mutex); if (_state != ProducerState::Running) { return; } _oplogFetcher = std::move(oplogFetcherPtr); oplogFetcher = _oplogFetcher.get(); } catch (const mongo::DBException& ex) { fassertFailedWithStatus(34440, exceptionToStatus()); } const auto logLevel = Command::testCommandsEnabled ? 0 : 1; LOG(logLevel) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at " << oplogFetcher->getFindQuery_forTest()["filter"]; auto scheduleStatus = oplogFetcher->startup(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } oplogFetcher->join(); LOG(1) << "fetcher stopped reading remote oplog on " << source; // If the background sync is stopped after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (getState() != ProducerState::Running) { log() << "Replication producer stopped after oplog fetcher finished returning a batch from " "our sync source. Abandoning this batch of oplog entries and re-evaluating our " "sync source."; return; } if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) { // This is bad because it means that our source // has not returned oplog entries in ascending ts order, and they need to be. warning() << redact(fetcherReturnStatus); // Do not blacklist the server here, it will be blacklisted when we try to reuse it, // if it can't return a matching oplog start from the last fetch oplog ts field. return; } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing) { auto storageInterface = StorageInterface::get(opCtx); _runRollback(opCtx, fetcherReturnStatus, source, syncSourceResp.rbid, storageInterface); } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) { Seconds blacklistDuration(60); warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source " << source << " for " << blacklistDuration << "."; _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration); } else if (!fetcherReturnStatus.isOK()) { warning() << "Fetcher stopped querying remote oplog with error: " << redact(fetcherReturnStatus); } }