void BackgroundSync::_rollback(OperationContext* txn, const HostAndPort& source, stdx::function<DBClientBase*()> getConnection) { // Abort only when syncRollback detects we are in a unrecoverable state. // In other cases, we log the message contained in the error status and retry later. auto status = syncRollback(txn, OplogInterfaceLocal(txn, rsOplogName), RollbackSourceImpl(getConnection, source, rsOplogName), _replCoord); if (status.isOK()) { // When the syncTail thread sees there is no new data by adding something to the buffer. _signalNoNewDataForApplier(txn); // Wait until the buffer is empty. // This is an indication that syncTail has removed the sentinal marker from the buffer // and reset its local lastAppliedOpTime via the replCoord. while (!_oplogBuffer->isEmpty()) { sleepmillis(10); if (inShutdown()) { return; } } // At this point we are about to leave rollback. Before we do, wait for any writes done // as part of rollback to be durable, and then do any necessary checks that we didn't // wind up rolling back something illegal. We must wait for the rollback to be durable // so that if we wind up shutting down uncleanly in response to something we rolled back // we know that we won't wind up right back in the same situation when we start back up // because the rollback wasn't durable. txn->recoveryUnit()->waitUntilDurable(); // If we detected that we rolled back the shardIdentity document as part of this rollback // then we must shut down to clear the in-memory ShardingState associated with the // shardIdentity document. if (ShardIdentityRollbackNotifier::get(txn)->didRollbackHappen()) { severe() << "shardIdentity document rollback detected. Shutting down to clear " "in-memory sharding state. Restarting this process should safely return it " "to a healthy state"; fassertFailedNoTrace(40276); } // It is now safe to clear the ROLLBACK state, which may result in the applier thread // transitioning to SECONDARY. 
This is safe because the applier thread has now reloaded // the new rollback minValid from the database. if (!_replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK) << " but found self in " << _replCoord->getMemberState(); } return; } if (ErrorCodes::UnrecoverableRollbackError == status.code()) { fassertNoTrace(28723, status); } warning() << "rollback cannot proceed at this time (retrying later): " << redact(status); }
void BackgroundSync::_rollback(OperationContext* txn, const HostAndPort& source, stdx::function<DBClientBase*()> getConnection) { // Abort only when syncRollback detects we are in a unrecoverable state. // In other cases, we log the message contained in the error status and retry later. auto status = syncRollback(txn, OplogInterfaceLocal(txn, rsOplogName), RollbackSourceImpl(getConnection, source, rsOplogName), _replCoord); if (status.isOK()) { // When the syncTail thread sees there is no new data by adding something to the buffer. _signalNoNewDataForApplier(); // Wait until the buffer is empty. // This is an indication that syncTail has removed the sentinal marker from the buffer // and reset its local lastAppliedOpTime via the replCoord. while (!_buffer.empty()) { sleepmillis(10); if (inShutdown()) { return; } } // It is now safe to clear the ROLLBACK state, which may result in the applier thread // transitioning to SECONDARY. This is safe because the applier thread has now reloaded // the new rollback minValid from the database. if (!_replCoord->setFollowerMode(MemberState::RS_RECOVERING)) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << "; expected to be in state " << MemberState(MemberState::RS_ROLLBACK) << " but found self in " << _replCoord->getMemberState(); } return; } if (ErrorCodes::UnrecoverableRollbackError == status.code()) { fassertNoTrace(28723, status); } warning() << "rollback cannot proceed at this time (retrying later): " << status; }
/**
 * Moves this node into the ROLLBACK member state.
 *
 * Returns ShutdownInProgress if rollback is already shutting down, the (context-annotated)
 * error from the replication coordinator if the transition is refused, and OK otherwise.
 */
Status RollbackImpl::_transitionToRollback(OperationContext* opCtx) {
    invariant(opCtx);

    if (_isInShutdown()) {
        return Status(ErrorCodes::ShutdownInProgress, "rollback shutting down");
    }

    log() << "transition to ROLLBACK";

    // The follower mode is changed while holding the global write lock.
    {
        Lock::GlobalWrite globalLock(opCtx);

        auto transitionStatus =
            _replicationCoordinator->setFollowerMode(MemberState::RS_ROLLBACK);
        if (transitionStatus.isOK()) {
            return Status::OK();
        }

        // Record which transition was refused before handing the error back to the caller.
        transitionStatus.addContext(str::stream()
                                    << "Cannot transition from "
                                    << _replicationCoordinator->getMemberState().toString()
                                    << " to "
                                    << MemberState(MemberState::RS_ROLLBACK).toString());
        log() << transitionStatus;
        return transitionStatus;
    }
}
void OplogReader::connectToSyncSource(OperationContext* txn, const OpTime& lastOpTimeFetched, ReplicationCoordinator* replCoord) { const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0); const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max()); OpTime oldestOpTimeSeen = sentinel; invariant(conn() == NULL); while (true) { HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched.getTimestamp()); if (candidate.empty()) { if (oldestOpTimeSeen == sentinel) { // If, in this invocation of connectToSyncSource(), we did not successfully // connect to any node ahead of us, // we apparently have no sync sources to connect to. // This situation is common; e.g. if there are no writes to the primary at // the moment. return; } // Connected to at least one member, but in all cases we were too stale to use them // as a sync source. error() << "too stale to catch up"; log() << "our last optime : " << lastOpTimeFetched; log() << "oldest available is " << oldestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; setMinValid(txn, oldestOpTimeSeen); bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << replCoord->getMemberState(); } return; } if (!connect(candidate)) { LOG(2) << "can't connect to " << candidate.toString() << " to read operations"; resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10)); continue; } // Read the first (oldest) op and confirm that it's not newer than our last // fetched op. Otherwise, we have fallen off the back of that source's oplog. 
BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query())); OpTime remoteOldOpTime = fassertStatusOK(28776, OpTime::parseFromBSON(remoteOldestOp)); // remoteOldOpTime may come from a very old config, so we cannot compare their terms. if (!lastOpTimeFetched.isNull() && lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) { // We're too stale to use this sync source. resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1)); if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) { warning() << "we are too stale to use " << candidate.toString() << " as a sync source"; oldestOpTimeSeen = remoteOldOpTime; } continue; } // Got a valid sync source. return; } // while (true) }
/**
 * One iteration of the oplog producer: picks a sync source, runs an OplogFetcher against it
 * until the fetcher stops, and then reacts to the fetcher's final status (including starting a
 * rollback when the remote oplog no longer contains our last fetched entry).
 *
 * Returns early, without fetching, while initial sync has not yet set the last fetched optime,
 * while this node is primary or draining (and not catching up), on shutdown, or when no usable
 * sync source is found.
 */
void BackgroundSync::_produce(OperationContext* txn) {
    while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) {
        sleepmillis(0);
    }

    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (!_replCoord->isCatchingUp() &&
            (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary())) {
            return;
        }

        if (_inShutdown_inlock()) {
            return;
        }
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    SyncSourceResolverResponse syncSourceResp;
    SyncSourceResolver* syncSourceResolver = nullptr;

    // Read minValid from storage before taking the mutex; it stays null unless we are
    // RECOVERING.
    OpTime minValidSaved;
    if (_replCoord->getMemberState().recovering()) {
        minValidSaved = StorageInterface::get(txn)->getMinValid(txn);
    }

    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;

        // Only pass minValid to the resolver when it is actually ahead of what we have already
        // fetched. This comparison must use the value read under the mutex: comparing against
        // the still default-constructed local would make the check meaningless.
        OpTime minValid;
        if (minValidSaved > lastOpTimeFetched) {
            minValid = minValidSaved;
        }

        _syncSourceHost = HostAndPort();
        _syncSourceResolver = stdx::make_unique<SyncSourceResolver>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            _replCoord,
            lastOpTimeFetched,
            minValid,
            [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; });
        syncSourceResolver = _syncSourceResolver.get();
    }
    // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls
    // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to
    // acquired before BackgroundSync's.
    // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because
    // we do not destroy this instance outside of this function. The pointer captured under the
    // mutex is used so that the member itself is not touched without the lock.
    auto status = syncSourceResolver->startup();
    if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) {
        return;
    }
    fassertStatusOK(40349, status);
    syncSourceResolver->join();
    syncSourceResolver = nullptr;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        _syncSourceResolver.reset();
    }

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        if (_replCoord->isCatchingUp()) {
            warning() << "Too stale to catch up.";
            log() << "Our newest OpTime : " << lastOpTimeFetched;
            log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
                  << " from " << syncSourceResp.getSyncSource();
            sleepsecs(1);
            return;
        }

        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        auto maintenanceStatus = _replCoord->setMaintenanceMode(true);
        if (!maintenanceStatus.isOK()) {
            warning() << "Failed to transition into maintenance mode: " << maintenanceStatus;
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched = 0;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        if (!_replCoord->isCatchingUp()) {
            _replCoord->signalUpstreamUpdater();
        }
    }

    // Set the applied point if unset. This is most likely the first time we've established a sync
    // source since stepping down or otherwise clearing the applied point. We need to set this here,
    // before the OplogWriter gets a chance to append to the oplog.
    if (StorageInterface::get(txn)->getAppliedThrough(txn).isNull()) {
        StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime());
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, _replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher = nullptr;
    try {
        auto executor = _replicationCoordinatorExternalState->getTaskExecutor();
        auto config = _replCoord->getConfig();
        auto onOplogFetcherShutdownCallbackFn =
            [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& /*lastFetched*/) {
                fetcherReturnStatus = status;
            };

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _oplogFetcher = stdx::make_unique<OplogFetcher>(
            executor,
            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
            source,
            NamespaceString(rsOplogName),
            config,
            _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(),
            &dataReplicatorExternalState,
            stdx::bind(&BackgroundSync::_enqueueDocuments,
                       this,
                       stdx::placeholders::_1,
                       stdx::placeholders::_2,
                       stdx::placeholders::_3),
            onOplogFetcherShutdownCallbackFn);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException&) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    // Log the local copy 'source' rather than reading the mutex-guarded _syncSourceHost without
    // holding the lock.
    LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at "
           << oplogFetcher->getCommandObject_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << redact(fetcherReturnStatus);
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        if (_replCoord->isCatchingUp()) {
            warning() << "Rollback situation detected in catch-up mode; catch-up mode will end.";
            sleepsecs(1);
            return;
        }

        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        // The connection to the sync source is only opened the first time rollback asks for it.
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), kRollbackOplogSocketTimeout));
            }
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << redact(fetcherReturnStatus);

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }

        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        const auto minValid = StorageInterface::get(txn)->getMinValid(txn);
        if (lastApplied < minValid) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream() << "need to rollback, but in inconsistent state. "
                                                << "minvalid: "
                                                << minValid.toString()
                                                << " > our last optime: "
                                                << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher stopped querying remote oplog with error: "
                  << redact(fetcherReturnStatus);
    }
}
void BackgroundSync::_produce(OperationContext* txn) { // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() || inShutdownStrict()) { return; } } while (MONGO_FAIL_POINT(rsBgSyncProduce)) { sleepmillis(0); } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; HostAndPort source; { stdx::unique_lock<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); } SyncSourceResolverResponse syncSourceResp = _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched); if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) { // All (accessible) sync sources were too stale. error() << "too stale to catch up -- entering maintenance mode"; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen}); auto status = _replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode."; } bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". 
Current state: " << _replCoord->getMemberState(); } return; } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceHost = syncSourceResp.getSyncSource(); source = _syncSourceHost; } else { if (!syncSourceResp.isOK()) { log() << "failed to find sync source, received error " << syncSourceResp.syncSourceStatus.getStatus(); } // No sync source found. sleepsecs(1); return; } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_stopped) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; _replCoord->signalUpstreamUpdater(); } const auto isV1ElectionProtocol = _replCoord->isV1ElectionProtocol(); // Under protocol version 1, make the awaitData timeout (maxTimeMS) dependent on the election // timeout. This enables the sync source to communicate liveness of the primary to secondaries. // Under protocol version 0, use a default timeout of 2 seconds for awaitData. const Milliseconds fetcherMaxTimeMS( isV1ElectionProtocol ? _replCoord->getConfig().getElectionTimeoutPeriod() / 2 : Seconds(2)); Status fetcherReturnStatus = Status::OK(); auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback, this, stdx::placeholders::_1, stdx::placeholders::_3, stdx::cref(source), lastOpTimeFetched, lastHashFetched, fetcherMaxTimeMS, &fetcherReturnStatus); BSONObjBuilder cmdBob; cmdBob.append("find", nsToCollectionSubstring(rsOplogName)); cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))); cmdBob.append("tailable", true); cmdBob.append("oplogReplay", true); cmdBob.append("awaitData", true); cmdBob.append("maxTimeMS", durationCount<Milliseconds>(Minutes(1))); // 1 min initial find. 
BSONObjBuilder metadataBob; if (isV1ElectionProtocol) { cmdBob.append("term", _replCoord->getTerm()); metadataBob.append(rpc::kReplSetMetadataFieldName, 1); } auto dbName = nsToDatabase(rsOplogName); auto cmdObj = cmdBob.obj(); auto metadataObj = metadataBob.obj(); Fetcher fetcher(&_threadPoolTaskExecutor, source, dbName, cmdObj, fetcherCallback, metadataObj, _replCoord->getConfig().getElectionTimeoutPeriod()); LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at " << cmdObj["filter"]; auto scheduleStatus = fetcher.schedule(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } fetcher.wait(); LOG(1) << "fetcher stopped reading remote oplog on " << source; // If the background sync is stopped after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (isStopped()) { return; } if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) { // This is bad because it means that our source // has not returned oplog entries in ascending ts order, and they need to be. warning() << fetcherReturnStatus.toString(); // Do not blacklist the server here, it will be blacklisted when we try to reuse it, // if it can't return a matching oplog start from the last fetch oplog ts field. return; } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing || fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) { // Rollback is a synchronous operation that uses the task executor and may not be // executed inside the fetcher callback. 
const int messagingPortTags = 0; ConnectionPool connectionPool(messagingPortTags); std::unique_ptr<ConnectionPool::ConnectionPtr> connection; auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* { if (!connection.get()) { connection.reset(new ConnectionPool::ConnectionPtr( &connectionPool, source, Date_t::now(), oplogSocketTimeout)); }; return connection->get(); }; { stdx::lock_guard<stdx::mutex> lock(_mutex); lastOpTimeFetched = _lastOpTimeFetched; } log() << "Starting rollback due to " << fetcherReturnStatus; // Wait till all buffered oplog entries have drained and been applied. auto lastApplied = _replCoord->getMyLastAppliedOpTime(); if (lastApplied != lastOpTimeFetched) { log() << "Waiting for all operations from " << lastApplied << " until " << lastOpTimeFetched << " to be applied before starting rollback."; while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) { sleepmillis(10); if (isStopped() || inShutdown()) { return; } } } // check that we are at minvalid, otherwise we cannot roll back as we may be in an // inconsistent state BatchBoundaries boundaries = getMinValid(txn); if (!boundaries.start.isNull() || boundaries.end > lastApplied) { fassertNoTrace(18750, Status(ErrorCodes::UnrecoverableRollbackError, str::stream() << "need to rollback, but in inconsistent state. " << "minvalid: " << boundaries.end.toString() << " > our last optime: " << lastApplied.toString())); } _rollback(txn, source, getConnection); stop(); } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) { Seconds blacklistDuration(60); warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source " << source << " for " << blacklistDuration << "."; _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration); } else if (!fetcherReturnStatus.isOK()) { warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString(); } }
/**
 * Populates this response from the BSON reply 'doc' of a replSetHeartbeat command.
 *
 * 'term' is only used to complete the OpTime when the reply carries the optime as a bare
 * Timestamp or Date; an Object-typed optime is parsed from its own nested fields.
 *
 * Returns Status::OK() on success. Returns an error Status as soon as the reply reports a
 * replica set name mismatch, carries an error (no set name and not "ok"), or contains a field of
 * an unexpected type or value; fields parsed before the failing one have already been stored.
 */
Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc, long long term) {
    // Old versions set this even though they returned not "ok"
    _mismatch = doc[kMismatchFieldName].trueValue();
    if (_mismatch)
        return Status(ErrorCodes::InconsistentReplicaSetNames, "replica set name doesn't match.");

    // Old versions sometimes set the replica set name ("set") but ok:0
    const BSONElement replSetNameElement = doc[kReplSetFieldName];
    if (replSetNameElement.eoo()) {
        _setName.clear();
    } else if (replSetNameElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kReplSetFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found "
                                    << typeName(replSetNameElement.type()));
    } else {
        _setName = replSetNameElement.String();
    }

    // No set name and not "ok": treat the reply as an error and surface its code and message.
    if (_setName.empty() && !doc[kOkFieldName].trueValue()) {
        std::string errMsg = doc[kErrMsgFieldName].str();

        BSONElement errCodeElem = doc[kErrorCodeFieldName];
        if (errCodeElem.ok()) {
            if (!errCodeElem.isNumber())
                return Status(ErrorCodes::BadValue, "Error code is not a number!");

            int errorCode = errCodeElem.numberInt();
            return Status(ErrorCodes::Error(errorCode), errMsg);
        }
        return Status(ErrorCodes::UnknownError, errMsg);
    }

    const BSONElement hasDataElement = doc[kHasDataFieldName];
    _hasDataSet = !hasDataElement.eoo();
    _hasData = hasDataElement.trueValue();

    const BSONElement electionTimeElement = doc[kElectionTimeFieldName];
    if (electionTimeElement.eoo()) {
        _electionTimeSet = false;
    } else if (electionTimeElement.type() == bsonTimestamp) {
        _electionTimeSet = true;
        _electionTime = electionTimeElement.timestamp();
    } else if (electionTimeElement.type() == Date) {
        _electionTimeSet = true;
        _electionTime = Timestamp(electionTimeElement.date());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kElectionTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(electionTimeElement.type()));
    }

    const BSONElement timeElement = doc[kTimeFieldName];
    if (timeElement.eoo()) {
        _timeSet = false;
    } else if (timeElement.isNumber()) {
        _timeSet = true;
        _time = Seconds(timeElement.numberLong());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have a numeric type, but found type "
                                    << typeName(timeElement.type()));
    }

    _isReplSet = doc[kIsReplSetFieldName].trueValue();

    // The term is optional: a missing field is tolerated, any other extraction error is not.
    Status termStatus = bsonExtractIntegerField(doc, kTermFieldName, &_term);
    if (!termStatus.isOK() && termStatus != ErrorCodes::NoSuchKey) {
        return termStatus;
    }

    // In order to support both the 3.0(V0) and 3.2(V1) heartbeats we must parse the OpTime
    // field based on its type. If it is a Date, we parse it as the timestamp and use
    // initialize's term argument to complete the OpTime type. If it is an Object, then it's
    // V1 and we construct an OpTime out of its nested fields.
    const BSONElement opTimeElement = doc[kOpTimeFieldName];
    if (opTimeElement.eoo()) {
        _opTimeSet = false;
    } else if (opTimeElement.type() == bsonTimestamp) {
        _opTimeSet = true;
        _opTime = OpTime(opTimeElement.timestamp(), term);
    } else if (opTimeElement.type() == Date) {
        _opTimeSet = true;
        _opTime = OpTime(Timestamp(opTimeElement.date()), term);
    } else if (opTimeElement.type() == Object) {
        // Propagate a malformed optime object instead of marking an unparsed optime as set.
        Status opTimeStatus = bsonExtractOpTimeField(doc, kOpTimeFieldName, &_opTime);
        if (!opTimeStatus.isOK()) {
            return opTimeStatus;
        }
        _opTimeSet = true;
        // since a v1 OpTime was in the response, the member must be part of a replset
        _isReplSet = true;
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kOpTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(opTimeElement.type()));
    }

    const BSONElement electableElement = doc[kIsElectableFieldName];
    if (electableElement.eoo()) {
        _electableSet = false;
    } else {
        _electableSet = true;
        _electable = electableElement.trueValue();
    }

    const BSONElement memberStateElement = doc[kMemberStateFieldName];
    if (memberStateElement.eoo()) {
        _stateSet = false;
    } else if (memberStateElement.type() != NumberInt &&
               memberStateElement.type() != NumberLong) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kMemberStateFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type NumberInt or NumberLong, but found type "
                                    << typeName(memberStateElement.type()));
    } else {
        long long stateInt = memberStateElement.numberLong();
        if (stateInt < 0 || stateInt > MemberState::RS_MAX) {
            return Status(ErrorCodes::BadValue,
                          str::stream() << "Value for \"" << kMemberStateFieldName
                                        << "\" in response to replSetHeartbeat is "
                                           "out of range; legal values are non-negative and no more than "
                                        << MemberState::RS_MAX);
        }
        _stateSet = true;
        _state = MemberState(static_cast<int>(stateInt));
    }

    _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue();

    // Not required for the case of uninitialized members -- they have no config
    const BSONElement configVersionElement = doc[kConfigVersionFieldName];

    // If we have an optime then we must have a configVersion
    if (_opTimeSet && configVersionElement.eoo()) {
        return Status(ErrorCodes::NoSuchKey,
                      str::stream() << "Response to replSetHeartbeat missing required \""
                                    << kConfigVersionFieldName
                                    << "\" field even though initialized");
    }

    // If there is a "v" (config version) then it must be an int.
    if (!configVersionElement.eoo() && configVersionElement.type() != NumberInt) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigVersionFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type NumberInt, but found "
                                    << typeName(configVersionElement.type()));
    }
    _configVersion = configVersionElement.numberInt();

    const BSONElement hbMsgElement = doc[kHbMessageFieldName];
    if (hbMsgElement.eoo()) {
        _hbmsg.clear();
    } else if (hbMsgElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kHbMessageFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found "
                                    << typeName(hbMsgElement.type()));
    } else {
        _hbmsg = hbMsgElement.String();
    }

    const BSONElement syncingToElement = doc[kSyncSourceFieldName];
    if (syncingToElement.eoo()) {
        _syncingTo = HostAndPort();
    } else if (syncingToElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kSyncSourceFieldName
                                    << "\" field in response to replSetHeartbeat to "
                                       "have type String, but found "
                                    << typeName(syncingToElement.type()));
    } else {
        _syncingTo = HostAndPort(syncingToElement.String());
    }

    // The embedded replica set config is optional; when present it is parsed last and its
    // parse status becomes the result of this function.
    const BSONElement rsConfigElement = doc[kConfigFieldName];
    if (rsConfigElement.eoo()) {
        _configSet = false;
        _config = ReplicaSetConfig();
        return Status::OK();
    } else if (rsConfigElement.type() != Object) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigFieldName
                                    << "\" in response to replSetHeartbeat to have type "
                                       "Object, but found "
                                    << typeName(rsConfigElement.type()));
    }
    _configSet = true;
    return _config.initialize(rsConfigElement.Obj());
}
// Placeholder implementation: always returns a default-constructed MemberState and does not
// consult any state of this coordinator.
MemberState TopologyCoordinatorImpl::getMemberState() const {
    // TODO: compute and return the actual member state.
    return MemberState();
}
void BackgroundSync::_produce(OperationContext* opCtx) { if (MONGO_FAIL_POINT(stopReplProducer)) { // This log output is used in js tests so please leave it. log() << "bgsync - stopReplProducer fail point " "enabled. Blocking until fail point is disabled."; // TODO(SERVER-27120): Remove the return statement and uncomment the while loop. // Currently we cannot block here or we prevent primaries from being fully elected since // we'll never call _signalNoNewDataForApplier. // while (MONGO_FAIL_POINT(stopReplProducer) && !inShutdown()) { // mongo::sleepsecs(1); // } mongo::sleepsecs(1); return; } // this oplog reader does not do a handshake because we don't want the server it's syncing // from to track how far it has synced { stdx::unique_lock<stdx::mutex> lock(_mutex); if (_lastOpTimeFetched.isNull()) { // then we're initial syncing and we're still waiting for this to be set lock.unlock(); sleepsecs(1); // if there is no one to sync from return; } if (_state != ProducerState::Running) { return; } } // find a target to sync from the last optime fetched OpTime lastOpTimeFetched; HostAndPort source; HostAndPort oldSource = _syncSourceHost; SyncSourceResolverResponse syncSourceResp; { const OpTime minValidSaved = _replicationProcess->getConsistencyMarkers()->getMinValid(opCtx); stdx::lock_guard<stdx::mutex> lock(_mutex); if (_state != ProducerState::Running) { return; } const auto requiredOpTime = (minValidSaved > _lastOpTimeFetched) ? minValidSaved : OpTime(); lastOpTimeFetched = _lastOpTimeFetched; _syncSourceHost = HostAndPort(); _syncSourceResolver = stdx::make_unique<SyncSourceResolver>( _replicationCoordinatorExternalState->getTaskExecutor(), _replCoord, lastOpTimeFetched, requiredOpTime, [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; }); } // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls // ReplicationCoordinator::chooseNewSyncSource(). 
ReplicationCoordinatorImpl's mutex has to // acquired before BackgroundSync's. // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because // we do not destroy this instance outside of this function which is only called from a single // thread. auto status = _syncSourceResolver->startup(); if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) { return; } fassertStatusOK(40349, status); _syncSourceResolver->join(); { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceResolver.reset(); } if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) { // All (accessible) sync sources were too stale. if (_replCoord->getMemberState().primary()) { warning() << "Too stale to catch up."; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen << " from " << syncSourceResp.getSyncSource(); _replCoord->abortCatchupIfNeeded().transitional_ignore(); return; } // We only need to mark ourselves as too stale once. if (_tooStale) { return; } // Mark yourself as too stale. _tooStale = true; error() << "too stale to catch up -- entering maintenance mode"; log() << "Our newest OpTime : " << lastOpTimeFetched; log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; // Activate maintenance mode and transition to RECOVERING. auto status = _replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode: " << status; } status = _replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!status.isOK()) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". 
Current state: " << _replCoord->getMemberState() << causedBy(status); } return; } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) { { stdx::lock_guard<stdx::mutex> lock(_mutex); _syncSourceHost = syncSourceResp.getSyncSource(); source = _syncSourceHost; } // If our sync source has not changed, it is likely caused by our heartbeat data map being // out of date. In that case we sleep for 1 second to reduce the amount we spin waiting // for our map to update. if (oldSource == source) { log() << "Chose same sync source candidate as last time, " << source << ". Sleeping for 1 second to avoid immediately choosing a new sync source for " "the same reason as last time."; sleepsecs(1); } } else { if (!syncSourceResp.isOK()) { log() << "failed to find sync source, received error " << syncSourceResp.syncSourceStatus.getStatus(); } // No sync source found. sleepsecs(1); return; } // If we find a good sync source after having gone too stale, disable maintenance mode so we can // transition to SECONDARY. if (_tooStale) { _tooStale = false; log() << "No longer too stale. Able to sync from " << _syncSourceHost; auto status = _replCoord->setMaintenanceMode(false); if (!status.isOK()) { warning() << "Failed to leave maintenance mode: " << status; } } long long lastHashFetched; { stdx::lock_guard<stdx::mutex> lock(_mutex); if (_state != ProducerState::Running) { return; } lastOpTimeFetched = _lastOpTimeFetched; lastHashFetched = _lastFetchedHash; } if (!_replCoord->getMemberState().primary()) { _replCoord->signalUpstreamUpdater(); } // Set the applied point if unset. This is most likely the first time we've established a sync // source since stepping down or otherwise clearing the applied point. We need to set this here, // before the OplogWriter gets a chance to append to the oplog. 
if (_replicationProcess->getConsistencyMarkers()->getAppliedThrough(opCtx).isNull()) { _replicationProcess->getConsistencyMarkers()->setAppliedThrough( opCtx, _replCoord->getMyLastAppliedOpTime()); } // "lastFetched" not used. Already set in _enqueueDocuments. Status fetcherReturnStatus = Status::OK(); DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState( _replCoord, _replicationCoordinatorExternalState, this); OplogFetcher* oplogFetcher; try { auto onOplogFetcherShutdownCallbackFn = [&fetcherReturnStatus](const Status& status) { fetcherReturnStatus = status; }; // The construction of OplogFetcher has to be outside bgsync mutex, because it calls // replication coordinator. auto oplogFetcherPtr = stdx::make_unique<OplogFetcher>( _replicationCoordinatorExternalState->getTaskExecutor(), OpTimeWithHash(lastHashFetched, lastOpTimeFetched), source, NamespaceString::kRsOplogNamespace, _replCoord->getConfig(), _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(), syncSourceResp.rbid, true /* requireFresherSyncSource */, &dataReplicatorExternalState, stdx::bind(&BackgroundSync::_enqueueDocuments, this, stdx::placeholders::_1, stdx::placeholders::_2, stdx::placeholders::_3), onOplogFetcherShutdownCallbackFn, bgSyncOplogFetcherBatchSize); stdx::lock_guard<stdx::mutex> lock(_mutex); if (_state != ProducerState::Running) { return; } _oplogFetcher = std::move(oplogFetcherPtr); oplogFetcher = _oplogFetcher.get(); } catch (const mongo::DBException& ex) { fassertFailedWithStatus(34440, exceptionToStatus()); } const auto logLevel = Command::testCommandsEnabled ? 
0 : 1; LOG(logLevel) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at " << oplogFetcher->getFindQuery_forTest()["filter"]; auto scheduleStatus = oplogFetcher->startup(); if (!scheduleStatus.isOK()) { warning() << "unable to schedule fetcher to read remote oplog on " << source << ": " << scheduleStatus; return; } oplogFetcher->join(); LOG(1) << "fetcher stopped reading remote oplog on " << source; // If the background sync is stopped after the fetcher is started, we need to // re-evaluate our sync source and oplog common point. if (getState() != ProducerState::Running) { log() << "Replication producer stopped after oplog fetcher finished returning a batch from " "our sync source. Abandoning this batch of oplog entries and re-evaluating our " "sync source."; return; } if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) { // This is bad because it means that our source // has not returned oplog entries in ascending ts order, and they need to be. warning() << redact(fetcherReturnStatus); // Do not blacklist the server here, it will be blacklisted when we try to reuse it, // if it can't return a matching oplog start from the last fetch oplog ts field. return; } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing) { auto storageInterface = StorageInterface::get(opCtx); _runRollback(opCtx, fetcherReturnStatus, source, syncSourceResp.rbid, storageInterface); } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) { Seconds blacklistDuration(60); warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source " << source << " for " << blacklistDuration << "."; _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration); } else if (!fetcherReturnStatus.isOK()) { warning() << "Fetcher stopped querying remote oplog with error: " << redact(fetcherReturnStatus); } }
/**
 * Populates this response from the BSON document 'doc'.
 *
 * Rejects documents containing fields outside kLegalHeartbeatFieldNames, then extracts the
 * replica-set flag, set name, member state, last optime (timestamp + term), sync source,
 * config version, primary id and term, all of which are required. The "has data" flag and the
 * embedded config are optional.
 *
 * Returns Status::OK() on success, or the first extraction/validation error encountered.
 * Malformed input is reported through the returned Status: the last-optime sub-document is
 * type-checked before it is opened, and the optime is built from the validated values.
 */
Status ReplSetHeartbeatResponseV1::initialize(const BSONObj& doc) {
    Status status =
        bsonCheckOnlyHasFields("ReplSetHeartbeatResponse", doc, kLegalHeartbeatFieldNames);
    if (!status.isOK())
        return status;

    status = bsonExtractBooleanField(doc, kIsReplSetFieldName, &_isReplSet);
    if (!status.isOK())
        return status;

    status = bsonExtractStringField(doc, kReplSetFieldName, &_setName);
    if (!status.isOK())
        return status;

    long long stateInt;
    status = bsonExtractIntegerField(doc, kMemberStateFieldName, &stateInt);
    if (!status.isOK())
        return status;
    if (stateInt < 0 || stateInt > MemberState::RS_MAX) {
        return Status(ErrorCodes::BadValue,
                      str::stream() << "Value for \"" << kMemberStateFieldName
                                    << "\" in response to replSetHeartbeat is "
                                       "out of range; legal values are non-negative and no more than "
                                    << MemberState::RS_MAX);
    }
    _state = MemberState(static_cast<int>(stateInt));

    // extracting the lastCommittedOp is a bit of a process
    // Validate the element before calling Obj(), which requires an Object-typed element.
    const BSONElement lastOpTimeElement = doc[kLastOpTimeFieldName];
    if (lastOpTimeElement.eoo()) {
        return Status(ErrorCodes::NoSuchKey,
                      str::stream() << "Response to replSetHeartbeat missing required \""
                                    << kLastOpTimeFieldName
                                    << "\" field");
    }
    if (lastOpTimeElement.type() != Object) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kLastOpTimeFieldName
                                    << "\" in response to replSetHeartbeat to have type "
                                       "Object, but found "
                                    << typeName(lastOpTimeElement.type()));
    }
    BSONObj lastOpTime = lastOpTimeElement.Obj();

    Timestamp ts;
    status = bsonExtractTimestampField(lastOpTime, kOpTimeFieldName, &ts);
    if (!status.isOK())
        return status;

    long long term;
    status = bsonExtractIntegerField(lastOpTime, kTermFieldName, &term);
    if (!status.isOK())
        return status;

    // Use the values validated above. Re-reading the term with Long() would reject a term stored
    // as a 32-bit integer even though bsonExtractIntegerField() accepted it.
    _lastOpTime = OpTime(ts, term);

    status = bsonExtractStringField(doc, kSyncSourceFieldName, &_syncingTo);
    if (!status.isOK())
        return status;

    status = bsonExtractIntegerField(doc, kConfigVersionFieldName, &_configVersion);
    if (!status.isOK())
        return status;

    status = bsonExtractIntegerField(doc, kPrimaryIdFieldName, &_primaryId);
    if (!status.isOK())
        return status;

    status = bsonExtractIntegerField(doc, kTermFieldName, &_term);
    if (!status.isOK())
        return status;

    const BSONElement hasDataElement = doc[kHasDataFieldName];
    _hasDataSet = !hasDataElement.eoo();
    _hasData = hasDataElement.trueValue();

    const BSONElement rsConfigElement = doc[kConfigFieldName];
    if (rsConfigElement.eoo()) {
        // No embedded config: reset to an empty config and finish successfully.
        _configSet = false;
        _config = ReplicaSetConfig();
        return Status::OK();
    } else if (rsConfigElement.type() != Object) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigFieldName
                                    << "\" in response to replSetHeartbeat to have type "
                                       "Object, but found "
                                    << typeName(rsConfigElement.type()));
    }
    _configSet = true;
    return _config.initialize(rsConfigElement.Obj());
}
/**
 * Fills in this heartbeat response from the BSON document 'doc'.
 *
 * Every field is optional unless noted otherwise; each optional field has a matching "set"
 * flag (or is cleared) when absent. A field that is present with an unexpected BSON type makes
 * the function return TypeMismatch. The config version is mandatory once an optime is present.
 *
 * Returns Status::OK() on success, InconsistentReplicaSetNames when the mismatch flag is set,
 * the error carried by the document when it has no set name and is not "ok", or the status of
 * parsing the embedded config when one is present.
 */
Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc) {
    // Builds the error returned when a field that must be a String has another type.
    const auto stringTypeMismatch = [](const std::string& fieldName, const BSONElement& elem) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << fieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found "
                                    << typeName(elem.type()));
    };

    // Builds the error returned when a field that must be a Date or Timestamp has another type.
    const auto timeTypeMismatch = [](const std::string& fieldName, const BSONElement& elem) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << fieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(elem.type()));
    };

    // Old versions set this even though they returned not "ok"
    _mismatch = doc[kMismatchFieldName].trueValue();
    if (_mismatch) {
        return Status(ErrorCodes::InconsistentReplicaSetNames, "replica set name doesn't match.");
    }

    // Old versions sometimes set the replica set name ("set") but ok:0
    const BSONElement setNameElem = doc[kReplSetFieldName];
    if (setNameElem.eoo()) {
        _setName.clear();
    } else {
        if (setNameElem.type() != String) {
            return stringTypeMismatch(kReplSetFieldName, setNameElem);
        }
        _setName = setNameElem.String();
    }

    // No set name and not "ok": surface the error the remote node reported.
    if (_setName.empty() && !doc[kOkFieldName].trueValue()) {
        const std::string remoteErrMsg = doc[kErrMsgFieldName].str();
        const BSONElement remoteErrCode = doc[kErrorCodeFieldName];
        if (!remoteErrCode.ok()) {
            return Status(ErrorCodes::UnknownError, remoteErrMsg);
        }
        if (!remoteErrCode.isNumber()) {
            return Status(ErrorCodes::BadValue, "Error code is not a number!");
        }
        return Status(ErrorCodes::Error(remoteErrCode.numberInt()), remoteErrMsg);
    }

    const BSONElement hasDataElem = doc[kHasDataFieldName];
    _hasDataSet = !hasDataElem.eoo();
    _hasData = hasDataElem.trueValue();

    // Election time: accepted either as a Timestamp or as a Date.
    const BSONElement electionTimeElem = doc[kElectionTimeFieldName];
    if (electionTimeElem.eoo()) {
        _electionTimeSet = false;
    } else {
        if (electionTimeElem.type() == Timestamp) {
            _electionTime = electionTimeElem._opTime();
        } else if (electionTimeElem.type() == Date) {
            _electionTime = OpTime(electionTimeElem.date());
        } else {
            return timeTypeMismatch(kElectionTimeFieldName, electionTimeElem);
        }
        _electionTimeSet = true;
    }

    // Time: any numeric type, interpreted as a number of seconds.
    const BSONElement timeElem = doc[kTimeFieldName];
    if (timeElem.eoo()) {
        _timeSet = false;
    } else {
        if (!timeElem.isNumber()) {
            return Status(ErrorCodes::TypeMismatch,
                          str::stream() << "Expected \"" << kTimeFieldName
                                        << "\" field in response to replSetHeartbeat "
                                           "command to have a numeric type, but found type "
                                        << typeName(timeElem.type()));
        }
        _timeSet = true;
        _time = Seconds(timeElem.numberLong());
    }

    // Optime: accepted either as a Timestamp or as a Date.
    const BSONElement opTimeElem = doc[kOpTimeFieldName];
    if (opTimeElem.eoo()) {
        _opTimeSet = false;
    } else {
        if (opTimeElem.type() == Timestamp) {
            _opTime = opTimeElem._opTime();
        } else if (opTimeElem.type() == Date) {
            _opTime = OpTime(opTimeElem.date());
        } else {
            return timeTypeMismatch(kOpTimeFieldName, opTimeElem);
        }
        _opTimeSet = true;
    }

    const BSONElement electableElem = doc[kIsElectableFieldName];
    _electableSet = !electableElem.eoo();
    if (_electableSet) {
        _electable = electableElem.trueValue();
    }

    _isReplSet = doc[kIsReplSetFieldName].trueValue();

    // Member state: must be a 32- or 64-bit integer within [0, MemberState::RS_MAX].
    const BSONElement stateElem = doc[kMemberStateFieldName];
    if (stateElem.eoo()) {
        _stateSet = false;
    } else {
        if (stateElem.type() != NumberInt && stateElem.type() != NumberLong) {
            return Status(ErrorCodes::TypeMismatch,
                          str::stream()
                              << "Expected \"" << kMemberStateFieldName
                              << "\" field in response to replSetHeartbeat "
                                 "command to have type NumberInt or NumberLong, but found type "
                              << typeName(stateElem.type()));
        }
        const long long stateValue = stateElem.numberLong();
        if (stateValue < 0 || stateValue > MemberState::RS_MAX) {
            return Status(ErrorCodes::BadValue,
                          str::stream()
                              << "Value for \"" << kMemberStateFieldName
                              << "\" in response to replSetHeartbeat is "
                                 "out of range; legal values are non-negative and no more than "
                              << MemberState::RS_MAX);
        }
        _stateSet = true;
        _state = MemberState(static_cast<int>(stateValue));
    }

    _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue();

    // Not required for the case of uninitialized members -- they have no config
    const BSONElement configVersionElem = doc[kConfigVersionFieldName];
    if (configVersionElem.eoo()) {
        // If we have an optime then we must have a version
        if (_opTimeSet) {
            return Status(ErrorCodes::NoSuchKey,
                          str::stream() << "Response to replSetHeartbeat missing required \""
                                        << kConfigVersionFieldName
                                        << "\" field even though initialized");
        }
    } else if (configVersionElem.type() != NumberInt) {
        // If there is a "v" (config version) then it must be an int.
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigVersionFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type NumberInt, but found "
                                    << typeName(configVersionElem.type()));
    }
    _version = configVersionElem.numberInt();

    const BSONElement hbMsgElem = doc[kHbMessageFieldName];
    if (hbMsgElem.eoo()) {
        _hbmsg.clear();
    } else {
        if (hbMsgElem.type() != String) {
            return stringTypeMismatch(kHbMessageFieldName, hbMsgElem);
        }
        _hbmsg = hbMsgElem.String();
    }

    const BSONElement syncSourceElem = doc[kSyncSourceFieldName];
    if (syncSourceElem.eoo()) {
        _syncingTo.clear();
    } else {
        if (syncSourceElem.type() != String) {
            return stringTypeMismatch(kSyncSourceFieldName, syncSourceElem);
        }
        _syncingTo = syncSourceElem.String();
    }

    // Embedded config: when absent, reset to an empty config and finish successfully.
    const BSONElement configElem = doc[kConfigFieldName];
    if (configElem.eoo()) {
        _configSet = false;
        _config = ReplicaSetConfig();
        return Status::OK();
    }
    if (configElem.type() != Object) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigFieldName
                                    << "\" in response to replSetHeartbeat to have type "
                                       "Object, but found "
                                    << typeName(configElem.type()));
    }
    _configSet = true;
    return _config.initialize(configElem.Obj());
}
void OplogReader::connectToSyncSource(OperationContext* txn, OpTime lastOpTimeFetched, ReplicationCoordinator* replCoord) { const OpTime sentinel(Milliseconds(curTimeMillis64()).total_seconds(), 0); OpTime oldestOpTimeSeen = sentinel; invariant(conn() == NULL); while (true) { HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched); if (candidate.empty()) { if (oldestOpTimeSeen == sentinel) { // If, in this invocation of connectToSyncSource(), we did not successfully // connect to any node ahead of us, // we apparently have no sync sources to connect to. // This situation is common; e.g. if there are no writes to the primary at // the moment. return; } // Connected to at least one member, but in all cases we were too stale to use them // as a sync source. log() << "replSet error RS102 too stale to catch up"; log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong(); log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong(); log() << "replSet " "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; setMinValid(txn, oldestOpTimeSeen); bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << replCoord->getMemberState(); } return; } if (!connect(candidate)) { LOG(2) << "replSet can't connect to " << candidate.toString() << " to read operations"; resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10 * 1000)); continue; } // Read the first (oldest) op and confirm that it's not newer than our last // fetched op. Otherwise, we have fallen off the back of that source's oplog. BSONObj remoteOldestOp(findOne(rsoplog, Query())); BSONElement tsElem(remoteOldestOp["ts"]); if (tsElem.type() != Timestamp) { // This member's got a bad op in its oplog. 
warning() << "oplog invalid format on node " << candidate.toString(); resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000)); continue; } OpTime remoteOldOpTime = tsElem._opTime(); if (!lastOpTimeFetched.isNull() && lastOpTimeFetched < remoteOldOpTime) { // We're too stale to use this sync source. resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 600 * 1000)); if (oldestOpTimeSeen > remoteOldOpTime) { warning() << "we are too stale to use " << candidate.toString() << " as a sync source"; oldestOpTimeSeen = remoteOldOpTime; } continue; } // Got a valid sync source. return; } // while (true) }
/**
 * Fills in this heartbeat response from the BSON document 'doc'.
 *
 * The config version and replica set name are required; every other field is optional. An
 * optional field that is absent clears its value or its "set" flag. A field that is present
 * with an unexpected BSON type yields TypeMismatch, a missing required field yields NoSuchKey,
 * and an out-of-range member state yields BadValue.
 *
 * Returns Status::OK() on success, or the status of parsing the embedded config when one is
 * present.
 */
Status ReplSetHeartbeatResponse::initialize(const BSONObj& doc) {
    const BSONElement electionTimeElement = doc[kElectionTimeFieldName];
    if (electionTimeElement.eoo()) {
        _electionTimeSet = false;
    } else if (electionTimeElement.type() == Timestamp) {
        _electionTimeSet = true;
        _electionTime = electionTimeElement._opTime();
    } else if (electionTimeElement.type() == Date) {
        // Record that an election time was supplied (the flag, not the value, is set here).
        _electionTimeSet = true;
        _electionTime = OpTime(electionTimeElement.date());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kElectionTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(electionTimeElement.type()));
    }

    const BSONElement timeElement = doc[kTimeFieldName];
    if (timeElement.eoo()) {
        _timeSet = false;
    } else if (timeElement.isNumber()) {
        _timeSet = true;
        _time = Seconds(timeElement.numberLong());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have a numeric type, but found type "
                                    << typeName(timeElement.type()));
    }

    const BSONElement opTimeElement = doc[kOpTimeFieldName];
    if (opTimeElement.eoo()) {
        _opTimeSet = false;
    } else if (opTimeElement.type() == Timestamp) {
        _opTimeSet = true;
        _opTime = opTimeElement._opTime();
    } else if (opTimeElement.type() == Date) {
        _opTimeSet = true;
        _opTime = OpTime(opTimeElement.date());
    } else {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kOpTimeFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type Date or Timestamp, but found type "
                                    << typeName(opTimeElement.type()));
    }

    const BSONElement electableElement = doc[kIsElectableFieldName];
    if (electableElement.eoo()) {
        _electableSet = false;
    } else {
        _electableSet = true;
        _electable = electableElement.trueValue();
    }

    _mismatch = doc[kMismatchFieldName].trueValue();
    _isReplSet = doc[kIsReplSetFieldName].trueValue();

    const BSONElement memberStateElement = doc[kMemberStateFieldName];
    if (memberStateElement.eoo()) {
        _stateSet = false;
    } else if (memberStateElement.type() != NumberInt &&
               memberStateElement.type() != NumberLong) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kMemberStateFieldName
                                    << "\" field in response to replSetHeartbeat "
                                       "command to have type NumberInt or NumberLong, but found type "
                                    << typeName(memberStateElement.type()));
    } else {
        long long stateInt = memberStateElement.numberLong();
        if (stateInt < 0 || stateInt > MemberState::RS_MAX) {
            return Status(ErrorCodes::BadValue,
                          str::stream() << "Value for \"" << kMemberStateFieldName
                                        << "\" in response to replSetHeartbeat is "
                                           "out of range; legal values are non-negative and no more than "
                                        << MemberState::RS_MAX);
        }
        // Mark the state as present; the absent branch above clears this flag.
        _stateSet = true;
        _state = MemberState(static_cast<int>(stateInt));
    }

    _stateDisagreement = doc[kHasStateDisagreementFieldName].trueValue();

    // The config version is required and must be a 32-bit integer.
    const BSONElement versionElement = doc[kConfigVersionFieldName];
    if (versionElement.eoo()) {
        return Status(ErrorCodes::NoSuchKey,
                      str::stream() << "Response to replSetHeartbeat missing required \""
                                    << kConfigVersionFieldName
                                    << "\" field");
    }
    if (versionElement.type() != NumberInt) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigVersionFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type NumberInt, but found "
                                    << typeName(versionElement.type()));
    }
    _version = versionElement.numberInt();

    // The replica set name is required and must be a string.
    const BSONElement replSetNameElement = doc[kReplSetFieldName];
    if (replSetNameElement.eoo()) {
        return Status(ErrorCodes::NoSuchKey,
                      str::stream() << "Response to replSetHeartbeat missing required \""
                                    << kReplSetFieldName
                                    << "\" field");
    }
    if (replSetNameElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kReplSetFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found "
                                    << typeName(replSetNameElement.type()));
    }
    _setName = replSetNameElement.String();

    const BSONElement hbMsgElement = doc[kHbMessageFieldName];
    if (hbMsgElement.eoo()) {
        _hbmsg.clear();
    } else if (hbMsgElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kHbMessageFieldName
                                    << "\" field in response to replSetHeartbeat to have "
                                       "type String, but found "
                                    << typeName(hbMsgElement.type()));
    } else {
        // Only read the string once the element is known to exist and be a String.
        _hbmsg = hbMsgElement.String();
    }

    const BSONElement syncingToElement = doc[kSyncSourceFieldName];
    if (syncingToElement.eoo()) {
        _syncingTo.clear();
    } else if (syncingToElement.type() != String) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kSyncSourceFieldName
                                    << "\" field in response to replSetHeartbeat to "
                                       "have type String, but found "
                                    << typeName(syncingToElement.type()));
    } else {
        // Only read the string once the element is known to exist and be a String.
        _syncingTo = syncingToElement.String();
    }

    const BSONElement rsConfigElement = doc[kConfigFieldName];
    if (rsConfigElement.eoo()) {
        // No embedded config: reset to an empty config and stop here, so that Obj() is never
        // called on a missing element and _configSet stays false.
        _configSet = false;
        _config = ReplicaSetConfig();
        return Status::OK();
    } else if (rsConfigElement.type() != Object) {
        return Status(ErrorCodes::TypeMismatch,
                      str::stream() << "Expected \"" << kConfigFieldName
                                    << "\" in response to replSetHeartbeat to have type "
                                       "Object, but found "
                                    << typeName(rsConfigElement.type()));
    }
    _configSet = true;
    return _config.initialize(rsConfigElement.Obj());
}
void OplogReader::connectToSyncSource(OperationContext* txn, const OpTime& lastOpTimeFetched, const OpTime& requiredOpTime, ReplicationCoordinator* replCoord) { const Timestamp sentinelTimestamp(duration_cast<Seconds>(Date_t::now().toDurationSinceEpoch()), 0); const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max()); OpTime oldestOpTimeSeen = sentinel; invariant(conn() == NULL); while (true) { HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched); if (candidate.empty()) { if (oldestOpTimeSeen == sentinel) { // If, in this invocation of connectToSyncSource(), we did not successfully // connect to any node ahead of us, // we apparently have no sync sources to connect to. // This situation is common; e.g. if there are no writes to the primary at // the moment. return; } // Connected to at least one member, but in all cases we were too stale to use them // as a sync source. error() << "too stale to catch up -- entering maintenance mode"; log() << "our last optime : " << lastOpTimeFetched; log() << "oldest available is " << oldestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; auto status = replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode: " << status; } bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << replCoord->getMemberState(); } return; } if (!connect(candidate)) { LOG(2) << "can't connect to " << candidate.toString() << " to read operations"; resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10)); continue; } // Read the first (oldest) op and confirm that it's not newer than our last // fetched op. Otherwise, we have fallen off the back of that source's oplog. 
BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query())); OpTime remoteOldOpTime = fassertStatusOK(28776, OpTime::parseFromOplogEntry(remoteOldestOp)); // remoteOldOpTime may come from a very old config, so we cannot compare their terms. if (!lastOpTimeFetched.isNull() && lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) { // We're too stale to use this sync source. resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1)); if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) { warning() << "we are too stale to use " << candidate.toString() << " as a sync source"; oldestOpTimeSeen = remoteOldOpTime; } continue; } // Check if sync source contains required optime. if (!requiredOpTime.isNull()) { // This query is structured so that it is executed on the sync source using the oplog // start hack (oplogReplay=true and $gt/$gte predicate over "ts"). auto ts = requiredOpTime.getTimestamp(); tailingQuery(rsOplogName.c_str(), BSON("ts" << BSON("$gte" << ts << "$lte" << ts))); auto status = _compareRequiredOpTimeWithQueryResponse(requiredOpTime); if (!status.isOK()) { const auto blacklistDuration = Seconds(60); const auto until = Date_t::now() + blacklistDuration; warning() << "We cannot use " << candidate.toString() << " as a sync source because it does not contain the necessary " "operations for us to reach a consistent state: " << status << " last fetched optime: " << lastOpTimeFetched << ". required optime: " << requiredOpTime << ". Blacklisting this sync source for " << blacklistDuration << " until: " << until; resetConnection(); replCoord->blacklistSyncSource(candidate, until); continue; } resetCursor(); } // TODO: If we were too stale (recovering with maintenance mode on), then turn it off, to // allow becoming secondary/etc. // Got a valid sync source. return; } // while (true) }