void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long originalTerm) {
    invariant(_voteRequester);
    invariant(!_electionWinnerDeclarer);
    LoseElectionGuardV1 lossGuard(this);

    if (_topCoord->getTerm() != originalTerm) {
        log() << "not becoming primary, we have been superseded already";
        return;
    }

    const VoteRequester::VoteRequestResult endResult = _voteRequester->getResult();

    if (endResult == VoteRequester::InsufficientVotes) {
        log() << "not becoming primary, we received insufficient votes";
        return;
    } else if (endResult == VoteRequester::StaleTerm) {
        log() << "not becoming primary, we have been superseded already";
        return;
    } else if (endResult != VoteRequester::SuccessfullyElected) {
        log() << "not becoming primary, we received an unexpected problem";
        return;
    }

    log() << "election succeeded, assuming primary role in term " << _topCoord->getTerm();
    // Prevent last committed optime from updating until we finish draining.
    _setFirstOpTimeOfMyTerm(
        OpTime(Timestamp(std::numeric_limits<int>::max(), 0), std::numeric_limits<int>::max()));
    _performPostMemberStateUpdateAction(kActionWinElection);

    _voteRequester.reset(nullptr);
    _replExecutor.signalEvent(_electionFinishedEvent);
    lossGuard.dismiss();
}
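// The election-completion handlers in this file rely on LoseElectionGuard /
// LoseElectionGuardV1 to roll back election state on every early-return path unless
// dismiss() is called once the win is committed. The following is a minimal,
// hypothetical sketch of that dismissable-guard pattern, not the actual mongodb
// implementation; the class name and callback are illustrative only.
#include <functional>
#include <utility>

class DismissableGuard {
public:
    explicit DismissableGuard(std::function<void()> onExit) : _onExit(std::move(onExit)) {}
    ~DismissableGuard() {
        if (!_dismissed && _onExit) {
            _onExit();  // e.g. clear vote state and signal the election-finished event
        }
    }
    DismissableGuard(const DismissableGuard&) = delete;
    DismissableGuard& operator=(const DismissableGuard&) = delete;
    void dismiss() {
        _dismissed = true;  // success path: skip the loss cleanup
    }

private:
    std::function<void()> _onExit;
    bool _dismissed = false;
};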
void ReplicationCoordinatorImpl::_heartbeatReconfigFinish(
    const ReplicationExecutor::CallbackData& cbData,
    const ReplicaSetConfig& newConfig,
    StatusWith<int> myIndex) {
    if (cbData.status == ErrorCodes::CallbackCanceled) {
        return;
    }

    boost::unique_lock<boost::mutex> lk(_mutex);
    invariant(_rsConfigState == kConfigHBReconfiguring);
    invariant(!_rsConfig.isInitialized() ||
              _rsConfig.getConfigVersion() < newConfig.getConfigVersion());

    if (_getMemberState_inlock().primary() && !cbData.txn) {
        // Not having an OperationContext in the CallbackData means we definitely aren't holding
        // the global lock.  Since we're primary and this reconfig could cause us to stepdown,
        // reschedule this work with the global exclusive lock so the stepdown is safe.
        // TODO(spencer): When we *do* have an OperationContext, consult it to confirm that
        // we are indeed holding the global lock.
        _replExecutor.scheduleWorkWithGlobalExclusiveLock(
            stdx::bind(&ReplicationCoordinatorImpl::_heartbeatReconfigFinish,
                       this,
                       stdx::placeholders::_1,
                       newConfig,
                       myIndex));
        return;
    }

    if (!myIndex.isOK()) {
        switch (myIndex.getStatus().code()) {
            case ErrorCodes::NodeNotFound:
                log() << "Cannot find self in new replica set configuration; I must be removed; "
                      << myIndex.getStatus();
                break;
            case ErrorCodes::DuplicateKey:
                error() << "Several entries in new config represent this node; "
                           "Removing self until an acceptable configuration arrives; "
                        << myIndex.getStatus();
                break;
            default:
                error() << "Could not validate configuration received from remote node; "
                           "Removing self until an acceptable configuration arrives; "
                        << myIndex.getStatus();
                break;
        }
        myIndex = StatusWith<int>(-1);
    }

    const PostMemberStateUpdateAction action =
        _setCurrentRSConfig_inlock(newConfig, myIndex.getValue());
    lk.unlock();
    _performPostMemberStateUpdateAction(action);
}
void ReplicationCoordinatorImpl::_onElectCmdRunnerComplete() {
    stdx::unique_lock<stdx::mutex> lk(_mutex);
    LoseElectionGuard lossGuard(_topCoord.get(),
                                _replExecutor.get(),
                                &_freshnessChecker,
                                &_electCmdRunner,
                                &_electionFinishedEvent);
    invariant(_freshnessChecker);
    invariant(_electCmdRunner);
    if (_electCmdRunner->isCanceled()) {
        LOG(2) << "Election canceled during elect self phase";
        return;
    }

    const int receivedVotes = _electCmdRunner->getReceivedVotes();

    if (receivedVotes < _rsConfig.getMajorityVoteCount()) {
        log() << "couldn't elect self, only received " << receivedVotes
              << " votes, but needed at least " << _rsConfig.getMajorityVoteCount();
        // Suppress ourselves from standing for election again, giving other nodes a chance
        // to win their elections.
        const auto ms = Milliseconds(_nextRandomInt64_inlock(1000) + 50);
        const Date_t now(_replExecutor->now());
        const Date_t nextCandidateTime = now + ms;
        log() << "waiting until " << nextCandidateTime << " before standing for election again";
        _topCoord->setElectionSleepUntil(nextCandidateTime);
        _scheduleWorkAt(nextCandidateTime,
                        stdx::bind(&ReplicationCoordinatorImpl::_recoverFromElectionTie,
                                   this,
                                   stdx::placeholders::_1));
        return;
    }

    if (_rsConfig.getConfigVersion() != _freshnessChecker->getOriginalConfigVersion()) {
        log() << "config version changed during our election, ignoring result";
        return;
    }

    log() << "election succeeded, assuming primary role";

    lossGuard.dismiss();
    _freshnessChecker.reset(NULL);
    _electCmdRunner.reset(NULL);
    auto electionFinishedEvent = _electionFinishedEvent;
    lk.unlock();
    _performPostMemberStateUpdateAction(kActionWinElection);
    _replExecutor->signalEvent(electionFinishedEvent);
}
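// A hedged illustration of the tie-breaking backoff above: _nextRandomInt64_inlock(1000) + 50
// yields a random delay of roughly 50ms to 1049ms, so nodes that just tied are unlikely to
// stand for election again at the same instant. The sketch below uses <random> and <chrono>
// as stand-ins for the replication executor's clock and RNG; the function name is
// illustrative only.
#include <chrono>
#include <random>

std::chrono::steady_clock::time_point computeNextCandidateTime() {
    static std::mt19937_64 rng{std::random_device{}()};
    std::uniform_int_distribution<long long> jitter(0, 999);  // mirrors _nextRandomInt64_inlock(1000)
    const auto delay = std::chrono::milliseconds(jitter(rng) + 50);
    return std::chrono::steady_clock::now() + delay;  // when to stand for election again
}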
void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long originalTerm) {
    invariant(_voteRequester);
    LoseElectionGuardV1 lossGuard(this);

    LockGuard lk(_topoMutex);

    if (_topCoord->getTerm() != originalTerm) {
        log() << "not becoming primary, we have been superseded already";
        return;
    }

    const VoteRequester::Result endResult = _voteRequester->getResult();

    switch (endResult) {
        case VoteRequester::Result::kInsufficientVotes:
            log() << "not becoming primary, we received insufficient votes";
            return;
        case VoteRequester::Result::kStaleTerm:
            log() << "not becoming primary, we have been superseded already";
            return;
        case VoteRequester::Result::kSuccessfullyElected:
            log() << "election succeeded, assuming primary role in term "
                  << _topCoord->getTerm();
            break;
    }

    {
        // Mark all nodes that responded to our vote request as up to avoid immediately
        // relinquishing primary.
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        Date_t now = _replExecutor.now();
        const unordered_set<HostAndPort> liveNodes = _voteRequester->getResponders();
        for (auto& nodeInfo : _slaveInfo) {
            if (liveNodes.count(nodeInfo.hostAndPort)) {
                nodeInfo.down = false;
                nodeInfo.lastUpdate = now;
            }
        }
    }

    // Prevent last committed optime from updating until we finish draining.
    _setFirstOpTimeOfMyTerm(
        OpTime(Timestamp(std::numeric_limits<int>::max(), 0), std::numeric_limits<int>::max()));
    _performPostMemberStateUpdateAction(kActionWinElection);

    _voteRequester.reset(nullptr);
    _replExecutor.signalEvent(_electionFinishedEvent);
    lossGuard.dismiss();
}
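// A hedged note on the sentinel optime set above: because both the timestamp seconds and
// the term are set to the maximum int32 value, the "first optime of my term" compares at or
// above any optime a real replicated write can produce, so the last committed optime cannot
// advance until drain mode completes and a real first optime of the term replaces it. The
// stand-in below only demonstrates that ordering property; FakeOpTime is not the mongodb
// OpTime type, and since both sentinel components are maximal the conclusion holds whichever
// field the real comparison considers first.
#include <cassert>
#include <limits>
#include <utility>

int main() {
    using FakeOpTime = std::pair<long long, long long>;  // (timestamp seconds, term), compared lexicographically
    const FakeOpTime sentinel{std::numeric_limits<int>::max(), std::numeric_limits<int>::max()};
    const FakeOpTime someRealWrite{1700000000, 42};  // hypothetical replicated write
    assert(someRealWrite < sentinel);                // the commit point cannot pass the sentinel
    return 0;
}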
void ReplicationCoordinatorImpl::_heartbeatStepDownFinish(
    const ReplicationExecutor::CallbackData& cbData) {
    if (cbData.status == ErrorCodes::CallbackCanceled) {
        return;
    }
    invariant(cbData.txn);

    // TODO Add invariant that we've got global shared or global exclusive lock, when supported
    // by lock manager.

    boost::unique_lock<boost::mutex> lk(_mutex);
    _topCoord->stepDownIfPending();
    const PostMemberStateUpdateAction action = _updateMemberStateFromTopologyCoordinator_inlock();
    lk.unlock();
    _performPostMemberStateUpdateAction(action);
}
void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long originalTerm) {
    invariant(_voteRequester);
    invariant(!_electionWinnerDeclarer);
    LoseElectionGuardV1 lossGuard(_topCoord.get(),
                                  &_replExecutor,
                                  &_voteRequester,
                                  &_electionWinnerDeclarer,
                                  &_electionFinishedEvent);
    if (_topCoord->getTerm() != originalTerm) {
        log() << "not becoming primary, we have been superseded already";
        return;
    }

    const VoteRequester::VoteRequestResult endResult = _voteRequester->getResult();

    if (endResult == VoteRequester::InsufficientVotes) {
        log() << "not becoming primary, we received insufficient votes";
        return;
    } else if (endResult == VoteRequester::StaleTerm) {
        log() << "not becoming primary, we have been superseded already";
        return;
    } else if (endResult != VoteRequester::SuccessfullyElected) {
        log() << "not becoming primary, we received an unexpected problem";
        return;
    }

    log() << "election succeeded, assuming primary role";
    _performPostMemberStateUpdateAction(kActionWinElection);

    _electionWinnerDeclarer.reset(new ElectionWinnerDeclarer);
    StatusWith<ReplicationExecutor::EventHandle> nextPhaseEvh = _electionWinnerDeclarer->start(
        &_replExecutor,
        _rsConfig.getReplSetName(),
        _rsConfig.getMemberAt(_selfIndex).getId(),
        _topCoord->getTerm(),
        _topCoord->getMaybeUpHostAndPorts(),
        stdx::bind(&ReplicationCoordinatorImpl::_onElectionWinnerDeclarerComplete, this));
    if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) {
        return;
    }
    fassert(28644, nextPhaseEvh.getStatus());

    lossGuard.dismiss();
}
void ReplicationCoordinatorImpl::_onVoteRequestComplete(long long newTerm) {
    stdx::unique_lock<stdx::mutex> lk(_mutex);
    LoseElectionGuardV1 lossGuard(this);

    invariant(_voteRequester);

    if (_topCoord->getTerm() != newTerm) {
        log() << "not becoming primary, we have been superseded already during election. "
              << "election term: " << newTerm << ", current term: " << _topCoord->getTerm();
        return;
    }

    const VoteRequester::Result endResult = _voteRequester->getResult();
    invariant(endResult != VoteRequester::Result::kPrimaryRespondedNo);

    switch (endResult) {
        case VoteRequester::Result::kInsufficientVotes:
            log() << "not becoming primary, we received insufficient votes";
            return;
        case VoteRequester::Result::kStaleTerm:
            log() << "not becoming primary, we have been superseded already";
            return;
        case VoteRequester::Result::kSuccessfullyElected:
            log() << "election succeeded, assuming primary role in term " << _topCoord->getTerm();
            break;
        case VoteRequester::Result::kPrimaryRespondedNo:
            // This is impossible because we would only require the primary's
            // vote during a dry run.
            invariant(false);
    }

    // Mark all nodes that responded to our vote request as up to avoid immediately
    // relinquishing primary.
    Date_t now = _replExecutor->now();
    _topCoord->resetMemberTimeouts(now, _voteRequester->getResponders());

    _voteRequester.reset();
    auto electionFinishedEvent = _electionFinishedEvent;
    lk.unlock();
    _performPostMemberStateUpdateAction(kActionWinElection);
    _replExecutor->signalEvent(electionFinishedEvent);
    lossGuard.dismiss();
}
void ReplicationCoordinatorImpl::_handleHeartbeatResponseAction(
    const HeartbeatResponseAction& action,
    const StatusWith<ReplSetHeartbeatResponse>& responseStatus) {
    switch (action.getAction()) {
        case HeartbeatResponseAction::NoAction:
            // Update the cached member state if different than the current topology member state
            if (_memberState != _topCoord->getMemberState()) {
                boost::unique_lock<boost::mutex> lk(_mutex);
                const PostMemberStateUpdateAction postUpdateAction =
                    _updateMemberStateFromTopologyCoordinator_inlock();
                lk.unlock();
                _performPostMemberStateUpdateAction(postUpdateAction);
            }
            break;
        case HeartbeatResponseAction::Reconfig:
            invariant(responseStatus.isOK());
            _scheduleHeartbeatReconfig(responseStatus.getValue().getConfig());
            break;
        case HeartbeatResponseAction::StartElection:
            if (isV1ElectionProtocol()) {
                _startElectSelfV1();
            } else {
                _startElectSelf();
            }
            break;
        case HeartbeatResponseAction::StepDownSelf:
            invariant(action.getPrimaryConfigIndex() == _selfIndex);
            _heartbeatStepDownStart();
            break;
        case HeartbeatResponseAction::StepDownRemotePrimary: {
            invariant(action.getPrimaryConfigIndex() != _selfIndex);
            _requestRemotePrimaryStepdown(
                _rsConfig.getMemberAt(action.getPrimaryConfigIndex()).getHostAndPort());
            break;
        }
        default:
            severe() << "Illegal heartbeat response action code " << int(action.getAction());
            invariant(false);
    }
}