void ReplicationCoordinatorImpl::_recoverFromElectionTie(
    const ReplicationExecutor::CallbackArgs& cbData) {
    if (!cbData.status.isOK()) {
        return;
    }
    if (_topCoord->checkShouldStandForElection(_replExecutor.now(), getMyLastOptime())) {
        _startElectSelf();
    }
}
void ReplicationCoordinatorImpl::_recoverFromElectionTie(
    const ReplicationExecutor::CallbackArgs& cbData) {
    if (!cbData.status.isOK()) {
        return;
    }
    auto now = _replExecutor.now();
    auto lastOpApplied = getMyLastOptime();
    if (_topCoord->checkShouldStandForElection(now, lastOpApplied)) {
        fassert(28817, _topCoord->becomeCandidateIfElectable(now, lastOpApplied));
        _startElectSelf();
    }
}
void ReplicationCoordinatorImpl::_onDryRunComplete(long long originalTerm) {
    invariant(_voteRequester);
    invariant(!_electionWinnerDeclarer);
    LoseElectionGuardV1 lossGuard(_topCoord.get(),
                                  &_replExecutor,
                                  &_voteRequester,
                                  &_electionWinnerDeclarer,
                                  &_electionFinishedEvent);

    if (_topCoord->getTerm() != originalTerm) {
        log() << "not running for primary, we have been superseded already";
        return;
    }

    const VoteRequester::VoteRequestResult endResult = _voteRequester->getResult();

    if (endResult == VoteRequester::InsufficientVotes) {
        log() << "not running for primary, we received insufficient votes";
        return;
    } else if (endResult == VoteRequester::StaleTerm) {
        log() << "not running for primary, we have been superseded already";
        return;
    } else if (endResult != VoteRequester::SuccessfullyElected) {
        log() << "not running for primary, we received an unexpected problem";
        return;
    }

    log() << "dry election run succeeded, running for election";
    _topCoord->incrementTerm();
    // Secure our vote for ourself first
    _topCoord->voteForMyselfV1();

    _voteRequester.reset(new VoteRequester);
    StatusWith<ReplicationExecutor::EventHandle> nextPhaseEvh = _voteRequester->start(
        &_replExecutor,
        _rsConfig,
        _rsConfig.getMemberAt(_selfIndex).getId(),
        _topCoord->getTerm(),
        false,
        getMyLastOptime(),
        stdx::bind(&ReplicationCoordinatorImpl::_onVoteRequestComplete, this, originalTerm + 1));
    if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) {
        return;
    }
    fassert(28643, nextPhaseEvh.getStatus());
    lossGuard.dismiss();
}
void ReplicationCoordinatorImpl::_startVoteRequester(long long newTerm) {
    invariant(_voteRequester);
    invariant(!_electionWinnerDeclarer);
    LoseElectionGuardV1 lossGuard(this);

    _voteRequester.reset(new VoteRequester);
    StatusWith<ReplicationExecutor::EventHandle> nextPhaseEvh = _voteRequester->start(
        &_replExecutor,
        _rsConfig,
        _rsConfig.getMemberAt(_selfIndex).getId(),
        _topCoord->getTerm(),
        false,
        getMyLastOptime(),
        stdx::bind(&ReplicationCoordinatorImpl::_onVoteRequestComplete, this, newTerm));
    if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) {
        return;
    }
    fassert(28643, nextPhaseEvh.getStatus());
    lossGuard.dismiss();
}
void ReplicationCoordinatorImpl::_startElectSelfV1() {
    invariant(!_electionWinnerDeclarer);
    invariant(!_voteRequester);
    invariant(!_freshnessChecker);

    boost::unique_lock<boost::mutex> lk(_mutex);
    switch (_rsConfigState) {
        case kConfigSteady:
            break;
        case kConfigInitiating:
        case kConfigReconfiguring:
        case kConfigHBReconfiguring:
            LOG(2) << "Not standing for election; processing a configuration change";
            // Transition out of candidate role.
            _topCoord->processLoseElection();
            return;
        default:
            severe() << "Entered replica set election code while in illegal config state "
                     << int(_rsConfigState);
            fassertFailed(28641);
    }

    const StatusWith<ReplicationExecutor::EventHandle> finishEvh = _replExecutor.makeEvent();
    if (finishEvh.getStatus() == ErrorCodes::ShutdownInProgress) {
        return;
    }
    fassert(28642, finishEvh.getStatus());
    _electionFinishedEvent = finishEvh.getValue();
    LoseElectionGuardV1 lossGuard(_topCoord.get(),
                                  &_replExecutor,
                                  &_voteRequester,
                                  &_electionWinnerDeclarer,
                                  &_electionFinishedEvent);

    invariant(_rsConfig.getMemberAt(_selfIndex).isElectable());
    OpTime lastOpTimeApplied(_getMyLastOptime_inlock());

    if (lastOpTimeApplied == OpTime()) {
        log() << "not trying to elect self, "
                 "do not yet have a complete set of data from any point in time";
        return;
    }

    log() << "conducting a dry run election to see if we could be elected";
    _voteRequester.reset(new VoteRequester);

    // This is necessary because the voteRequester may call directly into winning an
    // election, if there are no other MaybeUp nodes. Winning an election attempts to lock
    // _mutex again.
    lk.unlock();

    long long term = _topCoord->getTerm();
    StatusWith<ReplicationExecutor::EventHandle> nextPhaseEvh = _voteRequester->start(
        &_replExecutor,
        _rsConfig,
        _rsConfig.getMemberAt(_selfIndex).getId(),
        _topCoord->getTerm(),
        true,  // dry run
        getMyLastOptime(),
        stdx::bind(&ReplicationCoordinatorImpl::_onDryRunComplete, this, term));
    if (nextPhaseEvh.getStatus() == ErrorCodes::ShutdownInProgress) {
        return;
    }
    fassert(28685, nextPhaseEvh.getStatus());
    lossGuard.dismiss();
}
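// Each election path above constructs a LoseElectionGuardV1 and only calls
// dismiss() once the next asynchronous phase has been scheduled successfully,
// so every early return abandons the candidacy automatically. A minimal,
// hypothetical sketch of that dismissable scope-guard idiom follows; the
// class name and members here are illustrative simplifications, and the real
// guard also resets the vote requester / winner declarer and signals the
// election-finished event.
class LoseElectionGuardSketch {
public:
    explicit LoseElectionGuardSketch(TopologyCoordinator* topCoord) : _topCoord(topCoord) {}

    ~LoseElectionGuardSketch() {
        if (!_dismissed) {
            // Any early return in the guarded scope lands here and
            // transitions the node out of the candidate role.
            _topCoord->processLoseElection();
        }
    }

    // Called once the next election phase is safely under way.
    void dismiss() {
        _dismissed = true;
    }

private:
    TopologyCoordinator* const _topCoord;
    bool _dismissed = false;
};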
/* tail an oplog. ok to return, will be re-called. */
void SyncTail::oplogApplication() {
    OpQueueBatcher batcher(this);

    OperationContextImpl txn;
    auto replCoord = ReplicationCoordinator::get(&txn);
    ApplyBatchFinalizer finalizer(replCoord);

    auto minValidBoundaries = getMinValid(&txn);
    OpTime originalEndOpTime(minValidBoundaries.end);
    OpTime lastWriteOpTime{replCoord->getMyLastOptime()};
    while (!inShutdown()) {
        OpQueue ops;

        do {
            if (BackgroundSync::get()->getInitialSyncRequestedFlag()) {
                // got a resync command
                return;
            }

            tryToGoLiveAsASecondary(&txn, replCoord, minValidBoundaries, lastWriteOpTime);

            // Blocks up to a second waiting for a batch to be ready to apply. If one doesn't
            // become ready in time, we'll loop again so we can do the above checks periodically.
            ops = batcher.getNextBatch(Seconds(1));
        } while (!inShutdown() && ops.empty());

        if (inShutdown())
            return;

        invariant(!ops.empty());

        const BSONObj lastOp = ops.back().raw;

        if (lastOp.isEmpty()) {
            // This means that the network thread has coalesced and we have processed all of its
            // data.
            invariant(ops.getDeque().size() == 1);
            if (replCoord->isWaitingForApplierToDrain()) {
                replCoord->signalDrainComplete(&txn);
            }
            continue;  // This wasn't a real op. Don't try to apply it.
        }

        handleSlaveDelay(lastOp);

        // Set minValid to the last OpTime that needs to be applied, in this batch or from the
        // (last) failed batch, whichever is larger.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before the update finishes.
        const OpTime start(getLastSetTimestamp(), OpTime::kUninitializedTerm);

        // Take the max of the first endOptime (if we recovered) and the end of our batch.
        const auto lastOpTime = fassertStatusOK(28773, OpTime::parseFromOplogEntry(lastOp));

        // Setting end to the max of originalEndOpTime and lastOpTime (the end of the batch)
        // ensures that we keep pushing out the point where we can become consistent
        // and allow reads. If we recover and end up doing smaller batches we must pass the
        // originalEndOpTime before we are good.
        //
        // For example:
        // batch apply, 20-40, end = 40
        // batch failure,
        // restart
        // batch apply, 20-25, end = max(25, 40) = 40
        // batch apply, 25-45, end = 45
        const OpTime end(std::max(originalEndOpTime, lastOpTime));

        // This write will not journal/checkpoint.
        setMinValid(&txn, {start, end});

        lastWriteOpTime = multiApply(&txn, ops);
        setNewTimestamp(lastWriteOpTime.getTimestamp());

        setMinValid(&txn, end, DurableRequirement::None);
        minValidBoundaries.start = {};
        minValidBoundaries.end = end;
        finalizer.record(lastWriteOpTime);
    }
}
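// The std::max() above is the entire recovery rule for the minValid "end"
// boundary: it may never move backwards across restarts. A hypothetical
// standalone helper (illustration only, not part of SyncTail) makes the
// worked example from the comments concrete:
OpTime computeMinValidEnd(const OpTime& originalEndOpTime, const OpTime& batchEnd) {
    // After a failed 20-40 batch and a restart, replaying a smaller 20-25
    // batch still yields end = max(25, 40) = 40; only once a batch reaches
    // past 40 (e.g. 25-45) does end advance, to 45.
    return std::max(originalEndOpTime, batchEnd);
}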
void ReplicationCoordinatorImpl::_handleHeartbeatResponse(
    const ReplicationExecutor::RemoteCommandCallbackData& cbData, int targetIndex) {
    // remove handle from queued heartbeats
    _untrackHeartbeatHandle(cbData.myHandle);

    // Parse and validate the response. At the end of this step, if responseStatus is OK then
    // hbResponse is valid.
    Status responseStatus = cbData.response.getStatus();
    if (responseStatus == ErrorCodes::CallbackCanceled) {
        return;
    }

    const HostAndPort& target = cbData.request.target;
    ReplSetHeartbeatResponse hbResponse;
    BSONObj resp;
    if (responseStatus.isOK()) {
        resp = cbData.response.getValue().data;
        responseStatus = hbResponse.initialize(resp);
    }
    const bool isUnauthorized = (responseStatus.code() == ErrorCodes::Unauthorized) ||
        (responseStatus.code() == ErrorCodes::AuthenticationFailed);
    const Date_t now = _replExecutor.now();
    const OpTime lastApplied = getMyLastOptime();  // Locks and unlocks _mutex.
    Milliseconds networkTime(0);
    StatusWith<ReplSetHeartbeatResponse> hbStatusResponse(hbResponse);

    if (responseStatus.isOK()) {
        networkTime = cbData.response.getValue().elapsedMillis;
    } else {
        log() << "Error in heartbeat request to " << target << "; " << responseStatus;
        if (!resp.isEmpty()) {
            LOG(3) << "heartbeat response: " << resp;
        }

        if (isUnauthorized) {
            networkTime = cbData.response.getValue().elapsedMillis;
        }
        hbStatusResponse = StatusWith<ReplSetHeartbeatResponse>(responseStatus);
    }

    HeartbeatResponseAction action =
        _topCoord->processHeartbeatResponse(now, networkTime, target, hbStatusResponse, lastApplied);

    if (action.getAction() == HeartbeatResponseAction::NoAction && hbStatusResponse.isOK() &&
        hbStatusResponse.getValue().hasOpTime() && targetIndex >= 0 &&
        hbStatusResponse.getValue().hasState() &&
        hbStatusResponse.getValue().getState() != MemberState::RS_PRIMARY) {
        boost::unique_lock<boost::mutex> lk(_mutex);
        if (hbStatusResponse.getValue().getVersion() == _rsConfig.getConfigVersion()) {
            _updateOpTimeFromHeartbeat_inlock(targetIndex,
                                              hbStatusResponse.getValue().getOpTime());
            // TODO: Enable with Data Replicator
            // lk.unlock();
            // _dr.slavesHaveProgressed();
        }
    }

    _signalStepDownWaiters();

    _scheduleHeartbeatToTarget(
        target, targetIndex, std::max(now, action.getNextHeartbeatStartDate()));

    _handleHeartbeatResponseAction(action, hbStatusResponse);
}
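// The std::max(now, ...) clamp above keeps the next heartbeat from being
// scheduled in the past when the suggested start date returned by the
// topology coordinator has already elapsed (e.g. after a slow response).
// A hypothetical standalone form of that computation, for illustration:
Date_t nextHeartbeatStart(Date_t now, const HeartbeatResponseAction& action) {
    return std::max(now, action.getNextHeartbeatStartDate());
}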