/**
 * Walk the member list looking for a node that claims to be primary.
 * The list is meant to cover members OTHER THAN US (presumably rs->head()
 * does not include this node -- confirm against the list owner).
 *
 * A member qualifies only when its state reports primary AND its heartbeat
 * info reports it as up. When exactly one qualifies it is handed to
 * noteARemoteIsPrimary() before being returned.
 *
 * @return the qualifying member, or 0 if no member qualifies.
 * @throws the string literal "twomasters" (a const char*) as soon as a second
 *         qualifying member is encountered.
 */
const Member * Manager::findOtherPrimary() {
    Member *found = 0;
    for( Member *cur = rs->head(); cur; cur = cur->next() ) {
        const bool claimsPrimary = cur->state().primary() && cur->hbinfo().up();
        if( !claimsPrimary )
            continue;

        // our polling is asynchronous, so seeing two at once is often ok.
        if( found )
            throw "twomasters";
        found = cur;
    }

    if( found )
        noteARemoteIsPrimary(found);
    return found;
}
/**
 * Walk the member list looking for a node that claims to be primary.
 * Only members OTHER THAN US are expected in the list; DEV builds verify
 * that no visited member is rs->_self.
 *
 * A member qualifies only when its state reports primary AND its heartbeat
 * info reports it as up. When exactly one qualifies it is handed to
 * noteARemoteIsPrimary() before being returned.
 *
 * @param two  out: reset to false on entry; set to true when a second
 *             qualifying member is seen (the scan stops at that point).
 * @return the single qualifying member; 0 if none qualifies or if `two`
 *         was set.
 */
const Member * Manager::findOtherPrimary(bool& two) {
    two = false;

    Member *found = 0;
    for( Member *cur = rs->head(); cur; cur = cur->next() ) {
        DEV verify( cur != rs->_self );

        if( !( cur->state().primary() && cur->hbinfo().up() ) )
            continue;

        if( found ) {
            // a second claimant: report it and give up without noting anyone
            two = true;
            return 0;
        }
        found = cur;
    }

    if( found )
        noteARemoteIsPrimary(found);
    return found;
}
/** called as the health threads get new results */ void Manager::msgCheckNewState() { { theReplSet->assertValid(); rs->assertValid(); RSBase::lock lk(rs); if( busyWithElectSelf ) return; checkElectableSet(); checkAuth(); const Member *p = rs->box.getPrimary(); if( p && p != rs->_self ) { if( !p->hbinfo().up() || !p->hbinfo().hbstate.primary() ) { p = 0; rs->box.setOtherPrimary(0); } } const Member *p2; { bool two; p2 = findOtherPrimary(two); if( two ) { /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */ log() << "replSet info two primaries (transiently)" << rsLog; return; } } if( p2 ) { noteARemoteIsPrimary(p2); return; } /* didn't find anyone who wants to be primary */ if( p ) { /* we are already primary */ if( p != rs->_self ) { rs->sethbmsg("error p != rs->self in checkNewState"); log() << "replSet " << p->fullName() << rsLog; log() << "replSet " << rs->_self->fullName() << rsLog; return; } if( rs->elect.shouldRelinquish() ) { log() << "can't see a majority of the set, relinquishing primary" << rsLog; rs->relinquish(); } return; } if( !rs->iAmPotentiallyHot() ) { // if not we never try to be primary OCCASIONALLY log() << "replSet I don't see a primary and I can't elect myself" << endl; return; } /* no one seems to be primary. shall we try to elect ourself? */ if( !rs->elect.aMajoritySeemsToBeUp() ) { static time_t last; static int n; int ll = 0; if( ++n > 5 ) ll++; if( last + 60 > time(0 ) ) ll++; log(ll) << "replSet can't see a majority, will not try to elect self" << rsLog; last = time(0); return; } if( !rs->iAmElectable() ) { return; } busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one. } try { rs->elect.electSelf(); } catch(RetryAfterSleepException&) { /* we want to process new inbounds before trying this again. so we just put a checkNewstate in the queue for eval later. */ requeue(); } catch(...) 
{ log() << "replSet error unexpected assertion in rs manager" << rsLog; } busyWithElectSelf = false; }
/** called as the health threads get new results */ void Manager::msgCheckNewState() { { theReplSet->assertValid(); rs->assertValid(); RSBase::lock lk(rs); if( busyWithElectSelf ) return; checkElectableSet(); const Member *p = rs->box.getPrimary(); if( p && p != rs->_self ) { if( !p->hbinfo().up() || !p->hbinfo().hbstate.primary() ) { p = 0; rs->box.setOtherPrimary(0); } } const Member *p2; { bool two; p2 = findOtherPrimary(two); if( two ) { /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */ log() << "replSet info two primaries (transiently)" << rsLog; return; } } if( p2 ) { /* someone else thinks they are primary. */ if( p == p2 ) { // we thought the same; all set. return; } if( p == 0 ) { noteARemoteIsPrimary(p2); return; } // todo xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx if( p != rs->_self ) { // switch primary from oldremotep->newremotep2 noteARemoteIsPrimary(p2); return; } /* we thought we were primary, yet now someone else thinks they are. */ if( !rs->elect.aMajoritySeemsToBeUp() ) { /* we can't see a majority. so the other node is probably the right choice. */ noteARemoteIsPrimary(p2); return; } /* ignore for now, keep thinking we are master. this could just be timing (we poll every couple seconds) or could indicate a problem? if it happens consistently for a duration of time we should alert the sysadmin. */ return; } /* didn't find anyone who wants to be primary */ if( p ) { /* we are already primary */ if( p != rs->_self ) { rs->sethbmsg("error p != rs->self in checkNewState"); log() << "replSet " << p->fullName() << rsLog; log() << "replSet " << rs->_self->fullName() << rsLog; return; } if( rs->elect.shouldRelinquish() ) { log() << "can't see a majority of the set, relinquishing primary" << rsLog; rs->relinquish(); } return; } if( !rs->iAmPotentiallyHot() ) // if not we never try to be primary return; /* no one seems to be primary. shall we try to elect ourself? 
*/ if( !rs->elect.aMajoritySeemsToBeUp() ) { static time_t last; static int n; int ll = 0; if( ++n > 5 ) ll++; if( last + 60 > time(0 ) ) ll++; log(ll) << "replSet can't see a majority, will not try to elect self" << rsLog; last = time(0); return; } if( !rs->iAmElectable() ) { return; } busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one. } try { rs->elect.electSelf(); } catch(RetryAfterSleepException&) { /* we want to process new inbounds before trying this again. so we just put a checkNewstate in the queue for eval later. */ requeue(); } catch(...) { log() << "replSet error unexpected assertion in rs manager" << rsLog; } busyWithElectSelf = false; }
/** called as the health threads get new results */ void Manager::msgCheckNewState() { bool authIssue = false; { theReplSet->assertValid(); rs->assertValid(); boost::unique_lock<boost::mutex> lock(rs->stateChangeMutex); { RSBase::lock lk(rs); if( busyWithElectSelf ) return; checkElectableSet(); authIssue = checkAuth(); if (!authIssue) { const Member *p = rs->box.getPrimary(); if( p && p != rs->_self ) { if( !p->hbinfo().up() || !p->hbinfo().hbstate.primary() ) { p = 0; rs->box.setOtherPrimary(0); } } const Member *p2; { bool two; p2 = findOtherPrimary(two); if( two ) { /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */ log() << "replSet info two primaries (transiently)" << rsLog; return; } } if( p2 ) { noteARemoteIsPrimary(p2); return; } /* didn't find anyone who wants to be primary */ if( p ) { /* we are already primary */ if( p != rs->_self ) { rs->sethbmsg("error p != rs->self in checkNewState"); log() << "replSet " << p->fullName() << rsLog; log() << "replSet " << rs->_self->fullName() << rsLog; return; } if( rs->elect.shouldRelinquish() ) { log() << "can't see a majority of the set, relinquishing primary" << rsLog; rs->relinquish(); } if (GTID::cmp(theReplSet->gtidManager->getLiveState(), theReplSet->lastOtherGTID()) < 0) { // this can happen if we transiently have two primaries, which can // happen if a primary loses contact with the replica set, // triggering an election, but it connects back before it has a // chance to step down log() << "we see a secondary that is ahead, relinquishing primary" << rsLog; rs->relinquish(); } return; } if( !rs->iAmPotentiallyHot() ) { // if not we never try to be primary OCCASIONALLY log() << "replSet I don't see a primary and I can't elect myself" << endl; return; } /* no one seems to be primary. shall we try to elect ourself? 
*/ if( !rs->elect.aMajoritySeemsToBeUp() ) { static time_t last; static int n; int ll = 0; if( ++n > 5 ) ll++; if( last + 60 > time(0 ) ) ll++; LOG(ll) << "replSet can't see a majority, will not try to elect self" << rsLog; last = time(0); return; } if( !rs->iAmElectable() ) { return; } busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one. } } // blockSync outside of rslock // can't hold rslock because we may try to stop the opsync thread if (authIssue) { { RSBase::lock lk(rs); if (rs->box.getPrimary() == rs->_self) { log() << "auth problems, relinquishing primary" << rsLog; rs->relinquish(); } } rs->blockSync(true); return; } } try { rs->elect.electSelf(); } catch(RetryAfterSleepException&) { /* we want to process new inbounds before trying this again. so we just put a checkNewstate in the queue for eval later. */ requeue(); } catch(...) { log() << "replSet error unexpected assertion in rs manager" << rsLog; } busyWithElectSelf = false; }