Example #1
0
        void doWork() {
            if ( !theReplSet ) {
                LOG(2) << "replSet not initialized yet, skipping health poll this round" << rsLog;
                return;
            }

            HeartbeatInfo mem = m;
            HeartbeatInfo old = mem;
            bool needsNewStateChecked = false;
            try {
                BSONObj info;
                int theirConfigVersion = -10000;

                bool ok = _requestHeartbeat(mem, info, theirConfigVersion);

                // weight new ping with old pings
                // on the first ping, just use the ping value
                if (old.ping != 0) {
                    mem.ping = (unsigned int)((old.ping * .8) + (mem.ping * .2));
                }

                if( ok ) {
                    up(info, mem, &needsNewStateChecked);
                }
                else if (!info["errmsg"].eoo() && info["errmsg"].str() == "unauthorized") {
                    authIssue(mem);
                }
                else {
                    down(mem, info.getStringField("errmsg"));
                }
            }
            catch (const DBException& e) {
                log() << "replSet health poll task caught a DBException: " << e.what();
                down(mem, e.what());
            }
            catch (const std::exception& e) {
                log() << "replSet health poll task caught an exception: " << e.what();
                down(mem, e.what());
            }
            m = mem;

            theReplSet->mgr->send( boost::bind(&ReplSet::msgUpdateHBInfo, theReplSet, mem) );

            static time_t last = 0;
            time_t now = time(0);
            bool changed = mem.changed(old);
            if( changed ) {
                if( old.hbstate != mem.hbstate )
                    log() << "replSet member " << h.toString() << " is now in state " << mem.hbstate.toString() << rsLog;
            }
            if( needsNewStateChecked || changed || now-last>4 ) {
                last = now;
                theReplSet->mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
            }
        }
Example #2
0
        void up(const BSONObj& info, HeartbeatInfo& mem) {
            HeartbeatInfo::numPings++;
            mem.authIssue = false;

            if( mem.upSince == 0 ) {
                log() << "replSet member " << h.toString() << " is up" << rsLog;
                mem.upSince = mem.lastHeartbeat;
            }
            mem.health = 1.0;
            mem.lastHeartbeatMsg = info["hbmsg"].String();
            if (info.hasElement("syncingTo")) {
                mem.syncingTo = info["syncingTo"].String();
            }

            if( info.hasElement("opTime") )
                mem.opTime = info["opTime"].Date();

            // see if this member is in the electable set
            if( info["e"].eoo() ) {
                // for backwards compatibility
                const Member *member = theReplSet->findById(mem.id());
                if (member && member->config().potentiallyHot()) {
                    theReplSet->addToElectable(mem.id());
                }
                else {
                    theReplSet->rmFromElectable(mem.id());
                }
            }
            // add this server to the electable set if it is within 10
            // seconds of the latest optime we know of
            else if( info["e"].trueValue() &&
                     mem.opTime >= theReplSet->lastOpTimeWritten.getSecs() - 10) {
                unsigned lastOp = theReplSet->lastOtherOpTime().getSecs();
                if (lastOp > 0 && mem.opTime >= lastOp - 10) {
                    theReplSet->addToElectable(mem.id());
                }
            }
            else {
                theReplSet->rmFromElectable(mem.id());
            }

            be cfg = info["config"];
            if( cfg.ok() ) {
                // received a new config
                boost::function<void()> f =
                    boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
                theReplSet->mgr->send(f);
            }
        }
Example #3
0
        /**
         * Marks the polled member as down after a failed heartbeat.
         *
         * @param mem heartbeat state to update in place
         * @param msg failure description; logged and stored as the last
         *            heartbeat message
         *
         * Does nothing (beyond logging) when the member itself heartbeated us
         * within the last two seconds.
         */
        void down(HeartbeatInfo& mem, string msg) {
            // The member reached us very recently, so our failed request is not
            // enough to declare it down. If it is already down it stays down:
            // an inbound heartbeat tells us nothing beyond "not down", so no
            // heartbeat info is updated either.
            if (time(0) <= m.lastHeartbeatRecv+2) {
                log() << "replset info " << h.toString()
                      << " just heartbeated us, but our heartbeat failed: " << msg
                      << ", not changing state" << rsLog;
                return;
            }

            mem.authIssue = false;
            mem.health = 0.0;
            mem.ping = 0;

            // Record the transition only once: either the member was up, or it
            // has never been marked down before.
            const bool recordTransition = mem.upSince || mem.downSince == 0;
            if (recordTransition) {
                mem.upSince = 0;
                mem.downSince = jsTime();
                mem.hbstate = MemberState::RS_DOWN;
                log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog;
            }

            mem.lastHeartbeatMsg = msg;
            theReplSet->rmFromElectable(mem.id());
        }
Example #4
0
        /**
         * Marks the polled member as down after a failed heartbeat.
         *
         * @param mem heartbeat state to update in place
         * @param msg failure description; logged and stored as the last
         *            heartbeat message
         */
        void down(HeartbeatInfo& mem, string msg) {
            // Historical guard, intentionally disabled. It skipped the state
            // change when the member had heartbeated us within the last two
            // seconds (leaving an already-down member down, since an inbound
            // heartbeat carries no other info).
            //
            // This code is essentially a no-op in vanilla MongoDB thanks to
            // SERVER-11280. I (Zardosht) am reluctant to fix it because
            // I don't know what impact it may have on elections and failover.
            // For now, commenting out because we are moving lastHeartbeatRecv
            // out of HeartbeatInfo and into Member
            /*
            if (m.lastHeartbeatRecv+2 >= time(0)) {
                log() << "replset info " << h.toString()
                      << " just heartbeated us, but our heartbeat failed: " << msg
                      << ", not changing state" << rsLog;
                // we don't update any of the heartbeat info, though, since we didn't get any info
                // other than "not down" from having it heartbeat us
                return;
            }
            */

            mem.authIssue = false;
            mem.health = 0.0;
            mem.ping = 0;

            // Record the transition only once: either the member was up, or it
            // has never been marked down before.
            const bool recordTransition = mem.upSince || mem.downSince == 0;
            if (recordTransition) {
                mem.upSince = 0;
                mem.downSince = jsTime();
                mem.hbstate = MemberState::RS_DOWN;
                log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog;
            }

            mem.lastHeartbeatMsg = msg;
            theReplSet->rmFromElectable(mem.id());
        }
Example #5
0
        /**
         * Flags the polled member as having an authentication problem.
         *
         * @param mem heartbeat state to update in place
         */
        void authIssue(HeartbeatInfo& mem) {
            // Health 0 keeps this member from counting towards a majority.
            mem.health = 0.0;
            mem.hbstate = MemberState::RS_UNKNOWN;
            mem.authIssue = true;

            theReplSet->rmFromElectable(mem.id());
        }
Example #6
0
 /**
  * Applies a freshly polled heartbeat to the matching member.
  *
  * Walks the member list and calls updateFromLastPoll() on the first member
  * whose id equals h.id(); does nothing when no member matches.
  */
 void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) {
     Member *candidate = _members.head();
     while (candidate) {
         const bool sameId = static_cast<int>(candidate->id()) == h.id();
         if (sameId) {
             candidate->_hbinfo.updateFromLastPoll(h);
             return;
         }
         candidate = candidate->next();
     }
 }
Example #7
0
File: rs.cpp Project: sivy/mongo
 /**
  * Stores a freshly polled heartbeat on the matching member.
  *
  * Walks the member list and overwrites _hbinfo of the first member whose id
  * equals h.id(); does nothing when no member matches.
  */
 void ReplSetImpl::msgUpdateHBInfo(HeartbeatInfo h) {
     Member *candidate = _members.head();
     while( candidate ) {
         if( candidate->id() == h.id() ) {
             candidate->_hbinfo = h;
             return;
         }
         candidate = candidate->next();
     }
 }
Example #8
0
        void doWork() { 
            HeartbeatInfo mem = m;
            HeartbeatInfo old = mem;
            try { 
                BSONObj info;
                int theirConfigVersion = -10000;
                bool ok = requestHeartbeat(theReplSet->name(), h.toString(), info, theReplSet->config().version, theirConfigVersion);
                mem.lastHeartbeat = time(0); // we set this on any response - we don't get this far if couldn't connect because exception is thrown
                {
                    be state = info["state"];
                    if( state.ok() )
                        mem.hbstate = (MemberState) state.Int();
                }
                if( ok ) {
                    if( mem.upSince == 0 ) {
                        log() << "replSet info " << h.toString() << " is now up" << rsLog;
                        mem.upSince = mem.lastHeartbeat;
                    }
                    mem.health = 1.0;
                    mem.lastHeartbeatMsg = "";

                    be cfg = info["config"];
                    if( cfg.ok() ) {
                        // received a new config
                        boost::function<void()> f = 
                            boost::bind(&ReplSet::Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
                        theReplSet->mgr->send(f);
                    }
                }
                else { 
                    down(mem, info.getStringField("errmsg"));
                }
            }
            catch(...) { 
                down(mem, "connect/transport error");             
            }
            m = mem;
            theReplSet->mgr->send( boost::bind(&ReplSet::msgUpdateHBInfo, theReplSet, mem) );

            static time_t last = 0;
            time_t now = time(0);
            if( mem.changed(old) || now-last>4 ) {
                last = now;
                theReplSet->mgr->send( boost::bind(&ReplSet::Manager::msgCheckNewState, theReplSet->mgr) );
            }
        }
Example #9
0
        /**
         * Flags the polled member as having an authentication problem.
         *
         * @param mem heartbeat state to update in place
         *
         * Logs only on the transition into the auth-issue state, so repeated
         * failures do not log again.
         */
        void authIssue(HeartbeatInfo& mem) {
            const bool alreadyFlagged = mem.authIssue;
            if (!alreadyFlagged) {
                log() << "replSet member " << h.toString() << " has an auth issue" << rsLog;
            }

            // Health 0 keeps this member from counting towards a majority.
            mem.health = 0.0;
            mem.hbstate = MemberState::RS_UNKNOWN;
            mem.authIssue = true;

            theReplSet->rmFromElectable(mem.id());
        }
Example #10
0
 /**
  * Marks the polled member as down after a failed heartbeat.
  *
  * @param mem heartbeat state to update in place
  * @param msg failure description; logged and stored as the last heartbeat
  *            message
  */
 void down(HeartbeatInfo& mem, string msg) {
     mem.health = 0.0;

     // Record the transition only once: either the member was up, or it has
     // never been marked down before.
     const bool recordTransition = mem.upSince || mem.downSince == 0;
     if (recordTransition) {
         mem.upSince = 0;
         mem.downSince = jsTime();
         mem.hbstate = MemberState::RS_DOWN;
         log() << "replSet info " << h.toString() << " is down (or slow to respond): " << msg << rsLog;
     }

     mem.lastHeartbeatMsg = msg;
     theReplSet->rmFromElectable(mem.id());
 }
    /**
     * Update internal state with a heartbeat response, and run topology checks.
     *
     * @param now     current time; compared against the step-down deadline and
     *                used to throttle the "can't see a majority" log line
     * @param newInfo heartbeat data from the last poll of one remote member
     *
     * Steps, in order:
     *   1. copy newInfo into the matching entry of _otherMembers;
     *   2. return immediately if an election attempt is already in progress;
     *   3. refresh our own membership in _electableSet;
     *   4. check whether the primary should step down for a higher-priority
     *      member (the actual step-down is not scheduled yet: see XXX notes);
     *   5. block syncing and go RECOVERING if no member is reachable and at
     *      least one failed authentication;
     *   6. forget a remote primary that is no longer up or no longer primary;
     *   7. adopt a remote primary found in the heartbeat data;
     *   8. if we are primary, check that we still see a majority;
     *   9. otherwise decide whether we qualify to stand for election.
     *
     * Several actions (relinquish, remote stepdown, election) are still
     * placeholders marked with XXX and only log or toggle flags.
     */
    void TopologyCoordinatorImpl::updateHeartbeatInfo(Date_t now, const HeartbeatInfo& newInfo) {

        // Fill in the new heartbeat data for the appropriate member
        for (Member *m = _otherMembers.head(); m; m=m->next()) {
            if (m->id() == newInfo.id()) {
                m->get_hbinfo().updateFromLastPoll(newInfo);
                break;
            }
        }

        // Don't bother to make any changes if we are an election candidate
        if (_busyWithElectSelf) return;

        // ex-checkelectableset begins here
        unsigned int latestOp = _latestKnownOpTime().getSecs();
        
        // make sure the electable set is up-to-date
        if (_aMajoritySeemsToBeUp()
            && !_currentConfig.self->arbiterOnly    // not an arbiter
            && (_currentConfig.self->priority > 0)  // not priority 0
            && (_stepDownUntil <= now)              // stepDown timer has expired
            && (_memberState == MemberState::RS_SECONDARY)
            // we are within 10 seconds of primary
            && (latestOp == 0 || _lastApplied.getSecs() >= latestOp - 10)) {
            _electableSet.insert(_currentConfig.self->_id);
        }
        else {
            _electableSet.erase(_currentConfig.self->_id);
        }

        // check if we should ask the primary (possibly ourselves) to step down
        const Member* highestPriority = _getHighestPriorityElectable();
        const Member* primary = _currentPrimary;
        
        if (primary && highestPriority &&
            highestPriority->config().priority > primary->config().priority &&
            // if we're stepping down to allow another member to become primary, we
            // better have another member (latestOp), and it should be up-to-date
            latestOp != 0 && highestPriority->hbinfo().opTime.getSecs() >= latestOp - 10) {
            log() << "stepping down " << primary->fullName() << " (priority " <<
                primary->config().priority << "), " << highestPriority->fullName() <<
                " is priority " << highestPriority->config().priority << " and " <<
                (latestOp - highestPriority->hbinfo().opTime.getSecs()) << " seconds behind";

            // Are we primary?
            if (primary->h().isSelf()) {
                // replSetStepDown tries to acquire the same lock
                // msgCheckNewState takes, so we can't call replSetStepDown on
                // ourselves.
                // XXX Eric: schedule relinquish
                //rs->relinquish();
            }
            else {
                // We are not primary.  Step down the remote node.
                // NOTE: the command object is built but not sent yet; the
                // send path below is still commented out.
                BSONObj cmd = BSON( "replSetStepDown" << 1 );
/*                ScopedConn conn(primary->fullName());
                BSONObj result;
                // XXX Eric: schedule stepdown command

                try {
                    if (!conn.runCommand("admin", cmd, result, 0)) {
                        log() << "stepping down " << primary->fullName()
                              << " failed: " << result << endl;
                    }
                }
                catch (DBException &e) {
                    log() << "stepping down " << primary->fullName() << " threw exception: "
                          << e.toString() << endl;
                }

*/
            }
        }


        // ex-checkauth begins here
        {
            int down = 0, authIssue = 0, total = 0;

            for( Member *m = _otherMembers.head(); m; m=m->next() ) {
                total++;

                // all authIssue servers will also be not up
                if (!m->hbinfo().up()) {
                    down++;
                    if (m->hbinfo().authIssue) {
                        authIssue++;
                    }
                }
            }

            // if all nodes are down or failed auth AND at least one failed
            // auth, go into recovering.  If all nodes are down, stay a
            // secondary.
            if (authIssue > 0 && down == total) {
                log() << "replset error could not reach/authenticate against any members";

                if (_currentPrimary == _self) {
                    log() << "auth problems, relinquishing primary" << rsLog;
                    // XXX Eric: schedule relinquish
                    //rs->relinquish();
                }

                _blockSync = true;
                // syncing is how we get into SECONDARY state, so we'll be stuck in
                // RECOVERING until we unblock
                _changeMemberState(MemberState::RS_RECOVERING);
            }
            else {
                _blockSync = false;
            }
        }

        // If a remote is primary, check that it is still up.
        if (_currentPrimary && _currentPrimary->id() != _self->id()) {
            if (!_currentPrimary->hbinfo().up() || 
                !_currentPrimary->hbinfo().hbstate.primary()) {
                _currentPrimary = NULL;
            }
        }

        // Scan the member list's heartbeat data for who is primary, and update ourselves if it's
        // not what _currentPrimary is.
        {
            const Member* remotePrimary(NULL);
            Member* m = _otherMembers.head();
            while (m) {
                DEV verify( m != _self );
                if( m->state().primary() && m->hbinfo().up() ) {
                    if( remotePrimary ) {
                        /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */
                        log() << "replSet info two primaries (transiently)" << rsLog;
                        return;
                    }
                    remotePrimary = m;
                }
                m = m->next();
            }

            if (remotePrimary) {
                // If it's the same as last time, don't do anything further.
                if (_currentPrimary == remotePrimary) {
                    return;
                }
                // Clear last heartbeat message on ourselves (why?)
                _self->lhb() = "";

                // insanity: this is what actually puts arbiters into ARBITER state
                if (_currentConfig.self->arbiterOnly) {
                    _changeMemberState(MemberState::RS_ARBITER);
                    return;
                }

                // If we are also primary, this is a problem.  Determine who should step down.
                if (_memberState == MemberState::RS_PRIMARY) {
                    OpTime remoteElectionTime = remotePrimary->hbinfo().electionTime;
                    log() << "replset: another primary seen with election time " 
                          << remoteElectionTime; 
                    // Step down whoever has the older election time.
                    if (remoteElectionTime > _electionTime) {
                        log() << "stepping down; another primary was elected more recently";
                        // XXX Eric: schedule a relinquish
                        //rs->relinquish();
                        // after completion, set currentprimary to remotePrimary.
                    }
                    else {
                        // else, stick around
                        log() << "another PRIMARY detected but it should step down"
                            " since it was elected earlier than me";
                        return;
                    }
                }

                _currentPrimary = remotePrimary;
                return;
            }
            /* didn't find anyone who is currently primary */
        }

        // If we are primary, check if we can still see majority of the set;
        // stepdown if we can't.
        if (_currentPrimary) {
            /* we must be primary */
            fassert(18505, _currentPrimary == _self);

            if (_shouldRelinquish()) {
                log() << "can't see a majority of the set, relinquishing primary" << rsLog;
                // XXX Eric: schedule a relinquish
                //rs->relinquish();
            }

            return;
        }

        // At this point, there is no primary anywhere.  Check to see if we should become an
        // election candidate.

        // If we can't elect ourselves due to config, can't become a candidate.
        // The predicate below lists what makes us eligible; we bail out when
        // it does NOT hold. (The check used to be un-negated, so it returned
        // for exactly the eligible nodes and let ineligible ones continue.)
        const bool electableByConfig =
            !_currentConfig.self->arbiterOnly       // not an arbiter
            && (_currentConfig.self->priority > 0)  // not priority 0
            && (_stepDownUntil <= now)              // stepDown timer has expired
            && (_memberState == MemberState::RS_SECONDARY);
        if (!electableByConfig) {
            OCCASIONALLY log() << "replSet I don't see a primary and I can't elect myself";
            return;
        }

        // If we can't see a majority, can't become a candidate.
        if (!_aMajoritySeemsToBeUp()) {
            // Function-local statics: shared across calls, used only to pick
            // the log level for the message below.
            static Date_t last;
            static int n = 0;
            int ll = 0;
            if( ++n > 5 ) ll++;
            if( last + 60 > now ) ll++;
            LOG(ll) << "replSet can't see a majority, will not try to elect self" << rsLog;
            last = now;
            return;
        }

        // If we can't elect ourselves due to the current electable set;
        // we are in the set if we are within 10 seconds of the latest known op (via heartbeats)
        if (!(_electableSet.find(_self->id()) != _electableSet.end())) {
            // we are too far behind to become primary
            return;
        }

        // All checks passed, become a candidate and start election proceedings.

        // don't try to do further elections & such while we are already working on one.
        _busyWithElectSelf = true; 

    // XXX: schedule an election
/*
        try {
            rs->elect.electSelf();
        }
        catch(RetryAfterSleepException&) {
            // we want to process new inbounds before trying this again.  so we just put a checkNewstate in the queue for eval later. 
            requeue();
        }
        catch(...) {
            log() << "replSet error unexpected assertion in rs manager" << rsLog;
        }
        
    }


*/
        _busyWithElectSelf = false;
    }
Example #12
0
        void doWork() {
            if ( !theReplSet ) {
                log(2) << "theReplSet not initialized yet, skipping health poll this round" << rsLog;
                return;
            }

            HeartbeatInfo mem = m;
            HeartbeatInfo old = mem;
            try {
                BSONObj info;
                int theirConfigVersion = -10000;

                Timer timer;

                bool ok = requestHeartbeat(theReplSet->name(), theReplSet->selfFullName(), h.toString(), info, theReplSet->config().version, theirConfigVersion);
                
                mem.ping = (unsigned int)timer.micros();

                time_t before = timer.startTime() / 1000000;
                // we set this on any response - we don't get this far if
                // couldn't connect because exception is thrown
                time_t after = mem.lastHeartbeat = before + (mem.ping / 1000000);
                
                if ( info["time"].isNumber() ) {
                    long long t = info["time"].numberLong();
                    if( t > after )
                        mem.skew = (int) (t - after);
                    else if( t < before )
                        mem.skew = (int) (t - before); // negative
                }
                else {
                    // it won't be there if remote hasn't initialized yet
                    if( info.hasElement("time") )
                        warning() << "heatbeat.time isn't a number: " << info << endl;
                    mem.skew = INT_MIN;
                }

                {
                    be state = info["state"];
                    if( state.ok() )
                        mem.hbstate = MemberState(state.Int());
                }
                if( ok ) {
                    if( mem.upSince == 0 ) {
                        log() << "replSet info " << h.toString() << " is up" << rsLog;
                        mem.upSince = mem.lastHeartbeat;
                    }
                    mem.health = 1.0;
                    mem.lastHeartbeatMsg = info["hbmsg"].String();
                    if( info.hasElement("opTime") )
                        mem.opTime = info["opTime"].Date();

                    be cfg = info["config"];
                    if( cfg.ok() ) {
                        // received a new config
                        boost::function<void()> f =
                            boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
                        theReplSet->mgr->send(f);
                    }
                }
                else {
                    down(mem, info.getStringField("errmsg"));
                }
            }
            catch(DBException& e) {
                down(mem, e.what());
            }
            catch(...) {
                down(mem, "something unusual went wrong");
            }
            m = mem;

            theReplSet->mgr->send( boost::bind(&ReplSet::msgUpdateHBInfo, theReplSet, mem) );

            static time_t last = 0;
            time_t now = time(0);
            bool changed = mem.changed(old);
            if( changed ) {
                if( old.hbstate != mem.hbstate )
                    log() << "replSet member " << h.toString() << ' ' << mem.hbstate.toString() << rsLog;
            }
            if( changed || now-last>4 ) {
                last = now;
                theReplSet->mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
            }
        }
Example #13
0
        /**
         * Records a successful heartbeat reply in `mem`.
         *
         * @param info                 heartbeat reply; reads "hbmsg" and the
         *                             optional fields handled below
         * @param mem                  heartbeat state to update in place
         * @param needsNewStateChecked out-parameter; written only when the
         *                             reply carries "hkp", with the result of
         *                             handleHighestKnownPrimaryOfMember()
         *
         * Also maintains the replica set's electable set for this member and
         * forwards a newly received config to the manager.
         */
        void up(const BSONObj& info, HeartbeatInfo& mem, bool* needsNewStateChecked) {
            HeartbeatInfo::numPings++;
            mem.authIssue = false;

            // upSince == 0 means the member was not considered up before now.
            if (mem.upSince == 0) {
                log() << "replSet member " << h.toString() << " is up" << rsLog;
                mem.upSince = mem.lastHeartbeat;
            }

            mem.health = 1.0;
            mem.lastHeartbeatMsg = info["hbmsg"].String();

            // Optional fields: each is copied only when present.
            if (info.hasElement("syncingTo")) {
                mem.syncingTo = info["syncingTo"].String();
            }
            if (info.hasElement("opTime")) {
                mem.opTime = info["opTime"].Date();
            }
            if (info.hasElement("GTID")) {
                mem.gtid = getGTIDFromBSON("GTID", info);
            }
            if (info.hasElement("lastUnappliedGTID")) {
                mem.lastUnappliedGTID = getGTIDFromBSON("lastUnappliedGTID", info);
            }
            if (info.hasElement("minLiveGTID")) {
                mem.minLiveGTID = getGTIDFromBSON("minLiveGTID", info);
            }
            if (info.hasElement("minUnappliedGTID")) {
                mem.minUnappliedGTID = getGTIDFromBSON("minUnappliedGTID", info);
            }

            // Fields that fall back to 0 when the reply omits them.
            if (!info.hasElement("oplogVersion")) {
                mem.oplogVersion = 0;
            }
            else {
                mem.oplogVersion = info["oplogVersion"].numberLong();
            }

            // "hkp" = highest known primary.
            if (!info.hasElement("hkp")) {
                mem.highestKnownPrimaryInSet = 0;
            }
            else {
                mem.highestKnownPrimaryInSet = info["hkp"].numberLong();
                // If the highest known primary across the replica set has
                // changed, tell the caller so that Manager::msgCheckNewState
                // eventually gets called.
                *needsNewStateChecked = theReplSet->handleHighestKnownPrimaryOfMember(mem.highestKnownPrimaryInSet);
            }

            // Decide whether this member belongs in the electable set.
            be electableFlag = info["e"];
            if (electableFlag.eoo()) {
                // Reply carries no "e" field (backwards compatibility): fall
                // back to what the member's config says.
                const Member *member = theReplSet->findById(mem.id());
                const bool hot = member && member->config().potentiallyHot();
                if (hot) {
                    theReplSet->addToElectable(mem.id());
                }
                else {
                    theReplSet->rmFromElectable(mem.id());
                }
            }
            else if (electableFlag.trueValue() &&
                     mem.opTime + 10000 >= (theReplSet->gtidManager ? theReplSet->gtidManager->getCurrTimestamp() : 0))
            {
                // Add the member only if it is also within the 10000-unit
                // window of the latest optime seen on other members; otherwise
                // leave the set untouched.
                // NOTE(review): lastOtherOpTime() is stored in an `unsigned`;
                // if it returns a wider timestamp this narrows it - confirm.
                unsigned lastOp = theReplSet->lastOtherOpTime();
                if (lastOp > 0 && mem.opTime + 10000 >= lastOp) {
                    theReplSet->addToElectable(mem.id());
                }
            }
            else {
                theReplSet->rmFromElectable(mem.id());
            }

            // A "config" element means the remote sent us a new config.
            be newConfig = info["config"];
            if (newConfig.ok()) {
                boost::function<void()> deliverConfig =
                    boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, newConfig.Obj().copy());
                theReplSet->mgr->send(deliverConfig);
            }
        }
Example #14
0
        void doWork() {
            if ( !theReplSet ) {
                LOG(2) << "replSet not initialized yet, skipping health poll this round" << rsLog;
                return;
            }

            HeartbeatInfo mem = m;
            HeartbeatInfo old = mem;
            try {
                BSONObj info;
                int theirConfigVersion = -10000;

                Timer timer;

                bool ok = requestHeartbeat(theReplSet->name(), theReplSet->selfFullName(), h.toString(), info, theReplSet->config().version, theirConfigVersion);

                mem.ping = (unsigned int)timer.millis();

                time_t before = timer.startTime() / 1000000;
                // we set this on any response - we don't get this far if
                // couldn't connect because exception is thrown
                time_t after = mem.lastHeartbeat = before + (mem.ping / 1000);

                // weight new ping with old pings
                // on the first ping, just use the ping value
                if (old.ping != 0) {
                    mem.ping = (unsigned int)((old.ping * .8) + (mem.ping * .2));
                }

                if ( info["time"].isNumber() ) {
                    long long t = info["time"].numberLong();
                    if( t > after )
                        mem.skew = (int) (t - after);
                    else if( t < before )
                        mem.skew = (int) (t - before); // negative
                }
                else {
                    // it won't be there if remote hasn't initialized yet
                    if( info.hasElement("time") )
                        warning() << "heatbeat.time isn't a number: " << info << endl;
                    mem.skew = INT_MIN;
                }

                {
                    be state = info["state"];
                    if( state.ok() )
                        mem.hbstate = MemberState(state.Int());
                }
                if( ok ) {
                    HeartbeatInfo::numPings++;

                    if( mem.upSince == 0 ) {
                        log() << "replSet info member " << h.toString() << " is up" << rsLog;
                        mem.upSince = mem.lastHeartbeat;
                    }
                    mem.health = 1.0;
                    mem.lastHeartbeatMsg = info["hbmsg"].String();
                    if( info.hasElement("opTime") )
                        mem.opTime = info["opTime"].Date();

                    // see if this member is in the electable set
                    if( info["e"].eoo() ) {
                        // for backwards compatibility
                        const Member *member = theReplSet->findById(mem.id());
                        if (member && member->config().potentiallyHot()) {
                            theReplSet->addToElectable(mem.id());
                        }
                        else {
                            theReplSet->rmFromElectable(mem.id());
                        }
                    }
                    // add this server to the electable set if it is within 10
                    // seconds of the latest optime we know of
                    else if( info["e"].trueValue() &&
                             mem.opTime >= theReplSet->lastOpTimeWritten.getSecs() - 10) {
                        unsigned lastOp = theReplSet->lastOtherOpTime().getSecs();
                        if (lastOp > 0 && mem.opTime >= lastOp - 10) {
                            theReplSet->addToElectable(mem.id());
                        }
                    }
                    else {
                        theReplSet->rmFromElectable(mem.id());
                    }
                    
                    be cfg = info["config"];
                    if( cfg.ok() ) {
                        // received a new config
                        boost::function<void()> f =
                            boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
                        theReplSet->mgr->send(f);
                    }
                }
                else {
                    down(mem, info.getStringField("errmsg"));
                }
            }
            catch(DBException& e) {
                down(mem, e.what());
            }
            catch(...) {
                down(mem, "replSet unexpected exception in ReplSetHealthPollTask");
            }
            m = mem;

            theReplSet->mgr->send( boost::bind(&ReplSet::msgUpdateHBInfo, theReplSet, mem) );

            static time_t last = 0;
            time_t now = time(0);
            bool changed = mem.changed(old);
            if( changed ) {
                if( old.hbstate != mem.hbstate )
                    log() << "replSet member " << h.toString() << " is now in state " << mem.hbstate.toString() << rsLog;
            }
            if( changed || now-last>4 ) {
                last = now;
                theReplSet->mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
            }
        }
Example #15
0
        void doWork() { 
            if ( !theReplSet ) {
                log(2) << "theReplSet not initialized yet, skipping health poll this round" << rsLog;
                return;
            }

            HeartbeatInfo mem = m;
            HeartbeatInfo old = mem;
            try { 
                BSONObj info;
                int theirConfigVersion = -10000;

                time_t before = time(0);

                bool ok = requestHeartbeat(theReplSet->name(), theReplSet->selfFullName(), h.toString(), info, theReplSet->config().version, theirConfigVersion);

                time_t after = mem.lastHeartbeat = time(0); // we set this on any response - we don't get this far if couldn't connect because exception is thrown

                try {
                    mem.skew = 0;
                    long long t = info["time"].Long();
                    if( t > after ) 
                        mem.skew = (int) (t - after);
                    else if( t < before ) 
                        mem.skew = (int) (t - before); // negative
                }
                catch(...) { 
                    mem.skew = INT_MIN;
                }

                {
                    be state = info["state"];
                    if( state.ok() )
                        mem.hbstate = MemberState(state.Int());
                }
                if( ok ) {
                    if( mem.upSince == 0 ) {
                        log() << "replSet info " << h.toString() << " is now up" << rsLog;
                        mem.upSince = mem.lastHeartbeat;
                    }
                    mem.health = 1.0;
                    mem.lastHeartbeatMsg = info["hbmsg"].String();
                    if( info.hasElement("opTime") )
                        mem.opTime = info["opTime"].Date();

                    be cfg = info["config"];
                    if( cfg.ok() ) {
                        // received a new config
                        boost::function<void()> f = 
                            boost::bind(&Manager::msgReceivedNewConfig, theReplSet->mgr, cfg.Obj().copy());
                        theReplSet->mgr->send(f);
                    }
                }
                else { 
                    down(mem, info.getStringField("errmsg"));
                }
            }
            catch(...) { 
                down(mem, "connect/transport error");             
            }
            m = mem;

            theReplSet->mgr->send( boost::bind(&ReplSet::msgUpdateHBInfo, theReplSet, mem) );

            static time_t last = 0;
            time_t now = time(0);
            bool changed = mem.changed(old);
            if( changed ) { 
                if( old.hbstate != mem.hbstate ) 
                    log() << "replSet " << h.toString() << ' ' << mem.hbstate.toString() << rsLog;
            }
            if( changed || now-last>4 ) {
                last = now;
                theReplSet->mgr->send( boost::bind(&Manager::msgCheckNewState, theReplSet->mgr) );
            }
        }