Exemple #1
0
    void ReplSetImpl::syncThread() {
        while( 1 ) {
            // After a reconfig, we may not be in the replica set anymore, so
            // check that we are in the set (and not an arbiter) before
            // trying to sync with other replicas.
            if( ! _self ) {
                log() << "replSet warning did not receive a valid config yet, sleeping 20 seconds " << rsLog;
                sleepsecs(20);
                continue;
            }
            if( myConfig().arbiterOnly ) {
                return;
            }

            fassert(16113, !Lock::isLocked());

            try {
                _syncThread();
            }
            catch(const DBException& e) {
                sethbmsg(str::stream() << "syncThread: " << e.toString());
                sleepsecs(10);
            }
            catch(...) {
                sethbmsg("unexpected exception in syncThread()");
                // TODO : SET NOT SECONDARY here?
                sleepsecs(60);
            }
            sleepsecs(1);
        }
    }
Exemple #2
0
    void BackgroundSync::producerThread() {
        Client::initThread("rsBackgroundSync");
        replLocalAuth();

        while (!inShutdown()) {
            if (!theReplSet) {
                log() << "replSet warning did not receive a valid config yet, sleeping 20 seconds " << rsLog;
                sleepsecs(20);
                continue;
            }

            try {
                _producerThread();
            }
            catch (DBException& e) {
                sethbmsg(str::stream() << "db exception in producer: " << e.toString());
                sleepsecs(10);
            }
            catch (std::exception& e2) {
                sethbmsg(str::stream() << "exception in producer: " << e2.what());
                sleepsecs(60);
            }
        }

        cc().shutdown();
    }
Exemple #3
0
    /* should be in RECOVERING state on arrival here.
       readlocks
       @return true if transitioned to SECONDARY
    */
    bool ReplSetImpl::tryToGoLiveAsASecondary(OpTime& /*out*/ minvalid) {
        bool golive = false;

        {
            lock lk( this );

            if (_maintenanceMode > 0) {
                // we're not actually going live
                return true;
            }
        }

        {
            readlock lk("local.replset.minvalid");
            BSONObj mv;
            if( Helpers::getSingleton("local.replset.minvalid", mv) ) {
                minvalid = mv["ts"]._opTime();
                if( minvalid <= lastOpTimeWritten ) {
                    golive=true;
                }
            }
            else
                golive = true; /* must have been the original member */
        }
        if( golive ) {
            sethbmsg("");
            changeState(MemberState::RS_SECONDARY);
        }
        return golive;
    }
Exemple #4
0
    void ReplSetImpl::syncTail() { 
        // todo : locking vis a vis the mgr...

        const Member *primary = box.getPrimary();
        if( primary == 0 ) return;
        string hn = primary->h().toString();
        OplogReader r;
        if( !r.connect(primary->h().toString()) ) { 
            log(2) << "replSet can't connect to " << hn << " to read operations" << rsLog;
            return;
        }

        /* first make sure we are not hopelessly out of sync by being very stale. */
        {
            BSONObj remoteOldestOp = r.findOne(rsoplog, Query());
            OpTime ts = remoteOldestOp["ts"]._opTime();
            DEV log() << "remoteOldestOp: " << ts.toStringPretty() << endl;
            else log(3) << "remoteOldestOp: " << ts.toStringPretty() << endl;
            if( lastOpTimeWritten < ts ) { 
                log() << "replSet error too stale to catch up, at least from primary " << hn << rsLog;
                log() << "replSet our last optime : " << lastOpTimeWritten.toStringPretty() << rsLog;
                log() << "replSet oldest at " << hn << " : " << ts.toStringPretty() << rsLog;
                log() << "replSet See http://www.mongodb.org/display/DOCS/Resyncing+a+Very+Stale+Replica+Set+Member" << rsLog;
                sethbmsg("error too stale to catch up");
                sleepsecs(120);
                return;
            }
        }
Exemple #5
0
    /* should be in RECOVERING state on arrival here.
       readlocks
       @return true if transitioned to SECONDARY
    */
    bool ReplSetImpl::tryToGoLiveAsASecondary(OperationContext* txn, OpTime& /*out*/ minvalid) {
        bool golive = false;

        lock rsLock( this );

        if (_maintenanceMode > 0) {
            // we're not actually going live
            return true;
        }

        // if we're blocking sync, don't change state
        if (_blockSync) {
            return false;
        }

        Lock::GlobalWrite writeLock(txn->lockState());

        // make sure we're not primary, secondary, rollback, or fatal already
        if (box.getState().primary() || box.getState().secondary() ||
            box.getState().fatal()) {
            return false;
        }

        minvalid = getMinValid(txn);
        if( minvalid <= lastOpTimeWritten ) {
            golive=true;
        }
        else {
            sethbmsg(str::stream() << "still syncing, not yet to minValid optime " <<
                     minvalid.toString());
        }

        if( golive ) {
            sethbmsg("");
            changeState(MemberState::RS_SECONDARY);
        }
        return golive;
    }
        virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            log() << "replSet replSetTest command received: " << cmdObj.toString() << rsLog;

            if( cmdObj.hasElement("forceInitialSyncFailure") ) {
                replSetForceInitialSyncFailure = (unsigned) cmdObj["forceInitialSyncFailure"].Number();
                return true;
            }

            if( !check(errmsg, result) )
                return false;

            if( cmdObj.hasElement("blind") ) {
                replSetBlind = cmdObj.getBoolField("blind");
                return true;
            }

            if (cmdObj.hasElement("sethbmsg")) {
                sethbmsg(cmdObj["sethbmsg"].String());
                return true;
            }

            return false;
        }
        virtual bool run(OperationContext* txn, const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
            log() << "replSet replSetTest command received: " << cmdObj.toString() << rsLog;

            if( cmdObj.hasElement("forceInitialSyncFailure") ) {
                replSetForceInitialSyncFailure = (unsigned) cmdObj["forceInitialSyncFailure"].Number();
                return true;
            }

            Status status = getGlobalReplicationCoordinator()->checkReplEnabledForCommand(&result);
            if (!status.isOK())
                return appendCommandStatus(result, status);

            if( cmdObj.hasElement("blind") ) {
                replSetBlind = cmdObj.getBoolField("blind");
                return true;
            }

            if (cmdObj.hasElement("sethbmsg")) {
                sethbmsg(cmdObj["sethbmsg"].String());
                return true;
            }

            return false;
        }
Exemple #8
0
    bool replset::InitialSync::oplogApplication(OplogReader& r, const Member* source,
        const OpTime& applyGTE, const OpTime& minValid) {

        const string hn = source->fullName();
        try {
            r.tailingQueryGTE( rsoplog, applyGTE );
            if ( !r.haveCursor() ) {
                log() << "replSet initial sync oplog query error" << rsLog;
                return false;
            }

            {
                if( !r.more() ) {
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElementFieldName() == string("$err") ) {
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }

                sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                         << t.toStringPretty() << " to " << minValid.toStringPretty());
            }
        }
        catch(DBException& e) {
            log() << "replSet initial sync failing: " << e.toString() << rsLog;
            return false;
        }

        /* we lock outside the loop to avoid the overhead of locking on every operation. */
        writelock lk("");

        // todo : use exhaust
        OpTime ts;
        time_t start = time(0);
        unsigned long long n = 0;
        int fails = 0;
        while( ts < minValid ) {
            try {
                // There are some special cases with initial sync (see the catch block), so we
                // don't want to break out of this while until we've reached minvalid. Thus, we'll
                // keep trying to requery.
                if( !r.more() ) {
                    OCCASIONALLY log() << "replSet initial sync oplog: no more records" << endl;
                    sleepsecs(1);

                    r.resetCursor();
                    r.tailingQueryGTE(rsoplog, theReplSet->lastOpTimeWritten);
                    if ( !r.haveCursor() ) {
                        if (fails++ > 30) {
                            log() << "replSet initial sync tried to query oplog 30 times, giving up" << endl;
                            return false;
                        }
                    }

                    continue;
                }

                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                ts = o["ts"]._opTime();

                {
                    if( (source->state() != MemberState::RS_PRIMARY &&
                            source->state() != MemberState::RS_SECONDARY) ||
                            replSetForceInitialSyncFailure ) {

                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    applyOp(o, applyGTE);
                }

                if ( ++n % 1000 == 0 ) {
                    time_t now = time(0);
                    if (now - start > 10) {
                        // simple progress metering
                        log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                              << ts.toStringPretty() << rsLog;
                        start = now;
                    }
                }

                getDur().commitIfNeeded();
            }
            catch (DBException& e) {
                // Skip duplicate key exceptions.
                // These are relatively common on initial sync: if a document is inserted
                // early in the clone step, the insert will be replayed but the document
                // will probably already have been cloned over.
                if( e.getCode() == 11000 || e.getCode() == 11001 || e.getCode() == 12582) {
                    continue;
                }
                
                // handle cursor not found (just requery)
                if( e.getCode() == 13127 ) {
                    log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
                    r.resetCursor();
                    r.tailingQueryGTE(rsoplog, ts);
                    if( r.haveCursor() ) {
                        continue;
                    }
                }

                // TODO: handle server restart

                if( ts <= minValid ) {
                    // didn't make it far enough
                    log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
                    return false;
                }

                // otherwise, whatever, we'll break out of the loop and catch
                // anything that's really wrong in syncTail
            }
        }
        return true;
    }
Exemple #9
0
    /* initial oplog application, during initial sync, after cloning. 
       @return false on failure.  
       this method returns an error and doesn't throw exceptions (i think).
    */
    bool ReplSetImpl::initialSyncOplogApplication(
        const Member *source,
        OpTime applyGTE,
        OpTime minValid)
    { 
        if( source == 0 ) return false;

        const string hn = source->h().toString();
        OpTime ts;
        try {
            OplogReader r;
            if( !r.connect(hn) ) { 
                log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog;
                return false;
            }

            {
                BSONObjBuilder q;
                q.appendDate("$gte", applyGTE.asDate());
                BSONObjBuilder query;
                query.append("ts", q.done());
                BSONObj queryObj = query.done();
                r.query(rsoplog, queryObj);
            }
            assert( r.haveCursor() );

            /* we lock outside the loop to avoid the overhead of locking on every operation.  server isn't usable yet anyway! */
            writelock lk("");

            {
                if( !r.more() ) { 
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElement().fieldName() == string("$err") ) { 
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }
            }

            // todo : use exhaust
            unsigned long long n = 0;
            while( 1 ) { 

                if( !r.more() )
                    break;
                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                {
                    ts = o["ts"]._opTime();

                    /* if we have become primary, we dont' want to apply things from elsewhere
                        anymore. assumePrimary is in the db lock so we are safe as long as 
                        we check after we locked above. */
                    if( (source->state() != MemberState::RS_PRIMARY &&
                         source->state() != MemberState::RS_SECONDARY) ||
                        replSetForceInitialSyncFailure ) {
                        
                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    if( ts >= applyGTE ) {
                        // optimes before we started copying need not be applied.
                        syncApply(o);
                    }
                    _logOpObjRS(o);   /* with repl sets we write the ops to our oplog too */
                }
                if( ++n % 100000 == 0 ) { 
                    // simple progress metering
                    log() << "replSet initialSyncOplogApplication " << n << rsLog;
                }
            }
        }
        catch(DBException& e) { 
            if( ts <= minValid ) {
                // didn't make it far enough
                log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
                return false;
            }
        }
        return true;
    }
    void TopologyCoordinatorImpl::chooseNewSyncSource(Date_t now) {
        // if we have a target we've requested to sync from, use it
/*
  This should be a HostAndPort.
*/
        // XXX Eric
/*
        if (_forceSyncTarget) {
            Member* target = _forceSyncTarget;
            _forceSyncTarget = 0;
            sethbmsg( str::stream() << "syncing to: " << target->fullName() << " by request", 0);
            return target;
        }
*/

        // wait for 2N pings before choosing a sync target
        int needMorePings = _currentConfig.members.size()*2 - HeartbeatInfo::numPings;

        if (needMorePings > 0) {
            OCCASIONALLY log() << "waiting for " << needMorePings 
                               << " pings from other members before syncing";
            return;
        }

        // If we are only allowed to sync from the primary, set that
        if (!_currentConfig.chainingAllowed) {
            // Sets NULL if we cannot reach the primary
            _syncSource = _currentPrimary;
        }

        // find the member with the lowest ping time that has more data than me

        // Find primary's oplog time. Reject sync candidates that are more than
        // maxSyncSourceLagSecs seconds behind.
        OpTime primaryOpTime;
        if (_currentPrimary)
            primaryOpTime = _currentPrimary->hbinfo().opTime;
        else
            // choose a time that will exclude no candidates, since we don't see a primary
            primaryOpTime = OpTime(maxSyncSourceLagSecs, 0);

        if (primaryOpTime.getSecs() < static_cast<unsigned int>(maxSyncSourceLagSecs)) {
            // erh - I think this means there was just a new election
            // and we don't yet know the new primary's optime
            primaryOpTime = OpTime(maxSyncSourceLagSecs, 0);
        }

        OpTime oldestSyncOpTime(primaryOpTime.getSecs() - maxSyncSourceLagSecs, 0);

        Member *closest = 0;

        // Make two attempts.  The first attempt, we ignore those nodes with
        // slave delay higher than our own.  The second attempt includes such
        // nodes, in case those are the only ones we can reach.
        // This loop attempts to set 'closest'.
        for (int attempts = 0; attempts < 2; ++attempts) {
            for (Member *m = _otherMembers.head(); m; m = m->next()) {
                if (!m->syncable())
                    continue;

                if (m->state() == MemberState::RS_SECONDARY) {
                    // only consider secondaries that are ahead of where we are
                    if (m->hbinfo().opTime <= _lastApplied)
                        continue;
                    // omit secondaries that are excessively behind, on the first attempt at least.
                    if (attempts == 0 &&
                        m->hbinfo().opTime < oldestSyncOpTime)
                        continue;
                }

                // omit nodes that are more latent than anything we've already considered
                if (closest &&
                    (m->hbinfo().ping > closest->hbinfo().ping))
                    continue;

                if (attempts == 0 &&
                    (_currentConfig.self->slaveDelay < m->config().slaveDelay 
                     || m->config().hidden)) {
                    continue; // skip this one in the first attempt
                }

                std::map<HostAndPort,Date_t>::iterator vetoed = _syncSourceBlacklist.find(m->h());
                if (vetoed != _syncSourceBlacklist.end()) {
                    // Do some veto housekeeping

                    // if this was on the veto list, check if it was vetoed in the last "while".
                    // if it was, skip.
                    if (vetoed->second >= now) {
                        if (now % 5 == 0) {
                            log() << "replSet not trying to sync from " << (*vetoed).first
                                  << ", it is vetoed for " << ((*vetoed).second - now) 
                                  << " more seconds" << rsLog;
                        }
                        continue;
                    }
                    _syncSourceBlacklist.erase(vetoed);
                    // fall through, this is a valid candidate now
                }
                // This candidate has passed all tests; set 'closest'
                closest = m;
            }
            if (closest) break; // no need for second attempt
        }

        if (!closest) {
            return;
        }

        sethbmsg( str::stream() << "syncing to: " << closest->fullName(), 0);
        _syncSource = closest;
    }
Exemple #11
0
    void ReplSetImpl::loadConfig(OperationContext* txn) {
        startupStatus = LOADINGCONFIG;
        startupStatusMsg.set("loading " + rsConfigNs + " config (LOADINGCONFIG)");
        LOG(1) << "loadConfig() " << rsConfigNs << endl;

        while (1) {
            try {
                OwnedPointerVector<ReplSetConfig> configs;
                try {
                    configs.mutableVector().push_back(ReplSetConfig::makeDirect(txn));
                }
                catch (DBException& e) {
                    log() << "replSet exception loading our local replset configuration object : "
                          << e.toString() << rsLog;
                }
                for (vector<HostAndPort>::const_iterator i = _seeds->begin();
                        i != _seeds->end();
                        i++) {
                    try {
                        configs.mutableVector().push_back(ReplSetConfig::make(txn, *i));
                    }
                    catch (DBException& e) {
                        log() << "replSet exception trying to load config from " << *i
                              << " : " << e.toString() << rsLog;
                    }
                }
                ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings();
                {
                    scoped_lock lck(replSettings.discoveredSeeds_mx);
                    if (replSettings.discoveredSeeds.size() > 0) {
                        for (set<string>::iterator i = replSettings.discoveredSeeds.begin(); 
                             i != replSettings.discoveredSeeds.end(); 
                             i++) {
                            try {
                                configs.mutableVector().push_back(
                                                            ReplSetConfig::make(txn, HostAndPort(*i)));
                            }
                            catch (DBException&) {
                                LOG(1) << "replSet exception trying to load config from discovered "
                                          "seed " << *i << rsLog;
                                replSettings.discoveredSeeds.erase(*i);
                            }
                        }
                    }
                }

                if (!replSettings.reconfig.isEmpty()) {
                    try {
                        configs.mutableVector().push_back(ReplSetConfig::make(txn, replSettings.reconfig,
                                                                       true));
                    }
                    catch (DBException& re) {
                        log() << "replSet couldn't load reconfig: " << re.what() << rsLog;
                        replSettings.reconfig = BSONObj();
                    }
                }

                int nok = 0;
                int nempty = 0;
                for (vector<ReplSetConfig*>::iterator i = configs.mutableVector().begin();
                     i != configs.mutableVector().end(); i++) {
                    if ((*i)->ok())
                        nok++;
                    if ((*i)->empty())
                        nempty++;
                }
                if (nok == 0) {

                    if (nempty == (int) configs.mutableVector().size()) {
                        startupStatus = EMPTYCONFIG;
                        startupStatusMsg.set("can't get " + rsConfigNs +
                                             " config from self or any seed (EMPTYCONFIG)");
                        log() << "replSet can't get " << rsConfigNs
                              << " config from self or any seed (EMPTYCONFIG)" << rsLog;
                        static unsigned once;
                        if (++once == 1) {
                            log() << "replSet info you may need to run replSetInitiate -- rs.initia"
                                     "te() in the shell -- if that is not already done" << rsLog;
                        }
                        if (_seeds->size() == 0) {
                            LOG(1) << "replSet info no seed hosts were specified on the --replSet "
                                      "command line" << rsLog;
                        }
                    }
                    else {
                        startupStatus = EMPTYUNREACHABLE;
                        startupStatusMsg.set("can't currently get " + rsConfigNs +
                                             " config from self or any seed (EMPTYUNREACHABLE)");
                        log() << "replSet can't get " << rsConfigNs
                              << " config from self or any seed (yet)" << rsLog;
                    }

                    sleepsecs(1);
                    continue;
                }

                if (!_loadConfigFinish(txn, configs.mutableVector())) {
                    log() << "replSet info Couldn't load config yet. Sleeping 3 sec and will try "
                             "again." << rsLog;
                    sleepsecs(3);
                    continue;
                }
            }
            catch (DBException& e) {
                startupStatus = BADCONFIG;
                startupStatusMsg.set("replSet error loading set config (BADCONFIG)");
                log() << "replSet error loading configurations " << e.toString() << rsLog;
                log() << "replSet error replication will not start" << rsLog;
                sethbmsg("error loading set config");
                fassertFailedNoTrace(18754);
                throw;
            }
            break;
        }
        startupStatusMsg.set("? started");
        startupStatus = STARTED;
    }
Exemple #12
0
    void BackgroundSync::applyOpsFromOplog() {
        GTID lastLiveGTID;
        GTID lastUnappliedGTID;
        while (1) {
            try {
                BSONObj curr;
                {
                    boost::unique_lock<boost::mutex> lck(_mutex);
                    // wait until we know an item has been produced
                    while (_deque.size() == 0 && !_applierShouldExit) {
                        _queueDone.notify_all();
                        _queueCond.wait(lck);
                    }
                    if (_deque.size() == 0 && _applierShouldExit) {
                        return; 
                    }
                    curr = _deque.front();
                }
                GTID currEntry = getGTIDFromOplogEntry(curr);
                theReplSet->gtidManager->noteApplyingGTID(currEntry);
                // we must do applyTransactionFromOplog in a loop
                // because once we have called noteApplyingGTID, we must
                // continue until we are successful in applying the transaction.
                for (uint32_t numTries = 0; numTries <= 100; numTries++) {
                    try {
                        numTries++;
                        TimerHolder timer(&applyBatchStats);
                        applyTransactionFromOplog(curr);
                        opsAppliedStats.increment();
                        break;
                    }
                    catch (std::exception &e) {
                        log() << "exception during applying transaction from oplog: " << e.what() << endl;
                        log() << "oplog entry: " << curr.str() << endl;
                        if (numTries == 100) {
                            // something is really wrong if we fail 100 times, let's abort
                            ::abort();
                        }
                        sleepsecs(1);
                    }
                }
                LOG(3) << "applied " << curr.toString(false, true) << endl;
                theReplSet->gtidManager->noteGTIDApplied(currEntry);

                {
                    boost::unique_lock<boost::mutex> lck(_mutex);
                    dassert(_deque.size() > 0);
                    _deque.pop_front();
                    bufferCountGauge.increment(-1);
                    bufferSizeGauge.increment(-curr.objsize());
                    
                    // this is a flow control mechanism, with bad numbers
                    // hard coded for now just to get something going.
                    // If the opSync thread notices that we have over 20000
                    // transactions in the queue, it waits until we get below
                    // 10000. This is where we signal that we have gotten there
                    // Once we have spilling of transactions working, this
                    // logic will need to be redone
                    if (_deque.size() == 10000) {
                        _queueCond.notify_all();
                    }
                }
            }
            catch (DBException& e) {
                sethbmsg(str::stream() << "db exception in producer on applier thread: " << e.toString());
                sleepsecs(2);
            }
            catch (std::exception& e2) {
                sethbmsg(str::stream() << "exception in producer on applier thread: " << e2.what());
                sleepsecs(2);
            }
        }
    }
Exemple #13
0
    bool ReplSetImpl::_initialSyncOplogApplication(OplogReader& r, const Member *source,
        const OpTime& applyGTE, const OpTime& minValid) {

        const string hn = source->fullName();
        OplogReader missingObjReader;
        try {
            r.tailingQueryGTE( rsoplog, applyGTE );
            if ( !r.haveCursor() ) {
                log() << "replSet initial sync oplog query error" << rsLog;
                return false;
            }

            {
                if( !r.more() ) {
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElementFieldName() == string("$err") ) {
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }

                sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                         << t.toStringPretty() << " to " << minValid.toStringPretty());
            }
        }
        catch(DBException& e) {
            log() << "replSet initial sync failing: " << e.toString() << rsLog;
            return false;
        }

        /* we lock outside the loop to avoid the overhead of locking on every operation. */
        writelock lk("");

        // todo : use exhaust
        OpTime ts;
        time_t start = time(0);
        unsigned long long n = 0;
        while( 1 ) {
            try {
                if( !r.more() )
                    break;
                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                ts = o["ts"]._opTime();

                {
                    if( (source->state() != MemberState::RS_PRIMARY &&
                            source->state() != MemberState::RS_SECONDARY) ||
                            replSetForceInitialSyncFailure ) {

                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    if( ts >= applyGTE ) { // optimes before we started copying need not be applied.
                        bool failedUpdate = syncApply(o);
                        if( failedUpdate ) {
                            // we don't have the object yet, which is possible on initial sync.  get it.
                            log() << "replSet info adding missing object" << endl; // rare enough we can log
                            if( !missingObjReader.connect(hn) ) { // ok to call more than once
                                log() << "replSet initial sync fails, couldn't connect to " << hn << endl;
                                return false;
                            }
                            const char *ns = o.getStringField("ns");
                            BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj(); // might be more than just _id in the update criteria
                            BSONObj missingObj;
                            try {
                                missingObj = missingObjReader.findOne(
                                    ns, 
                                    query );
                            } catch(...) { 
                                log() << "replSet assertion fetching missing object" << endl;
                                throw;
                            }
                            if( missingObj.isEmpty() ) { 
                                log() << "replSet missing object not found on source. presumably deleted later in oplog" << endl;
                                log() << "replSet o2: " << o.getObjectField("o2").toString() << endl;
                                log() << "replSet o firstfield: " << o.getObjectField("o").firstElementFieldName() << endl;
                            }
                            else {
                                Client::Context ctx(ns);
                                try {
                                    DiskLoc d = theDataFileMgr.insert(ns, (void*) missingObj.objdata(), missingObj.objsize());
                                    assert( !d.isNull() );
                                } catch(...) { 
                                    log() << "replSet assertion during insert of missing object" << endl;
                                    throw;
                                }
                                // now reapply the update from above
                                bool failed = syncApply(o);
                                if( failed ) {
                                    log() << "replSet update still fails after adding missing object " << ns << endl;
                                    assert(false);
                                }
                            }
                        }
                    }
                    _logOpObjRS(o);   /* with repl sets we write the ops to our oplog too */
                }

                if ( ++n % 1000 == 0 ) {
                    time_t now = time(0);
                    if (now - start > 10) {
                        // simple progress metering
                        log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                              << ts.toStringPretty() << rsLog;
                        start = now;
                    }
                }

                if ( ts > minValid ) {
                    break;
                }

                getDur().commitIfNeeded();
            }
            catch (DBException& e) {
                // skip duplicate key exceptions
                if( e.getCode() == 11000 || e.getCode() == 11001 ) {
                    continue;
                }
                
                // handle cursor not found (just requery)
                if( e.getCode() == 13127 ) {
                    log() << "replSet requerying oplog after cursor not found condition, ts: " << ts.toStringPretty() << endl;
                    r.resetCursor();
                    r.tailingQueryGTE(rsoplog, ts);
                    if( r.haveCursor() ) {
                        continue;
                    }
                }

                // TODO: handle server restart

                if( ts <= minValid ) {
                    // didn't make it far enough
                    log() << "replSet initial sync failing, error applying oplog : " << e.toString() << rsLog;
                    return false;
                }

                // otherwise, whatever
                break;
            }
        }
        return true;
    }
Exemple #14
0
    void BackgroundSync::producerThread() {
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _opSyncInProgress = true;
        }
        Client::initThread("rsBackgroundSync");
        replLocalAuth();
        uint32_t timeToSleep = 0;

        while (!_opSyncShouldExit) {
            try {
                if (timeToSleep) {
                    {
                        boost::unique_lock<boost::mutex> lck(_mutex);
                        _opSyncRunning = false;
                        // notify other threads that we are not running
                        _opSyncRunningCondVar.notify_all();
                    }
                    for (uint32_t i = 0; i < timeToSleep; i++) {
                        sleepsecs(1);
                        // get out if we need to
                        if (_opSyncShouldExit) { break; }
                    }
                    timeToSleep = 0;
                }
                // get out if we need to
                if (_opSyncShouldExit) { break; }

                {
                    boost::unique_lock<boost::mutex> lck(_mutex);
                    _opSyncRunning = false;

                    while (!_opSyncShouldRun && !_opSyncShouldExit) {
                        // notify other threads that we are not running
                        _opSyncRunningCondVar.notify_all();
                        // wait for permission that we can run
                        _opSyncCanRunCondVar.wait(lck);
                    }

                    // notify other threads that we are running
                    _opSyncRunningCondVar.notify_all();
                    _opSyncRunning = true;
                }
                // get out if we need to
                if (_opSyncShouldExit) { break; }

                MemberState state = theReplSet->state();
                if (state.fatal() || state.startup()) {
                    timeToSleep = 5;
                    continue;
                }
                // this does the work of reading a remote oplog
                // and writing it to our oplog
                timeToSleep = produce();
            }
            catch (DBException& e) {
                sethbmsg(str::stream() << "db exception in producer: " << e.toString());
                timeToSleep = 10;
            }
            catch (std::exception& e2) {
                sethbmsg(str::stream() << "exception in producer: " << e2.what());
                timeToSleep = 10;
            }
        }

        cc().shutdown();
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _opSyncRunning = false;
            _opSyncInProgress = false;
        }
    }
Exemple #15
0
    void BackgroundSync::connectOplogReader(OperationContext* txn, 
                                            ReplicationCoordinatorImpl* replCoordImpl,
                                            OplogReader* reader) {
        OpTime lastOpTimeFetched;
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }
        Date_t now(curTimeMillis64());
        OpTime oldestOpTimeSeen(now,0);

        while (true) {
            HostAndPort candidate = replCoordImpl->chooseNewSyncSource();

            if (candidate.empty()) {
                if (oldestOpTimeSeen == OpTime(now,0)) {
                    // If, in this invocation of connectOplogReader(), we did not successfully 
                    // connect to any node ahead of us,
                    // we apparently have no sync sources to connect to.
                    // This situation is common; e.g. if there are no writes to the primary at
                    // the moment.
                    return;
                }

                // Connected to at least one member, but in all cases we were too stale to use them
                // as a sync source.
                log() << "replSet error RS102 too stale to catch up" << rsLog;
                log() << "replSet our last optime : " << lastOpTimeFetched.toStringLong() << rsLog;
                log() << "replSet oldest available is " << oldestOpTimeSeen.toStringLong() <<
                    rsLog;
                log() << "replSet "
                    "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember" 
                      << rsLog;
                sethbmsg("error RS102 too stale to catch up");
                theReplSet->setMinValid(txn, oldestOpTimeSeen);
                replCoordImpl->setFollowerMode(MemberState::RS_RECOVERING);
                return;
            }

            if (!reader->connect(candidate)) {
                LOG(2) << "replSet can't connect to " << candidate.toString() << 
                    " to read operations" << rsLog;
                reader->resetConnection();
                replCoordImpl->blacklistSyncSource(candidate, Date_t(curTimeMillis64() + 10*1000));
                continue;
            }
            // Read the first (oldest) op and confirm that it's not newer than our last
            // fetched op. Otherwise, we have fallen off the back of that source's oplog.
            BSONObj remoteOldestOp(reader->findOne(rsoplog, Query()));
            BSONElement tsElem(remoteOldestOp["ts"]);
            if (tsElem.type() != Timestamp) {
                // This member's got a bad op in its oplog.
                warning() << "oplog invalid format on node " << candidate.toString();
                reader->resetConnection();
                replCoordImpl->blacklistSyncSource(candidate, 
                                                    Date_t(curTimeMillis64() + 600*1000));
                continue;
            }
            OpTime remoteOldOpTime = tsElem._opTime();

            if (lastOpTimeFetched < remoteOldOpTime) {
                // We're too stale to use this sync source.
                reader->resetConnection();
                replCoordImpl->blacklistSyncSource(candidate, 
                                                    Date_t(curTimeMillis64() + 600*1000));
                if (oldestOpTimeSeen > remoteOldOpTime) {
                    warning() << "we are too stale to use " << candidate.toString() << 
                        " as a sync source";
                    oldestOpTimeSeen = remoteOldOpTime;
                }
                continue;
            }

            // Got a valid sync source.
            return;
        } // while (true)

    }
Exemple #16
0
    void runSyncThread() {
        Client::initThread("rsSync");
        replLocalAuth();
        ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

        // Set initial indexPrefetch setting
        std::string& prefetch = replCoord->getSettings().rsIndexPrefetch;
        if (!prefetch.empty()) {
            BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL;
            if (prefetch == "none")
                prefetchConfig = BackgroundSync::PREFETCH_NONE;
            else if (prefetch == "_id_only")
                prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY;
            else if (prefetch == "all")
                prefetchConfig = BackgroundSync::PREFETCH_ALL;
            else {
                warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting "
                          << "to \"all\"";
            }
            BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig);
        }

        while (!inShutdown()) {
            // After a reconfig, we may not be in the replica set anymore, so
            // check that we are in the set (and not an arbiter) before
            // trying to sync with other replicas.
            // TODO(spencer): Use a condition variable to await loading a config
            if (replCoord->getReplicationMode() != ReplicationCoordinator::modeReplSet) {
                log() << "replSet warning did not receive a valid config yet, sleeping 5 seconds "
                      << rsLog;
                sleepsecs(5);
                continue;
            }

            const MemberState memberState = replCoord->getCurrentMemberState();
            if (replCoord->getCurrentMemberState().arbiter()) {
                break;
            }

            try {

                if (memberState.primary()) {
                    sleepsecs(1);
                    continue;
                }

                bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag();
                // Check criteria for doing an initial sync:
                // 1. If the oplog is empty, do an initial sync
                // 2. If minValid has _initialSyncFlag set, do an initial sync
                // 3. If initialSyncRequested is true
                if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() ||
                        getInitialSyncFlag() ||
                        initialSyncRequested) {
                    syncDoInitialSync();
                    continue; // start from top again in case sync failed.
                }
                replCoord->setFollowerMode(MemberState::RS_RECOVERING);

                /* we have some data.  continue tailing. */
                SyncTail tail(BackgroundSync::get(), multiSyncApply);
                tail.oplogApplication();
            }
            catch(const DBException& e) {
                log() << "Received exception while syncing: " << e.toString();
                sleepsecs(10);
            }
            catch(...) {
                sethbmsg("unexpected exception in syncThread()");
                // TODO : SET NOT SECONDARY here?
                sleepsecs(60);
            }
        }
        cc().shutdown();
    }
Exemple #17
0
    /* initial oplog application, during initial sync, after cloning.
       @return false on failure.
       this method returns an error and doesn't throw exceptions (i think).
    */
    bool ReplSetImpl::initialSyncOplogApplication(
        const Member *source,
        OpTime applyGTE,
        OpTime minValid) {
        if( source == 0 ) return false;

        const string hn = source->h().toString();
        OplogReader r;
        try {
            if( !r.connect(hn) ) {
                log() << "replSet initial sync error can't connect to " << hn << " to read " << rsoplog << rsLog;
                return false;
            }

            r.queryGTE( rsoplog, applyGTE );
            assert( r.haveCursor() );

            {
                if( !r.more() ) {
                    sethbmsg("replSet initial sync error reading remote oplog");
                    log() << "replSet initial sync error remote oplog (" << rsoplog << ") on host " << hn << " is empty?" << rsLog;
                    return false;
                }
                bo op = r.next();
                OpTime t = op["ts"]._opTime();
                r.putBack(op);

                if( op.firstElement().fieldName() == string("$err") ) {
                    log() << "replSet initial sync error querying " << rsoplog << " on " << hn << " : " << op.toString() << rsLog;
                    return false;
                }

                uassert( 13508 , str::stream() << "no 'ts' in first op in oplog: " << op , !t.isNull() );
                if( t > applyGTE ) {
                    sethbmsg(str::stream() << "error " << hn << " oplog wrapped during initial sync");
                    log() << "replSet initial sync expected first optime of " << applyGTE << rsLog;
                    log() << "replSet initial sync but received a first optime of " << t << " from " << hn << rsLog;
                    return false;
                }

                sethbmsg(str::stream() << "initial oplog application from " << hn << " starting at "
                         << t.toStringPretty() << " to " << minValid.toStringPretty());
            }
        }
        catch(DBException& e) {
            log() << "replSet initial sync failing: " << e.toString() << rsLog;
            return false;
        }

        /* we lock outside the loop to avoid the overhead of locking on every operation. */
        writelock lk("");

        // todo : use exhaust
        OpTime ts;
        time_t start = time(0);
        unsigned long long n = 0;
        while( 1 ) {
            try {
                if( !r.more() )
                    break;
                BSONObj o = r.nextSafe(); /* note we might get "not master" at some point */
                {
                    ts = o["ts"]._opTime();

                    /* if we have become primary, we dont' want to apply things from elsewhere
                        anymore. assumePrimary is in the db lock so we are safe as long as
                        we check after we locked above. */
                    if( (source->state() != MemberState::RS_PRIMARY &&
                            source->state() != MemberState::RS_SECONDARY) ||
                            replSetForceInitialSyncFailure ) {

                        int f = replSetForceInitialSyncFailure;
                        if( f > 0 ) {
                            replSetForceInitialSyncFailure = f-1;
                            log() << "replSet test code invoked, replSetForceInitialSyncFailure" << rsLog;
                            throw DBException("forced error",0);
                        }
                        log() << "replSet we are now primary" << rsLog;
                        throw DBException("primary changed",0);
                    }

                    if( ts >= applyGTE ) {
                        // optimes before we started copying need not be applied.
                        syncApply(o);
                    }
                    _logOpObjRS(o);   /* with repl sets we write the ops to our oplog too */
                }

                if ( ++n % 1000 == 0 ) {
                    time_t now = time(0);
                    if (now - start > 10) {
                        // simple progress metering
                        log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                              << ts.toStringPretty() << rsLog;
                        start = now;
                    }
                }
                
                getDur().commitIfNeeded();
            }
            catch (DBException& e) {
                // skip duplicate key exceptions
                if( e.getCode() == 11000 || e.getCode() == 11001 ) {
                    continue;
                }
                
                // handle cursor not found (just requery)
                if( e.getCode() == 13127 ) {
                    r.resetCursor();
                    r.queryGTE(rsoplog, ts);
                    if( r.haveCursor() ) {
                        continue;
                    }
                }

                // TODO: handle server restart

                if( ts <= minValid ) {
                    // didn't make it far enough
                    log() << "replSet initial sync failing, error applying oplog " << e.toString() << rsLog;
                    return false;
                }

                // otherwise, whatever
                break;
            }
        }
        return true;
    }