Example #1
    /** thread for timing out old cursors */
    void ClientCursorMonitor::run() {
        Client::initThread("clientcursormon");
        Client& client = cc();

        unsigned old = curTimeMillis();

        const int Secs = 4;
        unsigned n = 0;
        while ( ! inShutdown() ) {
            unsigned now = curTimeMillis();
            ClientCursor::idleTimeReport( now - old );
            old = now;
            sleepsecs(Secs);
            if ( ++n % (60 / Secs) == 0 ) { // once a minute
                sayMemoryStatus();
            }
        }

        client.shutdown();
    }
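For reference, the sleep helpers that pace every loop on this page have roughly the following shapes. This is a sketch inferred purely from the call sites in these examples, not the authoritative declarations (which live in MongoDB's time-support utilities):

    // Assumed signatures, inferred from usage in the examples on this page.
    void sleepsecs(int secs);         // block the calling thread for whole seconds
    void sleepmillis(long long ms);   // block the calling thread for milliseconds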
Example #2
TEST(LockManagerTest, TxShutdown) {
    LockManager lm;
    ClientTransaction t1(&lm, 1);
    ClientTransaction t2(&lm, 2);

    t1.acquire(kShared, 1, ACQUIRED);
    lm.shutdown(3000);

    // t1 can still do work while quiescing
    t1.release(kShared, 1);
    t1.acquire(kShared, 2, ACQUIRED);
#ifdef TRANSACTION_REGISTRATION
    // t2 is new and should be refused
    t2.acquire(kShared, 3, ABORTED);
#else
    t2.quit();
#endif
    // after the quiescing period, t1's request should be refused
    sleepsecs(3);
    t1.acquire(kShared, 4, ABORTED);
}
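The test encodes the intended shutdown semantics: during the 3000 ms quiescing window an already-active transaction (t1) may continue acquiring and releasing locks, a brand-new transaction (t2) is refused immediately, and once the window has elapsed even the active transaction's requests are aborted.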
Example #3
    void DataFileSync::run() {
        Client::initThread( name().c_str() );

        if (mmapv1GlobalOptions.syncdelay == 0) {
            log() << "warning: --syncdelay 0 is not recommended and can have strange performance" << endl;
        }
        else if (mmapv1GlobalOptions.syncdelay == 1) {
            log() << "--syncdelay 1" << endl;
        }
        else if (mmapv1GlobalOptions.syncdelay != 60) {
            LOG(1) << "--syncdelay " << mmapv1GlobalOptions.syncdelay << endl;
        }
        int time_flushing = 0;
        while ( ! inShutdown() ) {
            _diaglog.flush();
            if (mmapv1GlobalOptions.syncdelay == 0) {
                // in case at some point we add an option to change at runtime
                sleepsecs(5);
                continue;
            }

            sleepmillis((long long) std::max(0.0, (mmapv1GlobalOptions.syncdelay * 1000) - time_flushing));

            if ( inShutdown() ) {
                // occasional issue trying to flush during shutdown when sleep interrupted
                break;
            }

            Date_t start = jsTime();
            StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
            int numFiles = storageEngine->flushAllFiles( true );
            time_flushing = (int) (jsTime() - start);

            _flushed(time_flushing);

            if( logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1)) || time_flushing >= 10000 ) {
                log() << "flushing mmaps took " << time_flushing << "ms " << " for " << numFiles << " files" << endl;
            }
        }
    }
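Note the pacing arithmetic: the loop sleeps for syncdelay * 1000 - time_flushing milliseconds, so flushes start on a roughly fixed cadence no matter how long the previous flush took. With syncdelay at 60, for example, a flush that took 2 seconds is followed by a 58-second sleep.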
Example #4
    /* note: not yet in mutex at this point.
       returns >= 0 if ok.  return -1 if you want to reconnect.
       return value of zero indicates no sleep necessary before next call
    */
    int ReplSource::sync(int& nApplied) {
        _sleepAdviceTime = 0;
        ReplInfo r("sync");
        if ( !cmdLine.quiet ) {
            Nullstream& l = log();
            l << "repl: from ";
            if( sourceName() != "main" ) {
                l << "source:" << sourceName() << ' ';
            }
            l << "host:" << hostName << endl;
        }
        nClonedThisPass = 0;

        // FIXME Handle cases where this db isn't on default port, or default port is spec'd in hostName.
        if ( (string("localhost") == hostName || string("127.0.0.1") == hostName) && cmdLine.port == CmdLine::DefaultDBPort ) {
            log() << "repl:   can't sync from self (localhost). sources configuration may be wrong." << endl;
            sleepsecs(5);
            return -1;
        }

        if ( !oplogReader.connect(hostName) ) {
            log(4) << "repl:  can't connect to sync source" << endl;
            return -1;
        }

        /*
            // get current mtime at the server.
            BSONObj o = conn->findOne("admin.$cmd", opTimeQuery);
            BSONElement e = o.getField("optime");
            if( e.eoo() ) {
                log() << "repl:   failed to get cur optime from master" << endl;
                log() << "        " << o.toString() << endl;
                return false;
            }
            uassert( 10124 ,  e.type() == Date );
            OpTime serverCurTime;
            serverCurTime.asDate() = e.date();
        */
        return sync_pullOpLog(nApplied);
    }
Example #5
        virtual void run() {
            Client::initThread( name().c_str() );

            while ( ! inShutdown() ) {
                sleepsecs( 60 );
                
                LOG(3) << "TTLMonitor thread awake" << endl;
                
                if ( lockedForWriting() ) {
                    // note: this is not perfect as you can go into fsync+lock between 
                    // this and actually doing the delete later
                    LOG(3) << " locked for writing" << endl;
                    continue;
                }

                // if part of replSet but not in a readable state (e.g. during initial sync), skip.
                if ( theReplSet && !theReplSet->state().readable() )
                    continue;

                set<string> dbs;
                {
                    Lock::DBRead lk( "local" );
                    dbHolder().getAllShortNames( dbs );
                }
                
                ttlPasses.increment();

                for ( set<string>::const_iterator i=dbs.begin(); i!=dbs.end(); ++i ) {
                    string db = *i;
                    try {
                        doTTLForDB( db );
                    }
                    catch ( DBException& e ) {
                        error() << "error processing ttl for db: " << db << " " << e << endl;
                    }
                }

            }
        }
Example #6
    void syncDoInitialSync() {
        static const int maxFailedAttempts = 10;

        OperationContextImpl txn;
        createOplog(&txn);

        int failedAttempts = 0;
        while ( failedAttempts < maxFailedAttempts ) {
            try {
                _initialSync();
                break;
            }
            catch(DBException& e) {
                failedAttempts++;
                error() << "initial sync exception: " << e.toString() << " " << 
                    (maxFailedAttempts - failedAttempts) << " attempts remaining";
                sleepsecs(5);
            }
        }
        fassert( 16233, failedAttempts < maxFailedAttempts);
    }
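This is a bounded retry loop: up to 10 attempts with a fixed 5-second sleep after each failure, and if every attempt fails, fassert(16233, ...) brings the process down rather than continuing without an initial sync.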
Example #7
        bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {

            cc().curop()->suppressFromCurop();
            cc().curop()->setExpectedLatencyMs( 30000 );

            BSONElement e = cmdObj.firstElement();
            if ( e.type() != jstOID ) {
                errmsg = "need oid as first value";
                return false;
            }

            // get the command issuer's (a mongos) serverID
            const OID id = e.__oid();

            // the command issuer is blocked awaiting a response
            // we want to return a response at least every 5 minutes so sockets don't time out
            BSONObj z;
            if ( writeBackManager.getWritebackQueue(id.str())->queue.blockingPop( z, 5 * 60 /* 5 minutes */ ) ) {
                LOG(1) << "WriteBackCommand got : " << z << endl;
                result.append( "data" , z );
            }
            else {
                result.appendBool( "noop" , true );
            }

#ifdef _DEBUG
            PseudoRandom r(static_cast<int64_t>(time(0)));
            // Usually sleep only a short amount of time
            int sleepFor = r.nextInt32( 10 );
            sleepmillis( sleepFor );

            // Sleep a longer amount of time every once in a while
            int sleepLong = r.nextInt32( 50 );
            if( sleepLong == 0 ) sleepsecs( 2 );
#endif

            return true;
        }
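The command long-polls: blockingPop() parks the caller for up to five minutes waiting for writeback data, and the { noop: true } response exists only so the blocked mongos receives a reply before its socket times out; the issuer can then poll again.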
Example #8
        int run() {
            _sleep = getParam( "sleep" , _sleep );

            auth();
            
            BSONObj prev = getData();

            while ( true ) {
                sleepsecs( _sleep );
                
                BSONObj now;
                try {
                    now = getData();
                }
                catch ( std::exception& e ) {
                    cout << "can't get data: " << e.what() << endl;
                    continue;
                }

                if ( now.isEmpty() )
                    return -2;
                
                try {
                    printDiff( prev , now );
                }
                catch ( AssertionException& e ) {
                    cout << "\nerror: " << e.what() << "\n"
                         << now
                         << endl;
                }


                prev = now;
            }

            return 0;
        }
Example #9
 void statsThread() {
     /*cout << "TEMP disabled statsthread" << endl;
     if( 1 ) 
         return;*/
     Client::initThread("stats");
     unsigned long long timeLastPass = 0;
     while ( 1 ) {
         {
             /* todo: do we even need readlock here?  if so for what? */
             readlock lk("");
             Top::completeSnapshot();
             q = (q+1)%NStats;
             Timing timing;
             dbMutex.info().getTimingInfo(timing.start, timing.timeLocked);
             unsigned long long now = curTimeMicros64();
             if ( timeLastPass ) {
                 unsigned long long dt = now - timeLastPass;
                 unsigned long long dlocked = timing.timeLocked - tlast.timeLocked;
                 {
                     stringstream ss;
                     ss << dt / 1000 << '\t';
                     ss << dlocked / 1000 << '\t';
                     if ( dt )
                         ss << (dlocked*100)/dt << '%';
                     string s = ss.str();
                     if ( cmdLine.cpu )
                         log() << "cpu: " << s << endl;
                     lockStats[q] = s;
                     ClientCursor::idleTimeReport( (unsigned) ((dt - dlocked)/1000) );
                 }
             }
             timeLastPass = now;
             tlast = timing;
         }
         sleepsecs(4);
     }
 }
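The percentage printed here is lock time over wall time for each 4-second window: dt and dlocked are in microseconds, the first two columns are printed in milliseconds (dt / 1000 and dlocked / 1000), and (dlocked * 100) / dt is the percent of elapsed time the database mutex was held.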
Example #10
        int run() {
            _sleep = getParam( "sleep" , _sleep );

            if (isMongos()) {
                log() << "mongotop only works on instances of mongod." << endl;
                return EXIT_FAILURE;
            }

            NamespaceStats prev = getData();

            while ( true ) {
                sleepsecs( _sleep );
                
                NamespaceStats now;
                try {
                    now = getData();
                }
                catch ( std::exception& e ) {
                    cout << "can't get data: " << e.what() << endl;
                    continue;
                }

                if ( now.size() == 0 )
                    return -2;
                
                try {
                    printDiff( prev , now );
                }
                catch ( AssertionException& e ) {
                    cout << "\nerror: " << e.what() << endl;
                }

                prev = now;
            }

            return 0;
        }
Example #11
            virtual void run(){

                int minutesRunning = 0;
                std::string lastRunningTestName, currentTestName;

                {
                    boost::lock_guard<boost::mutex> lk( globalCurrentTestNameMutex );
                    lastRunningTestName = globalCurrentTestName;
                }

                while (true) {
                    sleepsecs(60);
                    minutesRunning++;

                    {
                        boost::lock_guard<boost::mutex> lk( globalCurrentTestNameMutex );
                        currentTestName = globalCurrentTestName;
                    }

                    if (currentTestName != lastRunningTestName) {
                        minutesRunning = 0;
                        lastRunningTestName = currentTestName;
                    }

                    if (minutesRunning > 30){
                        log() << currentTestName << " has been running for more than 30 minutes. aborting." << endl;
                        ::abort();
                    }
                    else if (minutesRunning > 1){
                        warning() << currentTestName << " has been running for more than " << minutesRunning-1 << " minutes." << endl;

                        // See what is stuck
                        getGlobalLockManager()->dump();
                    }
                }
            }
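This watchdog aborts the whole test binary once a single test has run for more than 30 minutes, and after the first minute it dumps the global lock manager on each pass to show where the test is stuck.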
Example #12
    void PeriodicTask::Runner::run() { 
        int sleeptime = 60;
        DEV sleeptime = 5; // to catch race conditions

        while ( ! inShutdown() ) {

            sleepsecs( sleeptime );
            
            scoped_spinlock lk( _lock );
            
            size_t size = _tasks.size();
            
            for ( size_t i=0; i<size; i++ ) {
                PeriodicTask * t = _tasks[i];
                if ( ! t )
                    continue;

                if ( inShutdown() )
                    break;
                
                Timer timer;
                try {
                    t->taskDoWork();
                }
                catch ( std::exception& e ) {
                    error() << "task: " << t->taskName() << " failed: " << e.what() << endl;
                }
                catch ( ... ) {
                    error() << "task: " << t->taskName() << " failed with unknown error" << endl;
                }
                
                int ms = timer.millis();
                LOG( ms <= 3 ? 3 : 0 ) << "task: " << t->taskName() << " took: " << ms << "ms" << endl;
            }
        }
    }
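The runner only ever calls taskName() and taskDoWork() on each registered task, so a conforming task can be sketched as follows. This is a hypothetical example: the interface is inferred from those two call sites, and registration into _tasks is assumed to happen elsewhere (e.g. in the PeriodicTask constructor):

    // Hypothetical periodic task; only the members the runner actually uses are shown.
    class CacheSweeperTask : public PeriodicTask {
    public:
        virtual std::string taskName() { return "cacheSweeper"; }
        virtual void taskDoWork() {
            // do one bounded unit of work; the runner catches and logs
            // any exception thrown from here, so failures don't kill the thread
        }
    };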
Example #13
void BackgroundSync::_produce(OperationContext* txn, executor::TaskExecutor* taskExecutor) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    OplogReader syncSourceReader;
    syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

    // no server found
    if (syncSourceReader.getHost().empty()) {
        sleepsecs(1);
        // if there is no one to sync from
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_pause) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _syncSourceHost = syncSourceReader.getHost();
        _replCoord->signalUpstreamUpdater();
    }

    const Milliseconds oplogSocketTimeout(OplogReader::kSocketTimeout);

    // Prefer host in oplog reader to _syncSourceHost because _syncSourceHost may be cleared
    // if sync source feedback fails.
    const HostAndPort source = syncSourceReader.getHost();
    syncSourceReader.resetConnection();
    // no more references to oplog reader from here on.

    // If this status is not OK after the fetcher returns from wait(),
    // proceed to execute rollback
    Status remoteOplogStartStatus = Status::OK();

    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      &remoteOplogStartStatus);

    auto cmdObj = BSON("find" << nsToCollectionSubstring(rsOplogName) << "filter"
                              << BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp()))
                              << "tailable" << true << "oplogReplay" << true << "awaitData" << true
                              << "maxTimeMS" << int(fetcherMaxTimeMS.count()));
    Fetcher fetcher(taskExecutor,
                    source,
                    nsToDatabase(rsOplogName),
                    cmdObj,
                    fetcherCallback,
                    rpc::makeEmptyMetadata());
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();

    // If the background sync is paused after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isPaused()) {
        return;
    }

    // Execute rollback if necessary.
    // Rollback is a synchronous operation that uses the task executor and may not be
    // executed inside the fetcher callback.
    if (!remoteOplogStartStatus.isOK()) {
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection =
            [&connection, &connectionPool, oplogSocketTimeout, source]() -> DBClientBase* {
                if (!connection.get()) {
                    connection.reset(new ConnectionPool::ConnectionPtr(
                        &connectionPool, source, Date_t::now(), oplogSocketTimeout));
                }
                return connection->get();
            };

        log() << "starting rollback: " << remoteOplogStartStatus;
        _rollback(txn, source, getConnection);
        stop();
    }
}
Example #14
void runSyncThread() {
    Client::initThread("rsSync");
    AuthorizationSession::get(cc())->grantInternalAuthorization();
    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();

    // Set initial indexPrefetch setting
    const std::string& prefetch = replCoord->getSettings().rsIndexPrefetch;
    if (!prefetch.empty()) {
        BackgroundSync::IndexPrefetchConfig prefetchConfig = BackgroundSync::PREFETCH_ALL;
        if (prefetch == "none")
            prefetchConfig = BackgroundSync::PREFETCH_NONE;
        else if (prefetch == "_id_only")
            prefetchConfig = BackgroundSync::PREFETCH_ID_ONLY;
        else if (prefetch == "all")
            prefetchConfig = BackgroundSync::PREFETCH_ALL;
        else {
            warning() << "unrecognized indexPrefetch setting " << prefetch << ", defaulting "
                      << "to \"all\"";
        }
        BackgroundSync::get()->setIndexPrefetchConfig(prefetchConfig);
    }

    while (!inShutdown()) {
        // After a reconfig, we may not be in the replica set anymore, so
        // check that we are in the set (and not an arbiter) before
        // trying to sync with other replicas.
        // TODO(spencer): Use a condition variable to await loading a config
        if (replCoord->getMemberState().startup()) {
            warning() << "did not receive a valid config yet";
            sleepsecs(1);
            continue;
        }

        const MemberState memberState = replCoord->getMemberState();

        // An arbiter can never transition to any other state, and doesn't replicate, ever
        if (memberState.arbiter()) {
            break;
        }

        // If we are removed then we don't belong to the set anymore
        if (memberState.removed()) {
            sleepsecs(5);
            continue;
        }

        try {
            if (memberState.primary() && !replCoord->isWaitingForApplierToDrain()) {
                sleepsecs(1);
                continue;
            }

            bool initialSyncRequested = BackgroundSync::get()->getInitialSyncRequestedFlag();
            // Check criteria for doing an initial sync:
            // 1. If the oplog is empty, do an initial sync
            // 2. If minValid has _initialSyncFlag set, do an initial sync
            // 3. If initialSyncRequested is true
            if (getGlobalReplicationCoordinator()->getMyLastOptime().isNull() ||
                getInitialSyncFlag() || initialSyncRequested) {
                syncDoInitialSync();
                continue;  // start from top again in case sync failed.
            }
            if (!replCoord->setFollowerMode(MemberState::RS_RECOVERING)) {
                continue;
            }

            /* we have some data.  continue tailing. */
            SyncTail tail(BackgroundSync::get(), multiSyncApply);
            tail.oplogApplication();
        } catch (const DBException& e) {
            log() << "Received exception while syncing: " << e.toString();
            sleepsecs(10);
        } catch (const std::exception& e) {
            log() << "Received exception while syncing: " << e.what();
            sleepsecs(10);
        }
    }
}
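Each pass of this loop either sleeps and retries (startup, removed, primary-and-not-draining, or an exception), exits permanently (arbiter), kicks off initial sync, or settles into steady-state tailing via SyncTail::oplogApplication().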
Example #15
    void BackgroundSync::produce(OperationContext* txn) {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_lastOpTimeFetched.isNull()) {
                // then we're initial syncing and we're still waiting for this to be set
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }

            // Wait until we've applied the ops we have before we choose a sync target
            while (!_appliedBuffer) {
                _condvar.wait(lock);
            }
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }


        // find a target to sync from the last optime fetched
        OpTime lastOpTimeFetched;
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
            _syncSourceHost = HostAndPort();
        }
        _syncSourceReader.resetConnection();
        _syncSourceReader.connectToSyncSource(txn, lastOpTimeFetched, _replCoord);

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            // no server found
            if (_syncSourceReader.getHost().empty()) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }
            lastOpTimeFetched = _lastOpTimeFetched;
            _syncSourceHost = _syncSourceReader.getHost();
        }

        _syncSourceReader.tailingQueryGTE(rsoplog, lastOpTimeFetched);

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!_syncSourceReader.haveCursor()) {
            return;
        }

        if (_rollbackIfNeeded(txn, _syncSourceReader)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            if (!_syncSourceReader.moreInCurrentBatch()) {
                // Check some things periodically
                // (whenever we run out of items in the
                // current cursor batch)

                int bs = _syncSourceReader.currentBatchMessageSize();
                if( bs > 0 && bs < BatchIsSmallish ) {
                    // on a very low latency network, if we don't wait a little, we'll be 
                    // getting ops to write almost one at a time.  this will both be expensive
                    // for the upstream server as well as potentially defeating our parallel 
                    // application of batches on the secondary.
                    //
                    // the inference here is basically if the batch is really small, we are 
                    // "caught up".
                    //
                    sleepmillis(SleepToAllowBatchingMillis);
                }

                // If we are transitioning to primary state, we need to leave
                // this loop in order to go into bgsync-pause mode.
                if (_replCoord->isWaitingForApplierToDrain() || 
                    _replCoord->getCurrentMemberState().primary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncSource()) {
                    return;
                }

                {
                    //record time for each getmore
                    TimerHolder batchTimer(&getmoreReplStats);
                    
                    // This calls receiveMore() on the oplogreader cursor.
                    // It can wait up to five seconds for more data.
                    _syncSourceReader.more();
                }
                networkByteStats.increment(_syncSourceReader.currentBatchMessageSize());

                if (!_syncSourceReader.moreInCurrentBatch()) {
                    // If there is still no data from upstream, check a few more things
                    // and then loop back for another pass at getting more data
                    {
                        boost::unique_lock<boost::mutex> lock(_mutex);
                        if (_pause) {
                            return;
                        }
                    }

                    _syncSourceReader.tailCheck();
                    if( !_syncSourceReader.haveCursor() ) {
                        LOG(1) << "replSet end syncTail pass" << rsLog;
                        return;
                    }

                    continue;
                }
            }

            // At this point, we are guaranteed to have at least one thing to read out
            // of the oplogreader cursor.
            BSONObj o = _syncSourceReader.nextSafe().getOwned();
            opsReadStats.increment();

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
            }
            // the blocking queue will wait (forever) until there's room for us to push
            _buffer.push(o);
            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastFetchedHash = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
                LOG(3) << "replSet lastOpTimeFetched: "
                       << _lastOpTimeFetched.toStringPretty() << rsLog;
            }
        }
    }
Example #16
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();
        if (!_init(txn.get())) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    Seconds balanceRoundInterval(kBalanceRoundDefaultInterval);

    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();

        BalanceRoundDetails roundDetails;

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping(txn.get(), false);

            MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) {
                const BSONObj& data = scopedBalancerRoundInterval.getData();
                balanceRoundInterval = Seconds(data["sleepSecs"].numberInt());
            }

            // Use fresh shard state and balancer settings
            Grid::get(txn.get())->shardRegistry()->reload(txn.get());

            auto balancerConfig = Grid::get(txn.get())->getBalancerConfiguration();
            Status refreshStatus = balancerConfig->refreshAndCheck(txn.get());
            if (!refreshStatus.isOK()) {
                warning() << "Skipping balancing round" << causedBy(refreshStatus);
                sleepFor(balanceRoundInterval);
                continue;
            }

            // now make sure we should even be running
            if (!balancerConfig->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(txn.get(), true);

                sleepFor(balanceRoundInterval);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            {
                auto scopedDistLock = grid.catalogManager(txn.get())
                                          ->distLock(txn.get(),
                                                     "balancer",
                                                     "doing balance round",
                                                     DistLockManager::kSingleLockAttemptTimeout);

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(txn.get(), true);

                    sleepFor(balanceRoundInterval);  // no need to wake up soon
                    continue;
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << balancerConfig->waitForDelete()
                       << ", secondaryThrottle: "
                       << balancerConfig->getSecondaryThrottle().toBSON();

                OCCASIONALLY warnOnMultiVersion(
                    uassertStatusOK(_clusterStats->getStats(txn.get())));

                Status status = _enforceTagRanges(txn.get());
                if (!status.isOK()) {
                    warning() << "Failed to enforce tag ranges" << causedBy(status);
                } else {
                    LOG(1) << "Done enforcing tag range boundaries.";
                }

                const auto candidateChunks = uassertStatusOK(
                    _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime = _moveChunks(txn.get(),
                                                    candidateChunks,
                                                    balancerConfig->getSecondaryThrottle(),
                                                    balancerConfig->waitForDelete());

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    grid.catalogManager(txn.get())
                        ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(txn.get(), true);

            sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            grid.catalogManager(txn.get())
                ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            sleepFor(balanceRoundInterval);
        }
    }
}
Example #17
void BackgroundSync::_produce(OperationContext* txn) {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary() ||
            inShutdownStrict()) {
            return;
        }
    }

    while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
        sleepmillis(0);
    }


    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
    }
    SyncSourceResolverResponse syncSourceResp =
        _syncSourceResolver.findSyncSource(txn, lastOpTimeFetched);

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        setMinValid(txn, {lastOpTimeFetched, syncSourceResp.earliestOpTimeSeen});
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode.";
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        _replCoord->signalUpstreamUpdater();
    }

    const auto isV1ElectionProtocol = _replCoord->isV1ElectionProtocol();
    // Under protocol version 1, make the awaitData timeout (maxTimeMS) dependent on the election
    // timeout. This enables the sync source to communicate liveness of the primary to secondaries.
    // Under protocol version 0, use a default timeout of 2 seconds for awaitData.
    const Milliseconds fetcherMaxTimeMS(
        isV1ElectionProtocol ? _replCoord->getConfig().getElectionTimeoutPeriod() / 2 : Seconds(2));

    Status fetcherReturnStatus = Status::OK();
    auto fetcherCallback = stdx::bind(&BackgroundSync::_fetcherCallback,
                                      this,
                                      stdx::placeholders::_1,
                                      stdx::placeholders::_3,
                                      stdx::cref(source),
                                      lastOpTimeFetched,
                                      lastHashFetched,
                                      fetcherMaxTimeMS,
                                      &fetcherReturnStatus);


    BSONObjBuilder cmdBob;
    cmdBob.append("find", nsToCollectionSubstring(rsOplogName));
    cmdBob.append("filter", BSON("ts" << BSON("$gte" << lastOpTimeFetched.getTimestamp())));
    cmdBob.append("tailable", true);
    cmdBob.append("oplogReplay", true);
    cmdBob.append("awaitData", true);
    cmdBob.append("maxTimeMS", durationCount<Milliseconds>(Minutes(1)));  // 1 min initial find.

    BSONObjBuilder metadataBob;
    if (isV1ElectionProtocol) {
        cmdBob.append("term", _replCoord->getTerm());
        metadataBob.append(rpc::kReplSetMetadataFieldName, 1);
    }

    auto dbName = nsToDatabase(rsOplogName);
    auto cmdObj = cmdBob.obj();
    auto metadataObj = metadataBob.obj();
    Fetcher fetcher(&_threadPoolTaskExecutor,
                    source,
                    dbName,
                    cmdObj,
                    fetcherCallback,
                    metadataObj,
                    _replCoord->getConfig().getElectionTimeoutPeriod());

    LOG(1) << "scheduling fetcher to read remote oplog on " << source << " starting at "
           << cmdObj["filter"];
    auto scheduleStatus = fetcher.schedule();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }
    fetcher.wait();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << fetcherReturnStatus.toString();
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), oplogSocketTimeout));
            }
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << fetcherReturnStatus;

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        BatchBoundaries boundaries = getMinValid(txn);
        if (!boundaries.start.isNull() || boundaries.end > lastApplied) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream()
                                      << "need to rollback, but in inconsistent state. "
                                      << "minvalid: " << boundaries.end.toString()
                                      << " > our last optime: " << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher error querying oplog: " << fetcherReturnStatus.toString();
    }
}
Example #18
    void Balancer::run() {

        // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely
        while ( ! inShutdown() ) {

            if ( ! _init() ) {
                log() << "will retry to initialize balancer in one minute" << endl;
                sleepsecs( 60 );
                continue;
            }

            break;
        }

        int sleepTime = 30;

        // getConnectionString and the dist lock constructor do not throw, which is
        // what we expect on the balancer thread
        ConnectionString config = configServer.getConnectionString();
        DistributedLock balanceLock( config , "balancer" );

        while ( ! inShutdown() ) {

            try {

                scoped_ptr<ScopedDbConnection> connPtr(
                        ScopedDbConnection::getInternalScopedDbConnection( config.toString() ) );
                ScopedDbConnection& conn = *connPtr;

                // ping has to be first so we keep things in the config server in sync
                _ping( conn.conn() );

                // use fresh shard state
                Shard::reloadShardInfo();

                // refresh chunk size (even though another balancer might be active)
                Chunk::refreshChunkSize();

                BSONObj balancerConfig;
                // now make sure we should even be running
                if ( ! grid.shouldBalance( "", &balancerConfig ) ) {
                    LOG(1) << "skipping balancing round because balancing is disabled" << endl;

                    // Ping again so scripts can determine if we're active without waiting
                    _ping( conn.conn(), true );

                    conn.done();
                    
                    sleepsecs( sleepTime );
                    continue;
                }

                sleepTime = balancerConfig["_nosleep"].trueValue() ? 30 : 6;
                
                uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

                {
                    dist_lock_try lk( &balanceLock , "doing balance round" );
                    if ( ! lk.got() ) {
                        LOG(1) << "skipping balancing round because another balancer is active" << endl;

                        // Ping again so scripts can determine if we're active without waiting
                        _ping( conn.conn(), true );

                        conn.done();
                        
                        sleepsecs( sleepTime ); // no need to wake up soon
                        continue;
                    }
                    
                    LOG(1) << "*** start balancing round" << endl;

                    vector<CandidateChunkPtr> candidateChunks;
                    _doBalanceRound( conn.conn() , &candidateChunks );
                    if ( candidateChunks.size() == 0 ) {
                        LOG(1) << "no need to move any chunk" << endl;
                        _balancedLastTime = 0;
                    }
                    else {
                        _balancedLastTime = _moveChunks( &candidateChunks, balancerConfig["_secondaryThrottle"].trueValue() );
                    }
                    
                    LOG(1) << "*** end of balancing round" << endl;
                }

                // Ping again so scripts can determine if we're active without waiting
                _ping( conn.conn(), true );
                
                conn.done();

                sleepsecs( _balancedLastTime ? sleepTime / 6 : sleepTime );
            }
            catch ( std::exception& e ) {
                log() << "caught exception while doing balance: " << e.what() << endl;

                // Just to match the opening statement if in log level 1
                LOG(1) << "*** End of balancing round" << endl;

                sleepsecs( sleepTime ); // sleep a fair amount b/c of error
                continue;
            }
        }

    }
Example #19
    void BackgroundSync::produce() {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        OplogReader r(false /* doHandshake */);

        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }

            r.tailingQueryGTE(rsoplog, _lastOpTimeFetched);
        }

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return;
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }

        uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

        if (isRollbackRequired(r)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            while (!inShutdown()) {

                if (!r.moreInCurrentBatch()) {
                    int bs = r.currentBatchMessageSize();
                    if( bs > 0 && bs < BatchIsSmallish ) {
                        // on a very low latency network, if we don't wait a little, we'll be 
                        // getting ops to write almost one at a time.  this will both be expensive
                        // for the upstream server as well as potentially defeating our parallel
                        // application of batches on the secondary.
                        //
                        // the inference here is basically if the batch is really small, we are 
                        // "caught up".
                        //
                        dassert( !Lock::isLocked() );
                        sleepmillis(SleepToAllowBatchingMillis);
                    }
  
                    if (theReplSet->gotForceSync()) {
                        return;
                    }

                    if (isAssumingPrimary() || theReplSet->isPrimary()) {
                        return;
                    }

                    // re-evaluate quality of sync target
                    if (shouldChangeSyncTarget()) {
                        return;
                    }
                    //record time for each getmore
                    {
                        TimerHolder batchTimer(&getmoreReplStats);
                        r.more();
                    }
                    // record network bytes received for this batch
                    networkByteStats.increment(r.currentBatchMessageSize());

                }

                if (!r.more())
                    break;

                BSONObj o = r.nextSafe().getOwned();
                opsReadStats.increment();

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    _appliedBuffer = false;
                }

                OCCASIONALLY {
                    LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
                }
                // the blocking queue will wait (forever) until there's room for us to push
                _buffer.push(o);
                bufferCountGauge.increment();
                bufferSizeGauge.increment(getSize(o));

                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    _lastH = o["h"].numberLong();
                    _lastOpTimeFetched = o["ts"]._opTime();
                }
            } // end while

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                    return;
                }
            }


            r.tailCheck();
            if( !r.haveCursor() ) {
                LOG(1) << "replSet end syncTail pass" << rsLog;
                return;
            }

            // looping back is ok because this is a tailable cursor
        }
    }
Example #20
    void BackgroundSync::getOplogReader(OplogReader& r) {
        const Member *target = NULL, *stale = NULL;
        BSONObj oldest;

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_lastOpTimeFetched.isNull()) {
                // then we're initial syncing and we're still waiting for this to be set
                _currentSyncTarget = NULL;
                return;
            }

            // Wait until we've applied the ops we have before we choose a sync target
            while (!_appliedBuffer) {
                _condvar.wait(lock);
            }
        }

        while (MONGO_FAIL_POINT(rsBgSyncProduce)) {
            sleepmillis(0);
        }

        verify(r.conn() == NULL);

        while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
            string current = target->fullName();

            if (!r.connect(current)) {
                LOG(2) << "replSet can't connect to " << current << " to read operations" << rsLog;
                r.resetConnection();
                theReplSet->veto(current);
                sleepsecs(1);
                continue;
            }

            if (isStale(r, oldest)) {
                r.resetConnection();
                theReplSet->veto(current, 600);
                stale = target;
                continue;
            }

            // if we made it here, the target is up and not stale
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _currentSyncTarget = target;
            }

            boost::unique_lock<boost::mutex> oplogLockSSF(theReplSet->syncSourceFeedback.oplock);
            theReplSet->syncSourceFeedback.connect(target);

            return;
        }

        // the only viable sync target was stale
        if (stale) {
            theReplSet->goStale(stale, oldest);
            sleepsecs(120);
        }

        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = NULL;
        }
    }
Example #21
 MONGO_FAIL_POINT_BLOCK(rsDelayHeartbeatResponse, delay) {
     const BSONObj& data = delay.getData();
     sleepsecs(data["delay"].numberInt());
 }
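This block fires only while the rsDelayHeartbeatResponse fail point is enabled; a test would typically switch it on via the configureFailPoint command with a data document such as { delay: <seconds> }, which is what the data["delay"] lookup above reads.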
Example #22
    /* we reuse our existing objects so that we can keep our existing connection
       and cursor in effect.
    */
    void ReplSource::loadAll(SourceVector &v) {
        Client::Context ctx("local.sources");
        SourceVector old = v;
        v.clear();

        if ( !cmdLine.source.empty() ) {
            // --source <host> specified.
            // check that no items are in sources other than that
            // add if missing
            shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
            int n = 0;
            while ( c->ok() ) {
                n++;
                ReplSource tmp(c->current());
                if ( tmp.hostName != cmdLine.source ) {
                    log() << "repl: --source " << cmdLine.source << " != " << tmp.hostName << " from local.sources collection" << endl;
                    log() << "repl: for instructions on changing this slave's source, see:" << endl;
                    log() << "http://dochub.mongodb.org/core/masterslave" << endl;
                    log() << "repl: terminating mongod after 30 seconds" << endl;
                    sleepsecs(30);
                    dbexit( EXIT_REPLICATION_ERROR );
                }
                if ( tmp.only != cmdLine.only ) {
                    log() << "--only " << cmdLine.only << " != " << tmp.only << " from local.sources collection" << endl;
                    log() << "terminating after 30 seconds" << endl;
                    sleepsecs(30);
                    dbexit( EXIT_REPLICATION_ERROR );
                }
                c->advance();
            }
            uassert( 10002 ,  "local.sources collection corrupt?", n<2 );
            if ( n == 0 ) {
                // source missing.  add.
                ReplSource s;
                s.hostName = cmdLine.source;
                s.only = cmdLine.only;
                s.save();
            }
        }
        else {
            try {
                massert( 10384 , "--only requires use of --source", cmdLine.only.empty());
            }
            catch ( ... ) {
                dbexit( EXIT_BADOPTIONS );
            }
        }

        shared_ptr<Cursor> c = findTableScan("local.sources", BSONObj());
        while ( c->ok() ) {
            ReplSource tmp(c->current());
            if ( tmp.syncedTo.isNull() ) {
                DBDirectClient dbClient;
                if ( dbClient.exists( "local.oplog.$main" ) ) {
                    BSONObj op = dbClient.findOne( "local.oplog.$main", QUERY( "op" << NE << "n" ).sort( BSON( "$natural" << -1 ) ) );
                    if ( !op.isEmpty() ) {
                        tmp.syncedTo = op[ "ts" ].date();
                    }
                }
            }
            addSourceToList(v, tmp, old);
            c->advance();
        }
    }
Example #23
    /* tail an oplog.  ok to return, will be re-called. */
    void SyncTail::oplogApplication() {
        while( 1 ) {
            OpQueue ops;

            verify( !Lock::isLocked() );

            Timer batchTimer;
            int lastTimeChecked = 0;

            do {
                if (theReplSet->isPrimary()) {
                    massert(16620, "there are ops to sync, but I'm primary", ops.empty());
                    return;
                }

                int now = batchTimer.seconds();

                // apply replication batch limits
                if (!ops.empty()) {
                    if (now > replBatchLimitSeconds)
                        break;
                    if (ops.getDeque().size() > replBatchLimitOperations)
                        break;
                }
                // occasionally check some things
                // (always checked in the first iteration of this do-while loop, because
                // ops is empty)
                if (ops.empty() || now > lastTimeChecked) {
                    {
                        boost::unique_lock<boost::mutex> lock(theReplSet->initialSyncMutex);
                        if (theReplSet->initialSyncRequested) {
                            // got a resync command
                            return;
                        }
                    }
                    lastTimeChecked = now;
                    // can we become secondary?
                    // we have to check this before calling mgr, as we must be a secondary to
                    // become primary
                    if (!theReplSet->isSecondary()) {
                        OpTime minvalid;

                        OperationContextImpl txn;
                        theReplSet->tryToGoLiveAsASecondary(&txn, minvalid);
                    }

                    // normally msgCheckNewState gets called periodically, but in a single node
                    // replset there are no heartbeat threads, so we do it here to be sure.  this is
                    // relevant if the singleton member has done a stepDown() and needs to come back
                    // up.
                    if (theReplSet->config().members.size() == 1 &&
                        theReplSet->myConfig().potentiallyHot()) {
                        Manager* mgr = theReplSet->mgr;
                        // When would mgr be null?  During replsettest'ing, in which case we should
                        // fall through and actually apply ops as if we were a real secondary.
                        if (mgr) { 
                            mgr->send(stdx::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                            sleepsecs(1);
                            // There should never be ops to sync in a 1-member set, anyway
                            return;
                        }
                    }
                }

                const int slaveDelaySecs = theReplSet->myConfig().slaveDelay;
                if (!ops.empty() && slaveDelaySecs > 0) {
                    const BSONObj& lastOp = ops.getDeque().back();
                    const unsigned int opTimestampSecs = lastOp["ts"]._opTime().getSecs();

                    // Stop the batch as the lastOp is too new to be applied. If we continue
                    // on, we can get ops that are way ahead of the delay and this will
                    // make this thread sleep longer when handleSlaveDelay is called
                    // and apply ops much sooner than we like.
                    if (opTimestampSecs > static_cast<unsigned int>(time(0) - slaveDelaySecs)) {
                        break;
                    }
                }
                // keep fetching more ops as long as we haven't filled up a full batch yet
            } while (!tryPopAndWaitForMore(&ops) && // tryPopAndWaitForMore returns true
                                                    // when we need to end a batch early
                     (ops.getSize() < replBatchLimitBytes));

            // For pausing replication in tests
            while (MONGO_FAIL_POINT(rsSyncApplyStop)) {
                sleepmillis(0);
            }

            const BSONObj& lastOp = ops.getDeque().back();
            setOplogVersion(lastOp);
            handleSlaveDelay(lastOp);

            // Set minValid to the last op to be applied in this next batch.
            // This will cause this node to go into RECOVERING state
            // if we should crash and restart before updating the oplog
            theReplSet->setMinValid(lastOp);

            if (BackgroundSync::get()->isAssumingPrimary()) {
                LOG(1) << "about to apply batch up to optime: "
                       << ops.getDeque().back()["ts"]._opTime().toStringPretty();
            }
            
            multiApply(ops.getDeque(), multiSyncApply);

            if (BackgroundSync::get()->isAssumingPrimary()) {
                LOG(1) << "about to update oplog to optime: "
                       << ops.getDeque().back()["ts"]._opTime().toStringPretty();
            }
            
            applyOpsToOplog(&ops.getDeque());

            // If we're just testing (no manager), don't keep looping if we exhausted the bgqueue
            if (!theReplSet->mgr) {
                BSONObj op;
                if (!peek(&op)) {
                    return;
                }
            }
        }
    }
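
The batching do-while above ends a batch on any of three limits: elapsed seconds, operation count, or total bytes (the last checked in the loop condition). A minimal standalone sketch of that gating logic; the limit values below are invented for illustration, not the real replBatchLimit* constants:

    #include <chrono>
    #include <cstddef>
    #include <deque>
    #include <iostream>
    #include <string>

    // Illustrative stand-ins for replBatchLimitSeconds/-Operations/-Bytes;
    // the real values are not shown in this example.
    const int kBatchLimitSeconds = 10;
    const std::size_t kBatchLimitOperations = 5000;
    const std::size_t kBatchLimitBytes = 100 * 1024 * 1024;

    struct Batch {
        std::deque<std::string> ops;
        std::size_t bytes = 0;

        // True once any limit is hit; mirrors the break/loop-condition checks
        // in SyncTail::oplogApplication. An empty batch never times out.
        bool full(std::chrono::steady_clock::time_point start) const {
            if (ops.empty())
                return false;
            auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
                               std::chrono::steady_clock::now() - start).count();
            return elapsed > kBatchLimitSeconds ||
                   ops.size() > kBatchLimitOperations ||
                   bytes >= kBatchLimitBytes;
        }
    };

    int main() {
        Batch batch;
        auto start = std::chrono::steady_clock::now();
        while (!batch.full(start) && batch.ops.size() < 10000) {
            batch.ops.push_back("op");
            batch.bytes += 1024;  // pretend each op is 1KB
        }
        std::cout << batch.ops.size() << " ops, " << batch.bytes << " bytes" << std::endl;
        return 0;
    }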
Example No. 24
void BackgroundSync::_fetcherCallback(const StatusWith<Fetcher::QueryResponse>& result,
                                      BSONObjBuilder* bob,
                                      const HostAndPort& source,
                                      OpTime lastOpTimeFetched,
                                      long long lastFetchedHash,
                                      Status* remoteOplogStartStatus) {
    // if target cut connections between connecting and querying (for
    // example, because it stepped down) we might not have a cursor
    if (!result.isOK()) {
        return;
    }

    if (inShutdown()) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    const auto& queryResponse = result.getValue();
    const auto& documents = queryResponse.documents;
    auto documentBegin = documents.cbegin();
    auto documentEnd = documents.cend();

    // Check start of remote oplog and, if necessary, stop fetcher to execute rollback.
    if (queryResponse.first) {
        auto getNextOperation = [&documentBegin, documentEnd]() -> StatusWith<BSONObj> {
            if (documentBegin == documentEnd) {
                return Status(ErrorCodes::OplogStartMissing, "remote oplog start missing");
            }
            return *(documentBegin++);
        };

        *remoteOplogStartStatus =
            checkRemoteOplogStart(getNextOperation, lastOpTimeFetched, lastFetchedHash);
        if (!remoteOplogStartStatus->isOK()) {
            // Stop fetcher and execute rollback.
            return;
        }

        // If this is the first batch and no rollback is needed, we should have advanced
        // the document iterator.
        invariant(documentBegin != documents.cbegin());
    }

    // process documents
    int currentBatchMessageSize = 0;
    for (auto documentIter = documentBegin; documentIter != documentEnd; ++documentIter) {
        if (inShutdown()) {
            return;
        }

        // If we are transitioning to primary state, we need to leave
        // this loop in order to go into bgsync-pause mode.
        if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
            LOG(1) << "waiting for draining or we are primary, not adding more ops to buffer";
            return;
        }

        // At this point, we are guaranteed to have at least one thing to read out
        // of the fetcher.
        const BSONObj& o = *documentIter;
        currentBatchMessageSize += o.objsize();
        opsReadStats.increment();

        if (MONGO_FAIL_POINT(stepDownWhileDrainingFailPoint)) {
            sleepsecs(20);
        }

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _appliedBuffer = false;
        }

        OCCASIONALLY {
            LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes";
        }

        bufferCountGauge.increment();
        bufferSizeGauge.increment(getSize(o));
        _buffer.push(o);

        {
            stdx::unique_lock<stdx::mutex> lock(_mutex);
            _lastFetchedHash = o["h"].numberLong();
            _lastOpTimeFetched = extractOpTime(o);
            LOG(3) << "lastOpTimeFetched: " << _lastOpTimeFetched;
        }
    }

    // record time for each batch
    getmoreReplStats.recordMillis(queryResponse.elapsedMillis.count());

    networkByteStats.increment(currentBatchMessageSize);

    // Check some things periodically
    // (whenever we run out of items in the
    // current cursor batch)
    if (currentBatchMessageSize > 0 && currentBatchMessageSize < BatchIsSmallish) {
        // on a very low latency network, if we don't wait a little, we'll be
        // getting ops to write almost one at a time.  this will both be expensive
        // for the upstream server as well as potentially defeating our parallel
        // application of batches on the secondary.
        //
        // the inference here is basically if the batch is really small, we are
        // "caught up".
        //
        sleepmillis(SleepToAllowBatchingMillis);
    }

    // If we are transitioning to primary state, we need to leave
    // this loop in order to go into bgsync-pause mode.
    if (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary()) {
        return;
    }

    // re-evaluate quality of sync target
    if (_shouldChangeSyncSource(source)) {
        return;
    }

    // Check if we have been paused.
    if (isPaused()) {
        return;
    }

    // We fill in 'bob' to signal the fetcher to proceed with another getMore.
    invariant(bob);
    bob->append("getMore", queryResponse.cursorId);
    bob->append("collection", queryResponse.nss.coll());
    bob->append("maxTimeMS", int(fetcherMaxTimeMS.count()));
}
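
Both this fetcher callback and the older produce() loop in the next example share a throttling heuristic: a very small non-empty batch implies the node is caught up, so it sleeps briefly instead of issuing near-empty getMores back-to-back. A standalone sketch; the constants stand in for BatchIsSmallish and SleepToAllowBatchingMillis, whose real values are not shown here:

    #include <chrono>
    #include <thread>

    // Illustrative stand-ins for BatchIsSmallish and SleepToAllowBatchingMillis.
    const int kBatchIsSmallishBytes = 40000;
    const int kSleepToAllowBatchingMillis = 5;

    // Called after each batch: a tiny non-empty batch means we are "caught up",
    // so wait a moment and let the upstream accumulate more ops per getMore.
    void throttleIfCaughtUp(int batchBytes) {
        if (batchBytes > 0 && batchBytes < kBatchIsSmallishBytes) {
            std::this_thread::sleep_for(
                std::chrono::milliseconds(kSleepToAllowBatchingMillis));
        }
    }

    int main() {
        throttleIfCaughtUp(120);     // small batch: sleeps ~5ms
        throttleIfCaughtUp(500000);  // big batch: returns immediately
        return 0;
    }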
Example No. 25
    void BackgroundSync::produce() {
        // this oplog reader does not do a handshake because we don't want the server it's syncing
        // from to track how far it has synced
        OplogReader r;
        OpTime lastOpTimeFetched;
        // find a target to sync from the last op time written
        getOplogReader(r);

        // no server found
        {
            boost::unique_lock<boost::mutex> lock(_mutex);

            if (_currentSyncTarget == NULL) {
                lock.unlock();
                sleepsecs(1);
                // if there is no one to sync from
                return;
            }
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        r.tailingQueryGTE(rsoplog, lastOpTimeFetched);

        // if target cut connections between connecting and querying (for
        // example, because it stepped down) we might not have a cursor
        if (!r.haveCursor()) {
            return;
        }

        uassert(1000, "replSet source for syncing doesn't seem to be await capable -- is it an older version of mongodb?", r.awaitCapable() );

        if (isRollbackRequired(r)) {
            stop();
            return;
        }

        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) {
                // Check some things periodically
                // (whenever we run out of items in the
                // current cursor batch)

                int bs = r.currentBatchMessageSize();
                if( bs > 0 && bs < BatchIsSmallish ) {
                    // on a very low latency network, if we don't wait a little, we'll be 
                    // getting ops to write almost one at a time.  this will both be expensive
                    // for the upstream server as well as potentially defeating our parallel 
                    // application of batches on the secondary.
                    //
                    // the inference here is basically if the batch is really small, we are 
                    // "caught up".
                    //
                    dassert( !Lock::isLocked() );
                    sleepmillis(SleepToAllowBatchingMillis);
                }
  
                if (theReplSet->gotForceSync()) {
                    return;
                }
                // If we are transitioning to primary state, we need to leave
                // this loop in order to go into bgsync-pause mode.
                if (isAssumingPrimary() || theReplSet->isPrimary()) {
                    return;
                }

                // re-evaluate quality of sync target
                if (shouldChangeSyncTarget()) {
                    return;
                }


                {
                    //record time for each getmore
                    TimerHolder batchTimer(&getmoreReplStats);
                    
                    // This calls receiveMore() on the oplogreader cursor.
                    // It can wait up to five seconds for more data.
                    r.more();
                }
                networkByteStats.increment(r.currentBatchMessageSize());

                if (!r.moreInCurrentBatch()) {
                    // If there is still no data from upstream, check a few more things
                    // and then loop back for another pass at getting more data
                    {
                        boost::unique_lock<boost::mutex> lock(_mutex);
                        if (_pause || 
                            !_currentSyncTarget || 
                            !_currentSyncTarget->hbinfo().hbstate.readable()) {
                            return;
                        }
                    }

                    r.tailCheck();
                    if( !r.haveCursor() ) {
                        LOG(1) << "replSet end syncTail pass" << rsLog;
                        return;
                    }

                    continue;
                }
            }

            // At this point, we are guaranteed to have at least one thing to read out
            // of the oplogreader cursor.
            BSONObj o = r.nextSafe().getOwned();
            opsReadStats.increment();

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _appliedBuffer = false;
            }

            OCCASIONALLY {
                LOG(2) << "bgsync buffer has " << _buffer.size() << " bytes" << rsLog;
            }
            // the blocking queue will wait (forever) until there's room for us to push
            _buffer.push(o);
            bufferCountGauge.increment();
            bufferSizeGauge.increment(getSize(o));

            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime();
                LOG(3) << "replSet lastOpTimeFetched: "
                       << _lastOpTimeFetched.toStringPretty() << rsLog;
            }
        }
    }
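
The heart of produce() is a producer that pushes fetched ops into a blocking buffer and records, under a mutex, how far it has read. A compact standalone sketch of that shape, with a condition-variable queue standing in for BackgroundSync's _buffer (class and member names are invented):

    #include <condition_variable>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <mutex>
    #include <queue>

    class OpBuffer {
    public:
        explicit OpBuffer(std::size_t maxOps) : _maxOps(maxOps) {}

        // Blocks until there is room ("the blocking queue will wait (forever)
        // until there's room for us to push"), then records the op and the
        // last-fetched marker, as _buffer.push / _lastOpTimeFetched do above.
        void push(std::int64_t opTime) {
            std::unique_lock<std::mutex> lk(_mu);
            _notFull.wait(lk, [&] { return _q.size() < _maxOps; });
            _q.push(opTime);
            _lastOpTimeFetched = opTime;
        }

        // Demo-only: assumes the queue is non-empty.
        std::int64_t pop() {
            std::lock_guard<std::mutex> lk(_mu);
            std::int64_t v = _q.front();
            _q.pop();
            _notFull.notify_one();
            return v;
        }

        std::int64_t lastFetched() {
            std::lock_guard<std::mutex> lk(_mu);
            return _lastOpTimeFetched;
        }

    private:
        std::mutex _mu;
        std::condition_variable _notFull;
        std::queue<std::int64_t> _q;
        std::size_t _maxOps;
        std::int64_t _lastOpTimeFetched = 0;
    };

    int main() {
        OpBuffer buf(128);
        for (std::int64_t ts = 1; ts <= 3; ts++) buf.push(ts);
        for (int i = 0; i < 3; i++) std::cout << buf.pop() << " ";
        std::cout << "last=" << buf.lastFetched() << std::endl;  // 1 2 3 last=3
        return 0;
    }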
Example No. 26
        int runMany() {
            StateMap threads;

            {
                string orig = getParam( "host" );
                bool showPorts = false;
                if ( orig == "" )
                    orig = "localhost";

                if ( orig.find( ":" ) != string::npos || hasParam( "port" ) )
                    showPorts = true;

                StringSplitter ss( orig.c_str() , "," );
                while ( ss.more() ) {
                    string host = ss.next();
                    if ( showPorts && host.find( ":" ) == string::npos) {
                        // port supplied, but not for this host.  use default.
                        if ( hasParam( "port" ) )
                            host += ":" + _params["port"].as<string>();
                        else
                            host += ":27017";
                    }
                    _add( threads , host );
                }
            }

            sleepsecs(1);

            int row = 0;
            bool discover = hasParam( "discover" );
            int maxLockedDbWidth = 0;

            while ( 1 ) {
                sleepsecs( (int)ceil(_statUtil.getSeconds()) );

                // collect data
                vector<Row> rows;
                for ( map<string,shared_ptr<ServerState> >::iterator i=threads.begin(); i!=threads.end(); ++i ) {
                    scoped_lock lk( i->second->lock );

                    if ( i->second->error.size() ) {
                        rows.push_back( Row( i->first , i->second->error ) );
                    }
                    else if ( i->second->prev.isEmpty() || i->second->now.isEmpty() ) {
                        rows.push_back( Row( i->first ) );
                    }
                    else {
                        BSONObj out = _statUtil.doRow( i->second->prev , i->second->now );
                        rows.push_back( Row( i->first , out ) );
                    }

                    if ( discover && ! i->second->now.isEmpty() ) {
                        if ( _discover( threads , i->first , i->second ) )
                            break;
                    }
                }

                // compute some stats
                unsigned longestHost = 0;
                BSONObj biggest;
                for ( unsigned i=0; i<rows.size(); i++ ) {
                    if ( rows[i].host.size() > longestHost )
                        longestHost = rows[i].host.size();
                    if ( rows[i].data.nFields() > biggest.nFields() )
                        biggest = rows[i].data;

                    // adjust width up as longer 'locked db' values appear
                    setMaxLockedDbWidth( &rows[i].data, &maxLockedDbWidth ); 
                }

                {
                    // check for any headers not in biggest

                    // TODO: we put any new headers at end,
                    //       ideally we would interleave

                    set<string> seen;

                    BSONObjBuilder b;

                    {
                        // iterate biggest
                        BSONObjIterator i( biggest );
                        while ( i.more() ) {
                            BSONElement e = i.next();
                            seen.insert( e.fieldName() );
                            b.append( e );
                        }
                    }

                    // now do the rest
                    for ( unsigned j=0; j<rows.size(); j++ ) {
                        BSONObjIterator i( rows[j].data );
                        while ( i.more() ) {
                            BSONElement e = i.next();
                            if ( seen.count( e.fieldName() ) )
                                continue;
                            seen.insert( e.fieldName() );
                            b.append( e );
                        }

                    }

                    biggest = b.obj();

                }

                // display data

                cout << endl;

                //    header
                if ( row++ % 5 == 0 && ! biggest.isEmpty() ) {
                    cout << setw( longestHost ) << "" << "\t";
                    printHeaders( biggest );
                }

                //    rows
                for ( unsigned i=0; i<rows.size(); i++ ) {
                    cout << setw( longestHost ) << rows[i].host << "\t";
                    if ( rows[i].err.size() )
                        cout << rows[i].err << endl;
                    else if ( rows[i].data.isEmpty() )
                        cout << "no data" << endl;
                    else
                        printData( rows[i].data , biggest );
                }

            }

            return 0;
        }
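
The host-list handling at the top of runMany() splits a comma-separated argument and gives hosts an explicit port where needed. A simplified standalone version that always appends the default 27017 when a host lacks a port (the real code only does so when some port was specified on the command line):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Split "h1,h2:123,h3" and give every host an explicit port,
    // mirroring the StringSplitter loop in runMany().
    std::vector<std::string> parseHosts(const std::string& orig,
                                        const std::string& defaultPort = "27017") {
        std::vector<std::string> hosts;
        std::stringstream ss(orig.empty() ? std::string("localhost") : orig);
        std::string host;
        while (std::getline(ss, host, ',')) {
            if (host.find(':') == std::string::npos)
                host += ":" + defaultPort;
            hosts.push_back(host);
        }
        return hosts;
    }

    int main() {
        for (const auto& h : parseHosts("alpha,beta:27018"))
            std::cout << h << std::endl;  // alpha:27017, beta:27018
        return 0;
    }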
Example No. 27
        void run() {
            if ( _token.size() == 0  && _name.size() == 0 ) {
                log(1) << "mms not configured" << endl;
                return;
            }

            if ( _token.size() == 0 ) {
                log() << "no token for mms - not running" << endl;
                return;
            }

            if ( _name.size() == 0 ) {
                log() << "no name for mms - not running" << endl;
                return;
            }

            log() << "mms monitor staring...  token:" << _token << " name:" << _name << " interval: " << _secsToSleep << endl;
            Client::initThread( "mms" );
            Client& c = cc();


            // TODO: using direct client is bad, but easy for now

            while ( ! inShutdown() ) {
                sleepsecs( _secsToSleep );

                try {
                    stringstream url;
                    url << _baseurl << "?"
                        << "token=" << _token << "&"
                        << "name=" << _name << "&"
                        << "ts=" << time(0)
                        ;

                    BSONObjBuilder bb;
                    // duplicated so the post has everything
                    bb.append( "token" , _token );
                    bb.append( "name" , _name );
                    bb.appendDate( "ts" , jsTime()  );

                    // any commands
                    _add( bb , "buildinfo" );
                    _add( bb , "serverStatus" );

                    BSONObj postData = bb.obj();

                    log(1) << "mms url: " << url.str() << "\n\t post: " << postData << endl;;

                    HttpClient http;  // renamed from 'c' to avoid shadowing the Client reference above
                    HttpClient::Result r;
                    int rc = http.post( url.str() , postData.jsonString() , &r );
                    log(1) << "\t response code: " << rc << endl;
                    if ( rc != 200 ) {
                        log() << "mms error response code:" << rc << endl;
                        log(1) << "mms error body:" << r.getEntireResponse() << endl;
                    }
                }
                catch ( std::exception& e ) {
                    log() << "mms exception: " << e.what() << endl;
                }
            }

            c.shutdown();
        }
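
The shape of this MMS thread, sleep, build a payload, POST, and swallow any exception so one bad round trip never kills the monitor, is worth isolating. A standalone sketch with a stubbed-out post(); the stub and the example URL are invented for illustration:

    #include <ctime>
    #include <iostream>
    #include <sstream>
    #include <stdexcept>
    #include <string>

    // Stub standing in for HttpClient::post; returns an HTTP status code.
    int post(const std::string& url, const std::string& body) {
        std::cout << "POST " << url << " body=" << body << std::endl;
        return 200;
    }

    void monitorLoop(const std::string& baseurl, const std::string& token,
                     const std::string& name, int rounds) {
        for (int i = 0; i < rounds; i++) {  // the real loop runs until shutdown
            try {
                std::ostringstream url;
                url << baseurl << "?token=" << token << "&name=" << name
                    << "&ts=" << std::time(nullptr);
                int rc = post(url.str(), "{\"serverStatus\": {}}");
                if (rc != 200)
                    std::cerr << "error response code: " << rc << std::endl;
            } catch (const std::exception& e) {
                // never let one failed round kill the monitor thread
                std::cerr << "exception: " << e.what() << std::endl;
            }
        }
    }

    int main() {
        monitorLoop("http://example.invalid/ping", "tok", "node1", 1);
        return 0;
    }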
Example No. 28
void BackgroundSync::_produce(OperationContext* txn) {

    while (MONGO_FAIL_POINT(pauseRsBgSyncProducer)) {
        sleepmillis(0);
    }

    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            // then we're initial syncing and we're still waiting for this to be set
            lock.unlock();
            sleepsecs(1);
            // if there is no one to sync from
            return;
        }

        if (!_replCoord->isCatchingUp() &&
            (_replCoord->isWaitingForApplierToDrain() || _replCoord->getMemberState().primary())) {
            return;
        }

        if (_inShutdown_inlock()) {
            return;
        }
    }

    // find a target to sync from the last optime fetched
    OpTime lastOpTimeFetched;
    HostAndPort source;
    SyncSourceResolverResponse syncSourceResp;
    SyncSourceResolver* syncSourceResolver;
    OpTime minValid;
    if (_replCoord->getMemberState().recovering()) {
        auto minValidSaved = StorageInterface::get(txn)->getMinValid(txn);
        if (minValidSaved > lastOpTimeFetched) {
            minValid = minValidSaved;
        }
    }
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        lastOpTimeFetched = _lastOpTimeFetched;
        _syncSourceHost = HostAndPort();
        _syncSourceResolver = stdx::make_unique<SyncSourceResolver>(
            _replicationCoordinatorExternalState->getTaskExecutor(),
            _replCoord,
            lastOpTimeFetched,
            minValid,
            [&syncSourceResp](const SyncSourceResolverResponse& resp) { syncSourceResp = resp; });
        syncSourceResolver = _syncSourceResolver.get();
    }
    // This may deadlock if called inside the mutex because SyncSourceResolver::startup() calls
    // ReplicationCoordinator::chooseNewSyncSource(). ReplicationCoordinatorImpl's mutex has to be
    // acquired before BackgroundSync's.
    // It is safe to call startup() outside the mutex on this instance of SyncSourceResolver because
    // we do not destroy this instance outside of this function.
    auto status = _syncSourceResolver->startup();
    if (ErrorCodes::CallbackCanceled == status || ErrorCodes::isShutdownError(status.code())) {
        return;
    }
    fassertStatusOK(40349, status);
    syncSourceResolver->join();
    syncSourceResolver = nullptr;
    {
        stdx::unique_lock<stdx::mutex> lock(_mutex);
        _syncSourceResolver.reset();
    }

    if (syncSourceResp.syncSourceStatus == ErrorCodes::OplogStartMissing) {
        // All (accessible) sync sources were too stale.
        if (_replCoord->isCatchingUp()) {
            warning() << "Too stale to catch up.";
            log() << "Our newest OpTime : " << lastOpTimeFetched;
            log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen
                  << " from " << syncSourceResp.getSyncSource();
            sleepsecs(1);
            return;
        }

        error() << "too stale to catch up -- entering maintenance mode";
        log() << "Our newest OpTime : " << lastOpTimeFetched;
        log() << "Earliest OpTime available is " << syncSourceResp.earliestOpTimeSeen;
        log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember";
        auto status = _replCoord->setMaintenanceMode(true);
        if (!status.isOK()) {
            warning() << "Failed to transition into maintenance mode: " << status;
        }
        bool worked = _replCoord->setFollowerMode(MemberState::RS_RECOVERING);
        if (!worked) {
            warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING)
                      << ". Current state: " << _replCoord->getMemberState();
        }
        return;
    } else if (syncSourceResp.isOK() && !syncSourceResp.getSyncSource().empty()) {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _syncSourceHost = syncSourceResp.getSyncSource();
        source = _syncSourceHost;
    } else {
        if (!syncSourceResp.isOK()) {
            log() << "failed to find sync source, received error "
                  << syncSourceResp.syncSourceStatus.getStatus();
        }
        // No sync source found.
        sleepsecs(1);
        return;
    }

    long long lastHashFetched;
    {
        stdx::lock_guard<stdx::mutex> lock(_mutex);
        if (_stopped) {
            return;
        }
        lastOpTimeFetched = _lastOpTimeFetched;
        lastHashFetched = _lastFetchedHash;
        if (!_replCoord->isCatchingUp()) {
            _replCoord->signalUpstreamUpdater();
        }
    }

    // Set the applied point if unset. This is most likely the first time we've established a sync
    // source since stepping down or otherwise clearing the applied point. We need to set this here,
    // before the OplogWriter gets a chance to append to the oplog.
    if (StorageInterface::get(txn)->getAppliedThrough(txn).isNull()) {
        StorageInterface::get(txn)->setAppliedThrough(txn, _replCoord->getMyLastAppliedOpTime());
    }

    // "lastFetched" not used. Already set in _enqueueDocuments.
    Status fetcherReturnStatus = Status::OK();
    DataReplicatorExternalStateBackgroundSync dataReplicatorExternalState(
        _replCoord, _replicationCoordinatorExternalState, this);
    OplogFetcher* oplogFetcher;
    try {
        auto executor = _replicationCoordinatorExternalState->getTaskExecutor();
        auto config = _replCoord->getConfig();
        auto onOplogFetcherShutdownCallbackFn =
            [&fetcherReturnStatus](const Status& status, const OpTimeWithHash& lastFetched) {
                fetcherReturnStatus = status;
            };

        stdx::lock_guard<stdx::mutex> lock(_mutex);
        _oplogFetcher = stdx::make_unique<OplogFetcher>(
            executor,
            OpTimeWithHash(lastHashFetched, lastOpTimeFetched),
            source,
            NamespaceString(rsOplogName),
            config,
            _replicationCoordinatorExternalState->getOplogFetcherMaxFetcherRestarts(),
            &dataReplicatorExternalState,
            stdx::bind(&BackgroundSync::_enqueueDocuments,
                       this,
                       stdx::placeholders::_1,
                       stdx::placeholders::_2,
                       stdx::placeholders::_3),
            onOplogFetcherShutdownCallbackFn);
        oplogFetcher = _oplogFetcher.get();
    } catch (const mongo::DBException& ex) {
        fassertFailedWithStatus(34440, exceptionToStatus());
    }

    LOG(1) << "scheduling fetcher to read remote oplog on " << _syncSourceHost << " starting at "
           << oplogFetcher->getCommandObject_forTest()["filter"];
    auto scheduleStatus = oplogFetcher->startup();
    if (!scheduleStatus.isOK()) {
        warning() << "unable to schedule fetcher to read remote oplog on " << source << ": "
                  << scheduleStatus;
        return;
    }

    oplogFetcher->join();
    LOG(1) << "fetcher stopped reading remote oplog on " << source;

    // If the background sync is stopped after the fetcher is started, we need to
    // re-evaluate our sync source and oplog common point.
    if (isStopped()) {
        return;
    }

    if (fetcherReturnStatus.code() == ErrorCodes::OplogOutOfOrder) {
        // This is bad because it means that our source
        // has not returned oplog entries in ascending ts order, and they need to be.

        warning() << redact(fetcherReturnStatus);
        // Do not blacklist the server here, it will be blacklisted when we try to reuse it,
        // if it can't return a matching oplog start from the last fetch oplog ts field.
        return;
    } else if (fetcherReturnStatus.code() == ErrorCodes::OplogStartMissing ||
               fetcherReturnStatus.code() == ErrorCodes::RemoteOplogStale) {
        if (_replCoord->isCatchingUp()) {
            warning() << "Rollback situation detected in catch-up mode; catch-up mode will end.";
            sleepsecs(1);
            return;
        }

        // Rollback is a synchronous operation that uses the task executor and may not be
        // executed inside the fetcher callback.
        const int messagingPortTags = 0;
        ConnectionPool connectionPool(messagingPortTags);
        std::unique_ptr<ConnectionPool::ConnectionPtr> connection;
        auto getConnection = [&connection, &connectionPool, source]() -> DBClientBase* {
            if (!connection.get()) {
                connection.reset(new ConnectionPool::ConnectionPtr(
                    &connectionPool, source, Date_t::now(), kRollbackOplogSocketTimeout));
            }
            return connection->get();
        };

        {
            stdx::lock_guard<stdx::mutex> lock(_mutex);
            lastOpTimeFetched = _lastOpTimeFetched;
        }

        log() << "Starting rollback due to " << redact(fetcherReturnStatus);

        // Wait till all buffered oplog entries have drained and been applied.
        auto lastApplied = _replCoord->getMyLastAppliedOpTime();
        if (lastApplied != lastOpTimeFetched) {
            log() << "Waiting for all operations from " << lastApplied << " until "
                  << lastOpTimeFetched << " to be applied before starting rollback.";
            while (lastOpTimeFetched > (lastApplied = _replCoord->getMyLastAppliedOpTime())) {
                sleepmillis(10);
                if (isStopped() || inShutdown()) {
                    return;
                }
            }
        }
        // check that we are at minvalid, otherwise we cannot roll back as we may be in an
        // inconsistent state
        const auto minValid = StorageInterface::get(txn)->getMinValid(txn);
        if (lastApplied < minValid) {
            fassertNoTrace(18750,
                           Status(ErrorCodes::UnrecoverableRollbackError,
                                  str::stream() << "need to rollback, but in inconsistent state. "
                                                << "minvalid: "
                                                << minValid.toString()
                                                << " > our last optime: "
                                                << lastApplied.toString()));
        }

        _rollback(txn, source, getConnection);
        stop();
    } else if (fetcherReturnStatus == ErrorCodes::InvalidBSON) {
        Seconds blacklistDuration(60);
        warning() << "Fetcher got invalid BSON while querying oplog. Blacklisting sync source "
                  << source << " for " << blacklistDuration << ".";
        _replCoord->blacklistSyncSource(source, Date_t::now() + blacklistDuration);
    } else if (!fetcherReturnStatus.isOK()) {
        warning() << "Fetcher stopped querying remote oplog with error: "
                  << redact(fetcherReturnStatus);
    }
}
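
Before _rollback() runs, the code above blocks until the applier has drained everything already fetched, polling every 10ms and bailing out on shutdown. A standalone sketch of that wait using atomics in place of the replication coordinator (all names invented):

    #include <atomic>
    #include <chrono>
    #include <cstdint>
    #include <thread>

    std::atomic<std::int64_t> lastApplied{90};
    std::atomic<bool> shuttingDown{false};

    // Poll until everything fetched so far has been applied, mirroring the
    // sleepmillis(10) loop before _rollback(). Returns false on shutdown.
    bool waitUntilApplied(std::int64_t lastFetched) {
        while (lastApplied.load() < lastFetched) {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
            if (shuttingDown.load())
                return false;
        }
        return true;
    }

    int main() {
        std::thread applier([] {
            for (int i = 0; i < 10; i++) {
                std::this_thread::sleep_for(std::chrono::milliseconds(5));
                lastApplied.fetch_add(1);  // applier catches up to 100
            }
        });
        bool drained = waitUntilApplied(100);
        applier.join();
        return drained ? 0 : 1;
    }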
Example No. 29
 bool run(const string& ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
     sleepsecs(100);
     return true;
 }
Example No. 30
    virtual bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl) {
        if( replSetBlind ) {
            if (theReplSet) {
                errmsg = str::stream() << theReplSet->selfFullName() << " is blind";
            }
            return false;
        }

        MONGO_FAIL_POINT_BLOCK(rsDelayHeartbeatResponse, delay) {
            const BSONObj& data = delay.getData();
            sleepsecs(data["delay"].numberInt());
        }

        /* we don't call ReplSetCommand::check() here because heartbeat
           checks many things that are pre-initialization. */
        if( !replSet ) {
            errmsg = "not running with --replSet";
            return false;
        }

        /* we want to keep heartbeat connections open when relinquishing primary.  tag them here. */
        {
            AbstractMessagingPort *mp = cc().port();
            if( mp )
                mp->tag |= ScopedConn::keepOpen;
        }

        if( cmdObj["pv"].Int() != 1 ) {
            errmsg = "incompatible replset protocol version";
            return false;
        }
        {
            string s = string(cmdObj.getStringField("replSetHeartbeat"));
            if (replSettings.ourSetName() != s) {
                errmsg = "repl set names do not match";
                log() << "replSet set names do not match, our cmdline: " << replSettings.replSet
                      << rsLog;
                log() << "replSet s: " << s << rsLog;
                result.append("mismatch", true);
                return false;
            }
        }

        result.append("rs", true);
        if( cmdObj["checkEmpty"].trueValue() ) {
            result.append("hasData", replHasDatabases());
        }
        if( (theReplSet == 0) || (theReplSet->startupStatus == ReplSetImpl::LOADINGCONFIG) ) {
            string from( cmdObj.getStringField("from") );
            if( !from.empty() ) {
                scoped_lock lck( replSettings.discoveredSeeds_mx );
                replSettings.discoveredSeeds.insert(from);
            }
            result.append("hbmsg", "still initializing");
            return true;
        }

        if( theReplSet->name() != cmdObj.getStringField("replSetHeartbeat") ) {
            errmsg = "repl set names do not match (2)";
            result.append("mismatch", true);
            return false;
        }
        result.append("set", theReplSet->name());

        MemberState currentState = theReplSet->state();
        result.append("state", currentState.s);
        if (currentState == MemberState::RS_PRIMARY) {
            result.appendDate("electionTime", theReplSet->getElectionTime().asDate());
        }

        result.append("e", theReplSet->iAmElectable());
        result.append("hbmsg", theReplSet->hbmsg());
        result.append("time", (long long) time(0));
        result.appendDate("opTime", theReplSet->lastOpTimeWritten.asDate());
        const Member *syncTarget = replset::BackgroundSync::get()->getSyncTarget();
        if (syncTarget) {
            result.append("syncingTo", syncTarget->fullName());
        }

        int v = theReplSet->config().version;
        result.append("v", v);
        if( v > cmdObj["v"].Int() )
            result << "config" << theReplSet->config().asBson();

        Member* from = NULL;
        if (cmdObj.hasField("fromId")) {
            if (v == cmdObj["v"].Int()) {
                from = theReplSet->getMutableMember(cmdObj["fromId"].Int());
            }
        }
        if (!from) {
            from = theReplSet->findByName(cmdObj.getStringField("from"));
            if (!from) {
                return true;
            }
        }

        // if we thought that this node is down, let it know
        if (!from->hbinfo().up()) {
            result.append("stateDisagreement", true);
        }

        // note that we got a heartbeat from this node
        theReplSet->mgr->send(boost::bind(&ReplSet::msgUpdateHBRecv,
                                          theReplSet, from->hbinfo().id(), time(0)));


        return true;
    }
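
One detail in this heartbeat responder worth isolating: when the local config version is newer than the sender's, the config is piggybacked onto the reply, which is how reconfigurations propagate across the set. A standalone sketch of that version check (the structs are invented stand-ins for the BSON result):

    #include <iostream>

    struct HeartbeatResponse {
        int version = 0;
        bool hasConfig = false;  // stands in for the "config" field in the reply
    };

    // If our config is newer than the sender's, attach it to the response,
    // as in: if( v > cmdObj["v"].Int() ) result << "config" << ...;
    HeartbeatResponse respond(int myConfigVersion, int senderConfigVersion) {
        HeartbeatResponse r;
        r.version = myConfigVersion;
        if (myConfigVersion > senderConfigVersion)
            r.hasConfig = true;
        return r;
    }

    int main() {
        HeartbeatResponse r = respond(7, 5);
        std::cout << "v=" << r.version << " pushConfig=" << r.hasConfig << std::endl;
        return 0;
    }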