Esempio n. 1
0
    void Balancer::run() {

        // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely
        while ( ! inShutdown() ) {

            if ( ! _init() ) {
                log() << "will retry to initialize balancer in one minute" << endl;
                sleepsecs( 60 );
                continue;
            }

            break;
        }

        int sleepTime = 10;

        // getConnectioString and dist lock constructor does not throw, which is what we expect on while
        // on the balancer thread
        ConnectionString config = configServer.getConnectionString();
        DistributedLock balanceLock( config , "balancer" );

        while ( ! inShutdown() ) {

            try {

                ScopedDbConnection conn(config.toString(), 30);

                // ping has to be first so we keep things in the config server in sync
                _ping();

                // use fresh shard state
                Shard::reloadShardInfo();

                // refresh chunk size (even though another balancer might be active)
                Chunk::refreshChunkSize();

                SettingsType balancerConfig;
                string errMsg;

                if (!grid.getBalancerSettings(&balancerConfig, &errMsg)) {
                    warning() << errMsg;
                    return ;
                }

                // now make sure we should even be running
                if ((balancerConfig.isKeySet() && // balancer config doc exists
                        !grid.shouldBalance(balancerConfig)) ||
                        MONGO_FAIL_POINT(skipBalanceRound)) {

                    LOG(1) << "skipping balancing round because balancing is disabled" << endl;

                    // Ping again so scripts can determine if we're active without waiting
                    _ping( true );

                    conn.done();

                    sleepsecs( sleepTime );
                    continue;
                }

                uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

                {
                    dist_lock_try lk( &balanceLock , "doing balance round" );
                    if ( ! lk.got() ) {
                        LOG(1) << "skipping balancing round because another balancer is active" << endl;

                        // Ping again so scripts can determine if we're active without waiting
                        _ping( true );

                        conn.done();
                        
                        sleepsecs( sleepTime ); // no need to wake up soon
                        continue;
                    }

                    if ( !isConfigServerConsistent() ) {
                        conn.done();
                        warning() << "Skipping balancing round because data inconsistency"
                                  << " was detected amongst the config servers." << endl;
                        sleepsecs( sleepTime );
                        continue;
                    }

                    const bool waitForDelete = (balancerConfig.isWaitForDeleteSet() ?
                            balancerConfig.getWaitForDelete() : false);

                    scoped_ptr<WriteConcernOptions> writeConcern;
                    if (balancerConfig.isKeySet()) { // if balancer doc exists.
                        StatusWith<WriteConcernOptions*> extractStatus =
                                balancerConfig.extractWriteConcern();
                        if (extractStatus.isOK()) {
                            writeConcern.reset(extractStatus.getValue());
                        }
                        else {
                            warning() << extractStatus.toString();
                        }
                    }

                    LOG(1) << "*** start balancing round. "
                           << "waitForDelete: " << waitForDelete
                           << ", secondaryThrottle: "
                           << (writeConcern.get() ? writeConcern->toBSON().toString() : "default")
                           << endl;

                    vector<CandidateChunkPtr> candidateChunks;
                    _doBalanceRound( conn.conn() , &candidateChunks );
                    if ( candidateChunks.size() == 0 ) {
                        LOG(1) << "no need to move any chunk" << endl;
                        _balancedLastTime = 0;
                    }
                    else {
                        _balancedLastTime = _moveChunks(&candidateChunks,
                                                        writeConcern.get(),
                                                        waitForDelete );
                    }

                    LOG(1) << "*** end of balancing round" << endl;
                }

                // Ping again so scripts can determine if we're active without waiting
                _ping( true );
                
                conn.done();

                sleepsecs( _balancedLastTime ? sleepTime / 10 : sleepTime );
            }
            catch ( std::exception& e ) {
                log() << "caught exception while doing balance: " << e.what() << endl;

                // Just to match the opening statement if in log level 1
                LOG(1) << "*** End of balancing round" << endl;

                sleepsecs( sleepTime ); // sleep a fair amount b/c of error
                continue;
            }
        }

    }
Esempio n. 2
0
    void Balancer::run() {

        // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely
        while ( ! inShutdown() ) {

            if ( ! _init() ) {
                log() << "will retry to initialize balancer in one minute" << endl;
                sleepsecs( 60 );
                continue;
            }

            break;
        }

        int sleepTime = 30;

        // getConnectioString and dist lock constructor does not throw, which is what we expect on while
        // on the balancer thread
        ConnectionString config = configServer.getConnectionString();
        DistributedLock balanceLock( config , "balancer" );

        while ( ! inShutdown() ) {

            try {

                scoped_ptr<ScopedDbConnection> connPtr(
                        ScopedDbConnection::getInternalScopedDbConnection( config.toString() ) );
                ScopedDbConnection& conn = *connPtr;

                // ping has to be first so we keep things in the config server in sync
                _ping( conn.conn() );

                // use fresh shard state
                Shard::reloadShardInfo();

                // refresh chunk size (even though another balancer might be active)
                Chunk::refreshChunkSize();

                BSONObj balancerConfig;
                // now make sure we should even be running
                if ( ! grid.shouldBalance( "", &balancerConfig ) ) {
                    LOG(1) << "skipping balancing round because balancing is disabled" << endl;

                    // Ping again so scripts can determine if we're active without waiting
                    _ping( conn.conn(), true );

                    conn.done();
                    
                    sleepsecs( sleepTime );
                    continue;
                }

                sleepTime = balancerConfig["_nosleep"].trueValue() ? 30 : 6;
                
                uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

                {
                    dist_lock_try lk( &balanceLock , "doing balance round" );
                    if ( ! lk.got() ) {
                        LOG(1) << "skipping balancing round because another balancer is active" << endl;

                        // Ping again so scripts can determine if we're active without waiting
                        _ping( conn.conn(), true );

                        conn.done();
                        
                        sleepsecs( sleepTime ); // no need to wake up soon
                        continue;
                    }
                    
                    LOG(1) << "*** start balancing round" << endl;

                    vector<CandidateChunkPtr> candidateChunks;
                    _doBalanceRound( conn.conn() , &candidateChunks );
                    if ( candidateChunks.size() == 0 ) {
                        LOG(1) << "no need to move any chunk" << endl;
                        _balancedLastTime = 0;
                    }
                    else {
                        _balancedLastTime = _moveChunks( &candidateChunks, balancerConfig["_secondaryThrottle"].trueValue() );
                    }
                    
                    LOG(1) << "*** end of balancing round" << endl;
                }

                // Ping again so scripts can determine if we're active without waiting
                _ping( conn.conn(), true );
                
                conn.done();

                sleepsecs( _balancedLastTime ? sleepTime / 6 : sleepTime );
            }
            catch ( std::exception& e ) {
                log() << "caught exception while doing balance: " << e.what() << endl;

                // Just to match the opening statement if in log level 1
                LOG(1) << "*** End of balancing round" << endl;

                sleepsecs( sleepTime ); // sleep a fair amount b/c of error
                continue;
            }
        }

    }
Esempio n. 3
0
    void Balancer::run() {

        // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely
        while ( ! inShutdown() ) {

            if ( ! _init() ) {
                log() << "will retry to initialize balancer in one minute" << endl;
                sleepsecs( 60 );
                continue;
            }

            break;
        }

        // getConnectioString and dist lock constructor does not throw, which is what we expect on while
        // on the balancer thread
        ConnectionString config = configServer.getConnectionString();
        DistributedLock balanceLock( config , "balancer" );

        while ( ! inShutdown() ) {

            try {
                
                ScopedDbConnection conn( config );
                
                // ping has to be first so we keep things in the config server in sync
                _ping( conn.conn() );

                // now make sure we should even be running
                if ( ! grid.shouldBalance() ) {
                    LOG(1) << "skipping balancing round because balancing is disabled" << endl;
                    conn.done();
                    
                    sleepsecs( 30 );
                    continue;
                }
                
                uassert( 13258 , "oids broken after resetting!" , _checkOIDs() );

                // use fresh shard state
                Shard::reloadShardInfo();

                {
                    dist_lock_try lk( &balanceLock , "doing balance round" );
                    if ( ! lk.got() ) {
                        LOG(1) << "skipping balancing round because another balancer is active" << endl;
                        conn.done();
                        
                        sleepsecs( 30 ); // no need to wake up soon
                        continue;
                    }
                    
                    LOG(1) << "*** start balancing round" << endl;
                    
                    vector<CandidateChunkPtr> candidateChunks;
                    _doBalanceRound( conn.conn() , &candidateChunks );
                    if ( candidateChunks.size() == 0 ) {
                        LOG(1) << "no need to move any chunk" << endl;
                    }
                    else {
                        _balancedLastTime = _moveChunks( &candidateChunks );
                    }
                    
                    LOG(1) << "*** end of balancing round" << endl;
                }
                
                conn.done();
                    
                sleepsecs( _balancedLastTime ? 5 : 10 );
            }
            catch ( std::exception& e ) {
                log() << "caught exception while doing balance: " << e.what() << endl;

                // Just to match the opening statement if in log level 1
                LOG(1) << "*** End of balancing round" << endl;

                sleepsecs( 30 ); // sleep a fair amount b/c of error
                continue;
            }
        }

    }