Ejemplo n.º 1
0
// Main loop of the balancer thread.
//
// Retries _init() once a minute until it succeeds (or shutdown begins), then runs balancing
// rounds until shutdown. A round pings, reloads shard info and the chunk size, reads the balancer
// settings, takes the "balancer" distributed lock, computes the candidate chunks, moves them and
// records the outcome through logAction(). A round that cannot run is skipped after a sleep. An
// exception derived from std::exception is logged, recorded as a failed round, and the loop
// retries after a sleep.
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob, so leaving this function (by throwing or by returning)
    // ends the balancer thread prematurely.
    while (!inShutdown()) {
        if (_init()) {
            break;
        }

        log() << "will retry to initialize balancer in one minute";
        sleepsecs(60);
    }

    // Seconds to sleep between rounds.
    const int sleepTime = 10;

    while (!inShutdown()) {
        // Both are created per round so the timer and the action log entry cover this round only.
        Timer balanceRoundTimer;
        ActionLogType actionLog;

        actionLog.setServer(getHostNameCached());
        actionLog.setWhat("balancer.round");

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping();

            // use fresh shard state
            Shard::reloadShardInfo();

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize();

            auto balSettingsResult =
                grid.catalogManager()->getGlobalSettings(SettingsType::BalancerDocKey);
            const bool isBalSettingsAbsent =
                balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument;
            if (!balSettingsResult.isOK() && !isBalSettingsAbsent) {
                // Returning from here would end the balancer thread for good on a failure that
                // may well be transient, so skip this round and try again after the usual sleep.
                warning() << balSettingsResult.getStatus();
                sleepsecs(sleepTime);
                continue;
            }

            // A missing settings document is not an error: fall back to default-constructed
            // settings. The conditional yields a temporary whose lifetime the const reference
            // extends to the end of this scope.
            const SettingsType& balancerConfig =
                isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue();

            // now make sure we should even be running
            if ((!isBalSettingsAbsent && !grid.shouldBalance(balancerConfig)) ||
                MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(true);

                sleepsecs(sleepTime);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs());

            {
                // The lock handle lives until the end of this inner scope, which spans the whole
                // balancing round.
                auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(
                    "balancer", "doing balance round");

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(true);

                    sleepsecs(sleepTime);  // no need to wake up soon
                    continue;
                }

                const bool waitForDelete =
                    balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete() : false;

                // Stays null when there is no balancer document; logged as "default" below.
                std::unique_ptr<WriteConcernOptions> writeConcern;
                if (balancerConfig.isKeySet()) {  // if balancer doc exists.
                    writeConcern = std::move(balancerConfig.getWriteConcern());
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << waitForDelete << ", secondaryThrottle: "
                       << (writeConcern.get() ? writeConcern->toBSON().toString() : "default");

                vector<shared_ptr<MigrateInfo>> candidateChunks;
                _doBalanceRound(&candidateChunks);

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime =
                        _moveChunks(candidateChunks, writeConcern.get(), waitForDelete);
                }

                actionLog.setDetails(boost::none,
                                     balanceRoundTimer.millis(),
                                     static_cast<int>(candidateChunks.size()),
                                     _balancedLastTime);
                actionLog.setTime(jsTime());

                grid.catalogManager()->logAction(actionLog);

                LOG(1) << "*** end of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(true);

            // Come back sooner (integer division: one second) when the last round reported a
            // non-zero _moveChunks() result; otherwise wait the full interval.
            sleepsecs(_balancedLastTime ? sleepTime / 10 : sleepTime);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            actionLog.setDetails(string(e.what()), balanceRoundTimer.millis(), 0, 0);
            actionLog.setTime(jsTime());

            grid.catalogManager()->logAction(actionLog);

            // Sleep a fair amount before retrying because of the error
            sleepsecs(sleepTime);
        }
    }
}
Ejemplo n.º 2
0
// The balancer is a background job; this is its main loop.
//
// Retries _init() once a minute until it succeeds (or shutdown begins), then runs balancing
// rounds until shutdown. A round opens a config server connection, pings, reloads shard info and
// the chunk size, reads the balancer settings, tries to take the "balancer" distributed lock,
// computes the candidate chunks, moves them and reports the outcome through _reportRound(). A
// round that cannot run is skipped after a sleep. An exception derived from std::exception is
// logged, reported as a failed round, and the loop retries after a sleep.
void Balancer::run() {
    // this is the body of a BackgroundJob so if we leave here (by throwing or by returning) we're
    // basically ending the balancer thread prematurely
    // Keep retrying initialization until it succeeds or the process starts shutting down.
    while (!inShutdown()) {
        if (_init()) {
            break;
        }

        log() << "will retry to initialize balancer in one minute" << endl;
        sleepsecs(60);
    }

    // Seconds to sleep between rounds.
    const int sleepTime = 10;

    // getConnectioString and dist lock constructor does not throw, which is what we expect on while
    // on the balancer thread
    ConnectionString config = configServer.getConnectionString();
    // Distributed lock named "balancer" on the config servers; a round only proceeds when
    // dist_lock_try below obtains it, otherwise another balancer is taken to be active.
    DistributedLock balanceLock(config, "balancer");

    // One iteration per balancing round, until shutdown.
    while (!inShutdown()) {
        // Both are created per round so the timer and the action log entry cover this round only.
        Timer balanceRoundTimer;
        ActionLogType actionLog;

        actionLog.setServer(getHostNameCached());
        actionLog.setWhat("balancer.round");

        try {
            ScopedDbConnection conn(config.toString(), 30);

            // ping has to be first so we keep things in the config server in sync
            _ping();

            // use fresh shard state
            Shard::reloadShardInfo();

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize();

            SettingsType balancerConfig;
            string errMsg;

            if (!grid.getBalancerSettings(&balancerConfig, &errMsg)) {
                // Returning from here would end the balancer thread for good on a failure that
                // may well be transient, so skip this round and try again after the usual sleep.
                // Release the connection first, like every other early exit of this round does.
                warning() << errMsg;

                conn.done();

                sleepsecs(sleepTime);
                continue;
            }

            // now make sure we should even be running
            if ((balancerConfig.isKeySet() &&  // balancer config doc exists
                 !grid.shouldBalance(balancerConfig)) ||
                MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled" << endl;

                // Ping again so scripts can determine if we're active without waiting
                _ping(true);

                conn.done();

                sleepsecs(sleepTime);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs());

            {
                // The lock attempt object lives until the end of this inner scope, which spans
                // the whole balancing round.
                dist_lock_try lk(&balanceLock, "doing balance round");
                if (!lk.got()) {
                    LOG(1) << "skipping balancing round because another balancer is active" << endl;

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(true);

                    conn.done();

                    sleepsecs(sleepTime);  // no need to wake up soon
                    continue;
                }

                if (!isConfigServerConsistent()) {
                    conn.done();
                    warning() << "Skipping balancing round because data inconsistency"
                              << " was detected amongst the config servers." << endl;
                    sleepsecs(sleepTime);
                    continue;
                }

                const bool waitForDelete =
                    balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete() : false;

                // Stays null when there is no balancer document or the write concern cannot be
                // extracted; logged as "default" below.
                scoped_ptr<WriteConcernOptions> writeConcern;
                if (balancerConfig.isKeySet()) {  // if balancer doc exists.
                    StatusWith<WriteConcernOptions*> extractStatus =
                        balancerConfig.extractWriteConcern();
                    if (extractStatus.isOK()) {
                        // writeConcern now holds the extracted pointer.
                        writeConcern.reset(extractStatus.getValue());
                    } else {
                        warning() << extractStatus.toString();
                    }
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << waitForDelete << ", secondaryThrottle: "
                       << (writeConcern.get() ? writeConcern->toBSON().toString() : "default")
                       << endl;

                vector<CandidateChunkPtr> candidateChunks;
                _doBalanceRound(conn.conn(), &candidateChunks);
                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk" << endl;
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime =
                        _moveChunks(&candidateChunks, writeConcern.get(), waitForDelete);
                }

                actionLog.setDetails(_buildDetails(false,
                                                   balanceRoundTimer.millis(),
                                                   static_cast<int>(candidateChunks.size()),
                                                   _balancedLastTime,
                                                   ""));

                _reportRound(actionLog);

                LOG(1) << "*** end of balancing round" << endl;
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(true);

            conn.done();

            // Come back sooner (integer division: one second) when the last round reported a
            // non-zero _moveChunks() result; otherwise wait the full interval.
            sleepsecs(_balancedLastTime ? sleepTime / 10 : sleepTime);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what() << endl;

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round" << endl;

            // This round failed, tell the world!
            actionLog.setDetails(_buildDetails(true, balanceRoundTimer.millis(), 0, 0, e.what()));

            _reportRound(actionLog);

            sleepsecs(sleepTime);  // sleep a fair amount b/c of error
        }
    }
}