示例#1
0
/**
 * Stamps 'actionLog' with the current time and inserts it into the collection named by
 * ActionLogType::ConfigNS. The first call in the process also attempts to create that
 * collection.
 *
 * Best effort: a DBException raised anywhere in here is reported through warning() and not
 * rethrown, and a failed insert is only written to the log.
 */
static void _reportRound(ActionLogType& actionLog) {
    // Size handed to createCollection(): 2 MiB.
    const int kActionLogSizeBytes = 2 * 1024 * 1024;

    // Set once collection creation has been attempted, whether or not it succeeded. Access is
    // not synchronized; the original author notes that there is only one balancer thread per
    // mongos process.
    static bool creationAttempted = false;

    try {
        ScopedDbConnection configConn(configServer.getConnectionString(), 30);

        actionLog.setTime(jsTime());

        // Mirror the entry into the process log in case it never reaches config.actionlog.
        LOG(1) << "about to log balancer result: " << actionLog;

        if (!creationAttempted) {
            // Creating a collection that already exists is a no-op (per the original author).
            // NOTE(review): the trailing 'true' presumably requests a capped collection --
            // confirm against createCollection()'s signature.
            try {
                configConn->createCollection(ActionLogType::ConfigNS, kActionLogSizeBytes, true);
            } catch (const DBException& createEx) {
                LOG(1) << "config.actionlog could not be created, another mongos process "
                       << "may have done so" << causedBy(createEx);
            }

            // Flag is raised even after a DBException so the creation is not retried.
            creationAttempted = true;
        }

        const Status insertStatus = clusterInsert(
            ActionLogType::ConfigNS, actionLog.toBSON(), WriteConcernOptions::AllConfigs, NULL);

        if (!insertStatus.isOK()) {
            log() << "Error encountered while logging action from balancer "
                  << insertStatus.reason();
        }

        configConn.done();
    } catch (const DBException& logEx) {
        // Reaching this point means the entry exists only in the process log; it did not make
        // it to config.actionlog.
        warning() << "could not log balancer result" << causedBy(logEx);
    }
}
示例#2
0
/**
 * Body of the balancer background thread.
 *
 * Phase 1: retries _init() once a minute until it succeeds or the process is shutting down.
 *
 * Phase 2: until shutdown, runs balancing rounds. Each round pings, reloads shard info and the
 * chunk size, reads the balancer settings, takes the "balancer" distributed lock, asks
 * _doBalanceRound() for candidate chunks, moves them with _moveChunks() and records the round
 * through the catalog manager's logAction().
 *
 * A round that cannot proceed (balancing disabled, lock not acquired, settings unreadable, or
 * a std::exception thrown) is skipped and retried after 'sleepTime' seconds; none of these
 * paths ends the thread.
 */
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown()) {
        if (!_init()) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    // Seconds to wait between rounds when nothing was moved or the round was skipped/failed.
    const int sleepTime = 10;

    while (!inShutdown()) {
        Timer balanceRoundTimer;
        ActionLogType actionLog;

        actionLog.setServer(getHostNameCached());
        actionLog.setWhat("balancer.round");

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping();

            // use fresh shard state
            Shard::reloadShardInfo();

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize();

            auto balSettingsResult =
                grid.catalogManager()->getGlobalSettings(SettingsType::BalancerDocKey);
            const bool isBalSettingsAbsent =
                balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument;
            if (!balSettingsResult.isOK() && !isBalSettingsAbsent) {
                // Failing to read the settings is handled like every other failed round: wait
                // and try again. Returning from here would leave this loop and thereby end
                // the balancer thread.
                warning() << balSettingsResult.getStatus();
                sleepsecs(sleepTime);
                continue;
            }

            // A missing settings document is not an error; fall back to a default-constructed
            // SettingsType in that case.
            const SettingsType& balancerConfig =
                isBalSettingsAbsent ? SettingsType{} :
                balSettingsResult.getValue();

            // now make sure we should even be running
            if ((!isBalSettingsAbsent && !grid.shouldBalance(balancerConfig)) ||
                    MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(true);

                sleepsecs(sleepTime);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs());

            {
                // The lock object lives until the end of this scope, i.e. for the whole round.
                auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(
                                          "balancer", "doing balance round");

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(true);

                    sleepsecs(sleepTime);  // no need to wake up soon
                    continue;
                }

                const bool waitForDelete =
                    (balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete()
                     : false);

                // Stays null when there is no balancer document; _moveChunks() then receives a
                // null write concern pointer.
                std::unique_ptr<WriteConcernOptions> writeConcern;
                if (balancerConfig.isKeySet()) {  // if balancer doc exists.
                    writeConcern = balancerConfig.getWriteConcern();
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << waitForDelete << ", secondaryThrottle: "
                       << (writeConcern.get() ? writeConcern->toBSON().toString() : "default");

                vector<shared_ptr<MigrateInfo>> candidateChunks;
                _doBalanceRound(&candidateChunks);

                if (candidateChunks.size() == 0) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime =
                        _moveChunks(candidateChunks, writeConcern.get(), waitForDelete);
                }

                actionLog.setDetails(boost::none,
                                     balanceRoundTimer.millis(),
                                     static_cast<int>(candidateChunks.size()),
                                     _balancedLastTime);
                actionLog.setTime(jsTime());

                grid.catalogManager()->logAction(actionLog);

                LOG(1) << "*** end of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(true);

            // Come back sooner (sleepTime / 10 seconds) when the last round moved something.
            sleepsecs(_balancedLastTime ? sleepTime / 10 : sleepTime);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            actionLog.setDetails(string(e.what()), balanceRoundTimer.millis(), 0, 0);
            actionLog.setTime(jsTime());

            grid.catalogManager()->logAction(actionLog);

            // Sleep a fair amount before retrying because of the error
            sleepsecs(sleepTime);

            continue;
        }
    }
}