/**
 * Inserts the given action log entry into the config action log collection.
 *
 * Until creation has been confirmed once, a "create" command for the capped action log
 * collection is first sent to the "config" database. If that command cannot be run, or reports
 * an error other than NamespaceExists, the problem is reported at log level 1 and the entry is
 * not inserted. A failed insert is logged and otherwise ignored.
 */
void CatalogManagerReplicaSet::logAction(OperationContext* txn, const ActionLogType& actionLog) {
    if (_actionLogCollectionCreated.load() == 0) {
        BSONObj createCollectionCmd =
            BSON("create" << ActionLogType::ConfigNS << "capped" << true << "size"
                          << kActionLogCollectionSize);

        auto cmdResponse = grid.shardRegistry()->runCommandOnConfigWithNotMasterRetries(
            "config", createCollectionCmd);
        if (!cmdResponse.isOK()) {
            LOG(1) << "couldn't create actionlog collection: "
                   << causedBy(cmdResponse.getStatus());
            return;
        }

        Status createStatus = Command::getStatusFromCommandResult(cmdResponse.getValue());

        // An already existing collection is as good as a freshly created one.
        const bool collectionAvailable =
            createStatus.isOK() || createStatus == ErrorCodes::NamespaceExists;
        if (!collectionAvailable) {
            LOG(1) << "couldn't create actionlog collection: " << causedBy(createStatus);
            return;
        }

        // Remember the outcome so later calls skip the create command.
        _actionLogCollectionCreated.store(1);
    }

    Status insertStatus = insert(txn, ActionLogType::ConfigNS, actionLog.toBSON(), NULL);
    if (insertStatus.isOK()) {
        return;
    }

    log() << "error encountered while logging action: " << insertStatus;
}
Exemplo n.º 2
0
/**
 * Stamps the given balancer action log entry with the current time and writes it to the config
 * action log collection.
 *
 * The entry is first emitted at log level 1. A DBException raised anywhere in the process is
 * caught and reported as a warning; a non-OK insert status is logged. Nothing is propagated to
 * the caller.
 */
static void _reportRound(ActionLogType& actionLog) {
    try {
        ScopedDbConnection conn(configServer.getConnectionString(), 30);

        actionLog.setTime(jsTime());

        // Echo the entry to the local log in case it never reaches config.actionlog.
        LOG(1) << "about to log balancer result: " << actionLog;

        // The one-time creation below is not thread safe. However, there is only one balancer
        // thread per mongos process. Creating the collection is a no-op when the collection
        // already exists.
        static bool creationAttempted = false;
        if (!creationAttempted) {
            static const int kActionLogSizeBytes = 1024 * 1024 * 2;

            try {
                conn->createCollection(ActionLogType::ConfigNS, kActionLogSizeBytes, true);
            } catch (const DBException& ex) {
                LOG(1) << "config.actionlog could not be created, another mongos process "
                       << "may have done so" << causedBy(ex);
            }

            // Set regardless of the outcome: creation is attempted only once per process.
            creationAttempted = true;
        }

        Status insertStatus = clusterInsert(
            ActionLogType::ConfigNS, actionLog.toBSON(), WriteConcernOptions::AllConfigs, NULL);
        if (!insertStatus.isOK()) {
            log() << "Error encountered while logging action from balancer "
                  << insertStatus.reason();
        }

        conn.done();
    } catch (const DBException& ex) {
        // Reaching this point means the config change is recorded only in the local log;
        // it did not make it to config.actionlog.
        warning() << "could not log balancer result" << causedBy(ex);
    }
}
Exemplo n.º 3
0
/**
 * Body of the balancer background thread.
 *
 * Retries _init() once a minute until it succeeds or the process is shutting down, then runs
 * balancing rounds until shutdown. Each round:
 *   - pings, then reloads shard info and the chunk size,
 *   - reads the balancer settings document and skips the round if balancing is disabled (or the
 *     skipBalanceRound fail point is active),
 *   - takes the "balancer" distributed lock, skipping the round if it cannot be obtained,
 *   - computes candidate chunks, moves them, and records the outcome through the catalog
 *     manager's action log.
 *
 * Any std::exception raised during a round is logged, recorded in the action log as a failed
 * round, and followed by a sleep before the next attempt. A failure to read the balancer
 * settings is handled in the same spirit (warn, sleep, retry) instead of ending the thread.
 */
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown() && !_init()) {
        log() << "will retry to initialize balancer in one minute";
        sleepsecs(60);
    }

    // Seconds to wait between rounds when nothing was moved or the round was skipped/failed.
    const int sleepTime = 10;

    while (!inShutdown()) {
        Timer balanceRoundTimer;
        ActionLogType actionLog;

        actionLog.setServer(getHostNameCached());
        actionLog.setWhat("balancer.round");

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping();

            // use fresh shard state
            Shard::reloadShardInfo();

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize();

            auto balSettingsResult =
                grid.catalogManager()->getGlobalSettings(SettingsType::BalancerDocKey);
            const bool isBalSettingsAbsent =
                balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument;
            if (!balSettingsResult.isOK() && !isBalSettingsAbsent) {
                // Returning here would permanently stop the balancer thread because of a
                // single failed read, whereas every exception in this loop is retried. Treat
                // this failure the same way: report it, back off, and try again next round.
                warning() << balSettingsResult.getStatus();
                sleepsecs(sleepTime);
                continue;
            }

            // Default-constructed settings stand in when the balancer document is absent.
            const SettingsType& balancerConfig =
                isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue();

            // now make sure we should even be running
            if ((!isBalSettingsAbsent && !grid.shouldBalance(balancerConfig)) ||
                MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(true);

                sleepsecs(sleepTime);
                continue;
            }

            // A failed check throws and is handled by the catch block below.
            uassert(13258, "oids broken after resetting!", _checkOIDs());

            {
                // The lock object lives until the end of this scope, i.e. for the whole round.
                auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(
                    "balancer", "doing balance round");

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(true);

                    sleepsecs(sleepTime);  // no need to wake up soon
                    continue;
                }

                const bool waitForDelete =
                    (balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete()
                                                         : false);

                // Stays null (logged as "default") unless the balancer document exists.
                std::unique_ptr<WriteConcernOptions> writeConcern;
                if (balancerConfig.isKeySet()) {  // if balancer doc exists.
                    writeConcern = std::move(balancerConfig.getWriteConcern());
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << waitForDelete << ", secondaryThrottle: "
                       << (writeConcern.get() ? writeConcern->toBSON().toString() : "default");

                vector<shared_ptr<MigrateInfo>> candidateChunks;
                _doBalanceRound(&candidateChunks);

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime =
                        _moveChunks(candidateChunks, writeConcern.get(), waitForDelete);
                }

                actionLog.setDetails(boost::none,
                                     balanceRoundTimer.millis(),
                                     static_cast<int>(candidateChunks.size()),
                                     _balancedLastTime);
                actionLog.setTime(jsTime());

                grid.catalogManager()->logAction(actionLog);

                LOG(1) << "*** end of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(true);

            // Come back sooner when the last round moved something (integer division: with
            // sleepTime == 10 this is one second).
            sleepsecs(_balancedLastTime ? sleepTime / 10 : sleepTime);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            actionLog.setDetails(string(e.what()), balanceRoundTimer.millis(), 0, 0);
            actionLog.setTime(jsTime());

            grid.catalogManager()->logAction(actionLog);

            // Sleep a fair amount before retrying because of the error
            sleepsecs(sleepTime);

            continue;
        }
    }
}
Exemplo n.º 4
0
// balancer是background的job
void Balancer::run() {
    // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely
	// 检查是否停机
    while (!inShutdown()) {
		 // 检查是否初始化
        if (!_init()) {
            log() << "will retry to initialize balancer in one minute" << endl;
            sleepsecs(60);
            continue;
        }

        break;
    }

    int sleepTime = 10;

    // getConnectioString and dist lock constructor does not throw, which is what we expect on while
    // on the balancer thread
    ConnectionString config = configServer.getConnectionString();
	// 分布式锁,不知道要锁定什么
    DistributedLock balanceLock(config, "balancer");
	// 循环检查shutdown
    while (!inShutdown()) {
        Timer balanceRoundTimer;
        ActionLogType actionLog;

        actionLog.setServer(getHostNameCached());
        actionLog.setWhat("balancer.round");

        try {
            ScopedDbConnection conn(config.toString(), 30);

            // ping has to be first so we keep things in the config server in sync
            _ping();

            BSONObj balancerResult;

            // use fresh shard state
            Shard::reloadShardInfo();

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize();

            SettingsType balancerConfig;
            string errMsg;

            if (!grid.getBalancerSettings(&balancerConfig, &errMsg)) {
                warning() << errMsg;
                return;
            }

            // now make sure we should even be running
            if ((balancerConfig.isKeySet() &&  // balancer config doc exists
                 !grid.shouldBalance(balancerConfig)) ||
                MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled" << endl;

                // Ping again so scripts can determine if we're active without waiting
                _ping(true);

                conn.done();

                sleepsecs(sleepTime);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs());

            {
                dist_lock_try lk(&balanceLock, "doing balance round");
                if (!lk.got()) {
                    LOG(1) << "skipping balancing round because another balancer is active" << endl;

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(true);

                    conn.done();

                    sleepsecs(sleepTime);  // no need to wake up soon
                    continue;
                }

                if (!isConfigServerConsistent()) {
                    conn.done();
                    warning() << "Skipping balancing round because data inconsistency"
                              << " was detected amongst the config servers." << endl;
                    sleepsecs(sleepTime);
                    continue;
                }

                const bool waitForDelete =
                    (balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete()
                                                         : false);

                scoped_ptr<WriteConcernOptions> writeConcern;
                if (balancerConfig.isKeySet()) {  // if balancer doc exists.
                    StatusWith<WriteConcernOptions*> extractStatus =
                        balancerConfig.extractWriteConcern();
                    if (extractStatus.isOK()) {
                        writeConcern.reset(extractStatus.getValue());
                    } else {
                        warning() << extractStatus.toString();
                    }
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << waitForDelete << ", secondaryThrottle: "
                       << (writeConcern.get() ? writeConcern->toBSON().toString() : "default")
                       << endl;

                vector<CandidateChunkPtr> candidateChunks;
                _doBalanceRound(conn.conn(), &candidateChunks);
                if (candidateChunks.size() == 0) {
                    LOG(1) << "no need to move any chunk" << endl;
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime =
                        _moveChunks(&candidateChunks, writeConcern.get(), waitForDelete);
                }

                actionLog.setDetails(_buildDetails(false,
                                                   balanceRoundTimer.millis(),
                                                   static_cast<int>(candidateChunks.size()),
                                                   _balancedLastTime,
                                                   ""));

                _reportRound(actionLog);

                LOG(1) << "*** end of balancing round" << endl;
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(true);

            conn.done();

            sleepsecs(_balancedLastTime ? sleepTime / 10 : sleepTime);
        } catch (std::exception& e) {
            log() << "caught exception while doing balance: " << e.what() << endl;

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round" << endl;

            // This round failed, tell the world!
            actionLog.setDetails(_buildDetails(true, balanceRoundTimer.millis(), 0, 0, e.what()));

            _reportRound(actionLog);

            sleepsecs(sleepTime);  // sleep a fair amount b/c of error
            continue;
        }
    }
}