// Writes the outcome of a balancing round to config.actionlog, creating the capped
// collection on first use.
static void _reportRound(ActionLogType& actionLog) {
    try {
        ScopedDbConnection conn(configServer.getConnectionString(), 30);

        // send a copy of the message to the log in case it doesn't reach config.actionlog
        actionLog.setTime(jsTime());

        LOG(1) << "about to log balancer result: " << actionLog;

        // The following method is not thread safe. However, there is only one balancer
        // thread per mongos process. The create collection is a no-op when the collection
        // already exists.
        static bool createActionlog = false;
        if (!createActionlog) {
            try {
                static const int actionLogSizeBytes = 1024 * 1024 * 2;
                conn->createCollection(ActionLogType::ConfigNS, actionLogSizeBytes, true);
            } catch (const DBException& ex) {
                LOG(1) << "config.actionlog could not be created, another mongos process "
                       << "may have done so" << causedBy(ex);
            }
            createActionlog = true;
        }

        Status result = clusterInsert(
            ActionLogType::ConfigNS, actionLog.toBSON(), WriteConcernOptions::AllConfigs, NULL);

        if (!result.isOK()) {
            log() << "Error encountered while logging action from balancer " << result.reason();
        }

        conn.done();
    } catch (const DBException& ex) {
        // if we got here, it means the config change is only in the log;
        // the change didn't make it to config.actionlog
        warning() << "could not log balancer result" << causedBy(ex);
    }
}
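// Main balancer loop: initializes the balancer state, then on each round pings the
// config server, checks whether balancing is enabled, takes the distributed "balancer"
// lock, selects candidate chunks and moves them, and records the round's outcome via
// the catalog manager's action log.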
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the
    // balancer thread prematurely.
    while (!inShutdown()) {
        if (!_init()) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    const int sleepTime = 10;

    while (!inShutdown()) {
        Timer balanceRoundTimer;
        ActionLogType actionLog;

        actionLog.setServer(getHostNameCached());
        actionLog.setWhat("balancer.round");

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping();

            BSONObj balancerResult;

            // use fresh shard state
            Shard::reloadShardInfo();

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize();

            auto balSettingsResult =
                grid.catalogManager()->getGlobalSettings(SettingsType::BalancerDocKey);
            const bool isBalSettingsAbsent =
                balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument;
            if (!balSettingsResult.isOK() && !isBalSettingsAbsent) {
                warning() << balSettingsResult.getStatus();
                return;
            }
            const SettingsType& balancerConfig =
                isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue();

            // now make sure we should even be running
            if ((!isBalSettingsAbsent && !grid.shouldBalance(balancerConfig)) ||
                MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(true);

                sleepsecs(sleepTime);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs());

            {
                auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(
                    "balancer", "doing balance round");

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(true);

                    sleepsecs(sleepTime);  // no need to wake up soon
                    continue;
                }

                const bool waitForDelete =
                    (balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete()
                                                         : false);

                std::unique_ptr<WriteConcernOptions> writeConcern;
                if (balancerConfig.isKeySet()) {  // if balancer doc exists.
                    writeConcern = std::move(balancerConfig.getWriteConcern());
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << waitForDelete << ", secondaryThrottle: "
                       << (writeConcern.get() ? writeConcern->toBSON().toString() : "default");

                vector<shared_ptr<MigrateInfo>> candidateChunks;
                _doBalanceRound(&candidateChunks);

                if (candidateChunks.size() == 0) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime =
                        _moveChunks(candidateChunks, writeConcern.get(), waitForDelete);
                }

                actionLog.setDetails(boost::none,
                                     balanceRoundTimer.millis(),
                                     static_cast<int>(candidateChunks.size()),
                                     _balancedLastTime);
                actionLog.setTime(jsTime());

                grid.catalogManager()->logAction(actionLog);

                LOG(1) << "*** end of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(true);

            sleepsecs(_balancedLastTime ? sleepTime / 10 : sleepTime);
        } catch (std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            actionLog.setDetails(string(e.what()), balanceRoundTimer.millis(), 0, 0);
            actionLog.setTime(jsTime());

            grid.catalogManager()->logAction(actionLog);

            // Sleep a fair amount before retrying because of the error
            sleepsecs(sleepTime);

            continue;
        }
    }
}