void Balancer::run() { Client::initThread("Balancer"); // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer // thread prematurely. while (!inShutdown()) { if (!_init()) { log() << "will retry to initialize balancer in one minute"; sleepsecs(60); continue; } break; } const int sleepTime = 10; while (!inShutdown()) { Timer balanceRoundTimer; ActionLogType actionLog; actionLog.setServer(getHostNameCached()); actionLog.setWhat("balancer.round"); try { // ping has to be first so we keep things in the config server in sync _ping(); BSONObj balancerResult; // use fresh shard state Shard::reloadShardInfo(); // refresh chunk size (even though another balancer might be active) Chunk::refreshChunkSize(); auto balSettingsResult = grid.catalogManager()->getGlobalSettings(SettingsType::BalancerDocKey); const bool isBalSettingsAbsent = balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument; if (!balSettingsResult.isOK() && !isBalSettingsAbsent) { warning() << balSettingsResult.getStatus(); return; } const SettingsType& balancerConfig = isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue(); // now make sure we should even be running if ((!isBalSettingsAbsent && !grid.shouldBalance(balancerConfig)) || MONGO_FAIL_POINT(skipBalanceRound)) { LOG(1) << "skipping balancing round because balancing is disabled"; // Ping again so scripts can determine if we're active without waiting _ping(true); sleepsecs(sleepTime); continue; } uassert(13258, "oids broken after resetting!", _checkOIDs()); { auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock( "balancer", "doing balance round"); if (!scopedDistLock.isOK()) { LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus()); // Ping again so scripts can determine if we're active without waiting _ping(true); sleepsecs(sleepTime); // no need to wake up soon continue; } const bool waitForDelete = (balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete() : false); std::unique_ptr<WriteConcernOptions> writeConcern; if (balancerConfig.isKeySet()) { // if balancer doc exists. writeConcern = std::move(balancerConfig.getWriteConcern()); } LOG(1) << "*** start balancing round. " << "waitForDelete: " << waitForDelete << ", secondaryThrottle: " << (writeConcern.get() ? writeConcern->toBSON().toString() : "default"); vector<shared_ptr<MigrateInfo>> candidateChunks; _doBalanceRound(&candidateChunks); if (candidateChunks.size() == 0) { LOG(1) << "no need to move any chunk"; _balancedLastTime = 0; } else { _balancedLastTime = _moveChunks(candidateChunks, writeConcern.get(), waitForDelete); } actionLog.setDetails(boost::none, balanceRoundTimer.millis(), static_cast<int>(candidateChunks.size()), _balancedLastTime); actionLog.setTime(jsTime()); grid.catalogManager()->logAction(actionLog); LOG(1) << "*** end of balancing round"; } // Ping again so scripts can determine if we're active without waiting _ping(true); sleepsecs(_balancedLastTime ? sleepTime / 10 : sleepTime); } catch (std::exception& e) { log() << "caught exception while doing balance: " << e.what(); // Just to match the opening statement if in log level 1 LOG(1) << "*** End of balancing round"; // This round failed, tell the world! actionLog.setDetails(string(e.what()), balanceRoundTimer.millis(), 0, 0); actionLog.setTime(jsTime()); grid.catalogManager()->logAction(actionLog); // Sleep a fair amount before retrying because of the error sleepsecs(sleepTime); continue; } } }
// balancer是background的job void Balancer::run() { // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer // thread prematurely // 检查是否停机 while (!inShutdown()) { // 检查是否初始化 if (!_init()) { log() << "will retry to initialize balancer in one minute" << endl; sleepsecs(60); continue; } break; } int sleepTime = 10; // getConnectioString and dist lock constructor does not throw, which is what we expect on while // on the balancer thread ConnectionString config = configServer.getConnectionString(); // 分布式锁,不知道要锁定什么 DistributedLock balanceLock(config, "balancer"); // 循环检查shutdown while (!inShutdown()) { Timer balanceRoundTimer; ActionLogType actionLog; actionLog.setServer(getHostNameCached()); actionLog.setWhat("balancer.round"); try { ScopedDbConnection conn(config.toString(), 30); // ping has to be first so we keep things in the config server in sync _ping(); BSONObj balancerResult; // use fresh shard state Shard::reloadShardInfo(); // refresh chunk size (even though another balancer might be active) Chunk::refreshChunkSize(); SettingsType balancerConfig; string errMsg; if (!grid.getBalancerSettings(&balancerConfig, &errMsg)) { warning() << errMsg; return; } // now make sure we should even be running if ((balancerConfig.isKeySet() && // balancer config doc exists !grid.shouldBalance(balancerConfig)) || MONGO_FAIL_POINT(skipBalanceRound)) { LOG(1) << "skipping balancing round because balancing is disabled" << endl; // Ping again so scripts can determine if we're active without waiting _ping(true); conn.done(); sleepsecs(sleepTime); continue; } uassert(13258, "oids broken after resetting!", _checkOIDs()); { dist_lock_try lk(&balanceLock, "doing balance round"); if (!lk.got()) { LOG(1) << "skipping balancing round because another balancer is active" << endl; // Ping again so scripts can determine if we're active without waiting _ping(true); conn.done(); sleepsecs(sleepTime); // no need to wake up soon continue; } if (!isConfigServerConsistent()) { conn.done(); warning() << "Skipping balancing round because data inconsistency" << " was detected amongst the config servers." << endl; sleepsecs(sleepTime); continue; } const bool waitForDelete = (balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete() : false); scoped_ptr<WriteConcernOptions> writeConcern; if (balancerConfig.isKeySet()) { // if balancer doc exists. StatusWith<WriteConcernOptions*> extractStatus = balancerConfig.extractWriteConcern(); if (extractStatus.isOK()) { writeConcern.reset(extractStatus.getValue()); } else { warning() << extractStatus.toString(); } } LOG(1) << "*** start balancing round. " << "waitForDelete: " << waitForDelete << ", secondaryThrottle: " << (writeConcern.get() ? writeConcern->toBSON().toString() : "default") << endl; vector<CandidateChunkPtr> candidateChunks; _doBalanceRound(conn.conn(), &candidateChunks); if (candidateChunks.size() == 0) { LOG(1) << "no need to move any chunk" << endl; _balancedLastTime = 0; } else { _balancedLastTime = _moveChunks(&candidateChunks, writeConcern.get(), waitForDelete); } actionLog.setDetails(_buildDetails(false, balanceRoundTimer.millis(), static_cast<int>(candidateChunks.size()), _balancedLastTime, "")); _reportRound(actionLog); LOG(1) << "*** end of balancing round" << endl; } // Ping again so scripts can determine if we're active without waiting _ping(true); conn.done(); sleepsecs(_balancedLastTime ? sleepTime / 10 : sleepTime); } catch (std::exception& e) { log() << "caught exception while doing balance: " << e.what() << endl; // Just to match the opening statement if in log level 1 LOG(1) << "*** End of balancing round" << endl; // This round failed, tell the world! actionLog.setDetails(_buildDetails(true, balanceRoundTimer.millis(), 0, 0, e.what())); _reportRound(actionLog); sleepsecs(sleepTime); // sleep a fair amount b/c of error continue; } } }