void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();
        if (!_init(txn.get())) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    Seconds balanceRoundInterval(kBalanceRoundDefaultInterval);

    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();

        BalanceRoundDetails roundDetails;

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping(txn.get(), false);

            MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) {
                const BSONObj& data = scopedBalancerRoundInterval.getData();
                balanceRoundInterval = Seconds(data["sleepSecs"].numberInt());
            }

            // Use fresh shard state and balancer settings
            Grid::get(txn.get())->shardRegistry()->reload(txn.get());

            auto balancerConfig = Grid::get(txn.get())->getBalancerConfiguration();
            Status refreshStatus = balancerConfig->refreshAndCheck(txn.get());
            if (!refreshStatus.isOK()) {
                warning() << "Skipping balancing round" << causedBy(refreshStatus);
                sleepFor(balanceRoundInterval);
                continue;
            }

            // now make sure we should even be running
            if (!balancerConfig->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(txn.get(), true);

                sleepFor(balanceRoundInterval);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            {
                auto scopedDistLock = grid.catalogManager(txn.get())
                                          ->distLock(txn.get(),
                                                     "balancer",
                                                     "doing balance round",
                                                     DistLockManager::kSingleLockAttemptTimeout);

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(txn.get(), true);

                    sleepFor(balanceRoundInterval);  // no need to wake up soon
                    continue;
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << balancerConfig->waitForDelete()
                       << ", secondaryThrottle: "
                       << balancerConfig->getSecondaryThrottle().toBSON();

                OCCASIONALLY warnOnMultiVersion(
                    uassertStatusOK(_clusterStats->getStats(txn.get())));

                Status status = _enforceTagRanges(txn.get());
                if (!status.isOK()) {
                    warning() << "Failed to enforce tag ranges" << causedBy(status);
                } else {
                    LOG(1) << "Done enforcing tag range boundaries.";
                }

                const auto candidateChunks = uassertStatusOK(
                    _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime = _moveChunks(txn.get(),
                                                    candidateChunks,
                                                    balancerConfig->getSecondaryThrottle(),
                                                    balancerConfig->waitForDelete());

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    grid.catalogManager(txn.get())
                        ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(txn.get(), true);

            sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            grid.catalogManager(txn.get())
                ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            sleepFor(balanceRoundInterval);
        }
    }
}
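
// Hedged example (an addition, not part of the original source): how a test
// might activate the balancerRoundIntervalSetting failpoint used above to
// shorten the balancing round interval. The FailPoint registry calls follow
// the historical API of this era; the wrapper function itself is hypothetical.
#include "mongo/util/fail_point_service.h"

void shortenBalancerRoundIntervalForTest() {
    FailPoint* failPoint =
        getGlobalFailPointRegistry()->getFailPoint("balancerRoundIntervalSetting");

    // Make every round read sleepSecs=1 from the failpoint data document
    failPoint->setMode(FailPoint::alwaysOn, 0, BSON("sleepSecs" << 1));

    // ... exercise the balancer ...

    failPoint->setMode(FailPoint::off);
}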
void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();
        if (!_init(txn.get())) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    Seconds balanceRoundInterval(kBalanceRoundDefaultInterval);

    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();

        BalanceRoundDetails roundDetails;

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping(txn.get());

            MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) {
                const BSONObj& data = scopedBalancerRoundInterval.getData();
                balanceRoundInterval = Seconds(data["sleepSecs"].numberInt());
            }

            BSONObj balancerResult;

            // use fresh shard state
            grid.shardRegistry()->reload(txn.get());

            // refresh chunk size (even though another balancer might be active)
            Chunk::refreshChunkSize(txn.get());

            auto balSettingsResult = grid.catalogManager(txn.get())->getGlobalSettings(
                txn.get(), SettingsType::BalancerDocKey);
            const bool isBalSettingsAbsent =
                balSettingsResult.getStatus() == ErrorCodes::NoMatchingDocument;
            if (!balSettingsResult.isOK() && !isBalSettingsAbsent) {
                warning() << balSettingsResult.getStatus();
                return;
            }

            const SettingsType& balancerConfig =
                isBalSettingsAbsent ? SettingsType{} : balSettingsResult.getValue();

            // now make sure we should even be running
            if ((!isBalSettingsAbsent && !Chunk::shouldBalance(balancerConfig)) ||
                MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(txn.get(), true);

                sleepFor(balanceRoundInterval);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            {
                auto scopedDistLock = grid.catalogManager(txn.get())
                                          ->distLock(txn.get(),
                                                     "balancer",
                                                     "doing balance round",
                                                     DistLockManager::kSingleLockAttemptTimeout);

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(txn.get(), true);

                    sleepFor(balanceRoundInterval);  // no need to wake up soon
                    continue;
                }

                const bool waitForDelete =
                    (balancerConfig.isWaitForDeleteSet() ? balancerConfig.getWaitForDelete()
                                                         : false);

                MigrationSecondaryThrottleOptions secondaryThrottle(
                    MigrationSecondaryThrottleOptions::create(
                        MigrationSecondaryThrottleOptions::kDefault));
                if (balancerConfig.isKeySet()) {
                    secondaryThrottle = uassertStatusOK(
                        MigrationSecondaryThrottleOptions::createFromBalancerConfig(
                            balancerConfig.toBSON()));
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << waitForDelete
                       << ", secondaryThrottle: " << secondaryThrottle.toBSON();

                const auto candidateChunks = uassertStatusOK(_getCandidateChunks(txn.get()));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime =
                        _moveChunks(txn.get(), candidateChunks, secondaryThrottle, waitForDelete);

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    grid.catalogManager(txn.get())
                        ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(txn.get(), true);

            sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            grid.catalogManager(txn.get())
                ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            sleepFor(balanceRoundInterval);
        }
    }
}
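
// Hedged illustration (an addition, not part of the original source): the
// approximate shape of the config.settings document that
// getGlobalSettings(SettingsType::BalancerDocKey) reads above. Field names
// follow the historical SettingsType balancer format; the values are examples.
BSONObj exampleBalancerDoc = BSON("_id" << "balancer"
                                        << "stopped" << false
                                        << "_secondaryThrottle" << true
                                        << "_waitForDelete" << false);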
void Balancer::_mainThread() {
    Client::initThread("Balancer");

    log() << "CSRS balancer is starting";

    const Seconds kInitBackoffInterval(60);

    // Take the balancer distributed lock and hold it permanently
    while (!_stopRequested()) {
        auto txn = cc().makeOperationContext();
        auto shardingContext = Grid::get(txn.get());

        auto distLockHandleStatus =
            shardingContext->catalogClient(txn.get())->getDistLockManager()->lockWithSessionID(
                txn.get(), "balancer", "CSRS Balancer", OID::gen());
        if (distLockHandleStatus.isOK()) {
            break;
        }

        warning() << "Balancer distributed lock could not be acquired and will be retried in "
                  << durationCount<Seconds>(kInitBackoffInterval) << " seconds"
                  << causedBy(distLockHandleStatus.getStatus());

        _sleepFor(txn.get(), kInitBackoffInterval);
    }

    log() << "CSRS balancer thread is now running";

    // Main balancer loop
    while (!_stopRequested()) {
        auto txn = cc().makeOperationContext();
        auto shardingContext = Grid::get(txn.get());
        auto balancerConfig = shardingContext->getBalancerConfiguration();

        BalanceRoundDetails roundDetails;

        _beginRound(txn.get());

        try {
            shardingContext->shardRegistry()->reload(txn.get());

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            Status refreshStatus = balancerConfig->refreshAndCheck(txn.get());
            if (!refreshStatus.isOK()) {
                warning() << "Skipping balancing round" << causedBy(refreshStatus);
                _endRound(txn.get(), kBalanceRoundDefaultInterval);
                continue;
            }

            if (!balancerConfig->shouldBalance()) {
                LOG(1) << "Skipping balancing round because balancing is disabled";
                _endRound(txn.get(), kBalanceRoundDefaultInterval);
                continue;
            }

            {
                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << balancerConfig->waitForDelete()
                       << ", secondaryThrottle: "
                       << balancerConfig->getSecondaryThrottle().toBSON();

                OCCASIONALLY warnOnMultiVersion(
                    uassertStatusOK(_clusterStats->getStats(txn.get())));

                Status status = _enforceTagRanges(txn.get());
                if (!status.isOK()) {
                    warning() << "Failed to enforce tag ranges" << causedBy(status);
                } else {
                    LOG(1) << "Done enforcing tag range boundaries.";
                }

                const auto candidateChunks = uassertStatusOK(
                    _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = false;
                } else {
                    _balancedLastTime = _moveChunks(txn.get(), candidateChunks);

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    shardingContext->catalogClient(txn.get())->logAction(
                        txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            _endRound(txn.get(),
                      _balancedLastTime ? kShortBalanceRoundInterval
                                        : kBalanceRoundDefaultInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            shardingContext->catalogClient(txn.get())->logAction(
                txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            _endRound(txn.get(), kBalanceRoundDefaultInterval);
        }
    }

    log() << "CSRS balancer is stopped";
}
void Balancer::_mainThread() {
    Client::initThread("Balancer");

    // TODO (SERVER-24754): Balancer thread should only keep the operation context alive while it
    // is doing balancing
    const auto txn = cc().makeOperationContext();

    log() << "CSRS balancer is starting";

    // TODO (SERVER-23096): Use the actual cluster id
    const OID csrsBalancerLockSessionID{OID()};

    const Seconds kInitBackoffInterval(60);

    // The balancer thread holds the balancer distributed lock for its entire lifetime
    boost::optional<DistLockManager::ScopedDistLock> scopedBalancerLock;

    // Take the balancer distributed lock
    while (!_stopRequested() && !scopedBalancerLock) {
        auto shardingContext = Grid::get(txn.get());
        auto scopedDistLock =
            shardingContext->catalogClient(txn.get())->getDistLockManager()->lockWithSessionID(
                txn.get(),
                "balancer",
                "CSRS balancer starting",
                csrsBalancerLockSessionID,
                DistLockManager::kSingleLockAttemptTimeout);
        if (!scopedDistLock.isOK()) {
            warning() << "Balancer distributed lock could not be acquired and will be retried in "
                         "one minute"
                      << causedBy(scopedDistLock.getStatus());

            _sleepFor(txn.get(), kInitBackoffInterval);
            continue;
        }

        // Initialization and distributed lock acquisition succeeded
        scopedBalancerLock = std::move(scopedDistLock.getValue());
    }

    log() << "CSRS balancer started with instance id " << csrsBalancerLockSessionID;

    // Main balancer loop
    while (!_stopRequested()) {
        auto shardingContext = Grid::get(txn.get());
        auto balancerConfig = shardingContext->getBalancerConfiguration();

        BalanceRoundDetails roundDetails;

        try {
            _beginRound(txn.get());

            shardingContext->shardRegistry()->reload(txn.get());

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            Status refreshStatus = balancerConfig->refreshAndCheck(txn.get());
            if (!refreshStatus.isOK()) {
                warning() << "Skipping balancing round" << causedBy(refreshStatus);
                _endRound(txn.get(), kBalanceRoundDefaultInterval);
                continue;
            }

            if (!balancerConfig->shouldBalance()) {
                LOG(1) << "Skipping balancing round because balancing is disabled";
                _endRound(txn.get(), kBalanceRoundDefaultInterval);
                continue;
            }

            {
                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << balancerConfig->waitForDelete()
                       << ", secondaryThrottle: "
                       << balancerConfig->getSecondaryThrottle().toBSON();

                OCCASIONALLY warnOnMultiVersion(
                    uassertStatusOK(_clusterStats->getStats(txn.get())));

                Status status = _enforceTagRanges(txn.get());
                if (!status.isOK()) {
                    warning() << "Failed to enforce tag ranges" << causedBy(status);
                } else {
                    LOG(1) << "Done enforcing tag range boundaries.";
                }

                const auto candidateChunks = uassertStatusOK(
                    _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime = _moveChunks(txn.get(),
                                                    candidateChunks,
                                                    balancerConfig->getSecondaryThrottle(),
                                                    balancerConfig->waitForDelete());

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    shardingContext->catalogClient(txn.get())->logAction(
                        txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            _endRound(txn.get(),
                      _balancedLastTime ? kShortBalanceRoundInterval
                                        : kBalanceRoundDefaultInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            shardingContext->catalogClient(txn.get())->logAction(
                txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            _endRound(txn.get(), kBalanceRoundDefaultInterval);
        }
    }

    log() << "CSRS balancer is stopped";
}
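
// The _beginRound()/_endRound()/_sleepFor() helpers are referenced above but
// not shown. Below is a minimal, self-contained sketch (an assumption, not the
// actual MongoDB implementation) of equivalent round bookkeeping built on
// standard-library primitives; MongoDB itself would use its stdx wrappers.
#include <chrono>
#include <condition_variable>
#include <mutex>

class RoundTracker {
public:
    // Marks a balancing round as in progress
    void beginRound() {
        std::lock_guard<std::mutex> lock(_mutex);
        _inRound = true;
        _condVar.notify_all();
    }

    // Marks the round as finished, then sleeps up to 'waitTime' unless a stop
    // request interrupts the wait early
    void endRound(std::chrono::seconds waitTime) {
        std::unique_lock<std::mutex> lock(_mutex);
        _inRound = false;
        _condVar.notify_all();
        _condVar.wait_for(lock, waitTime, [this] { return _stopRequested; });
    }

    void requestStop() {
        std::lock_guard<std::mutex> lock(_mutex);
        _stopRequested = true;
        _condVar.notify_all();
    }

    bool stopRequested() {
        std::lock_guard<std::mutex> lock(_mutex);
        return _stopRequested;
    }

private:
    std::mutex _mutex;
    std::condition_variable _condVar;
    bool _inRound = false;
    bool _stopRequested = false;
};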