void Balancer::run() {
    Client::initThread("Balancer");

    // This is the body of a BackgroundJob so if we throw here we're basically ending the balancer
    // thread prematurely.
    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();
        if (!_init(txn.get())) {
            log() << "will retry to initialize balancer in one minute";
            sleepsecs(60);
            continue;
        }

        break;
    }

    Seconds balanceRoundInterval(kBalanceRoundDefaultInterval);

    while (!inShutdown()) {
        auto txn = cc().makeOperationContext();

        BalanceRoundDetails roundDetails;

        try {
            // ping has to be first so we keep things in the config server in sync
            _ping(txn.get(), false);

            MONGO_FAIL_POINT_BLOCK(balancerRoundIntervalSetting, scopedBalancerRoundInterval) {
                const BSONObj& data = scopedBalancerRoundInterval.getData();
                balanceRoundInterval = Seconds(data["sleepSecs"].numberInt());
            }

            // Use fresh shard state and balancer settings
            Grid::get(txn.get())->shardRegistry()->reload(txn.get());

            auto balancerConfig = Grid::get(txn.get())->getBalancerConfiguration();
            Status refreshStatus = balancerConfig->refreshAndCheck(txn.get());
            if (!refreshStatus.isOK()) {
                warning() << "Skipping balancing round" << causedBy(refreshStatus);
                sleepFor(balanceRoundInterval);
                continue;
            }

            // now make sure we should even be running
            if (!balancerConfig->isBalancerActive() || MONGO_FAIL_POINT(skipBalanceRound)) {
                LOG(1) << "skipping balancing round because balancing is disabled";

                // Ping again so scripts can determine if we're active without waiting
                _ping(txn.get(), true);

                sleepFor(balanceRoundInterval);
                continue;
            }

            uassert(13258, "oids broken after resetting!", _checkOIDs(txn.get()));

            {
                auto scopedDistLock = grid.catalogManager(txn.get())
                                          ->distLock(txn.get(),
                                                     "balancer",
                                                     "doing balance round",
                                                     DistLockManager::kSingleLockAttemptTimeout);

                if (!scopedDistLock.isOK()) {
                    LOG(1) << "skipping balancing round" << causedBy(scopedDistLock.getStatus());

                    // Ping again so scripts can determine if we're active without waiting
                    _ping(txn.get(), true);

                    sleepFor(balanceRoundInterval);  // no need to wake up soon
                    continue;
                }

                LOG(1) << "*** start balancing round. "
                       << "waitForDelete: " << balancerConfig->waitForDelete()
                       << ", secondaryThrottle: "
                       << balancerConfig->getSecondaryThrottle().toBSON();

                OCCASIONALLY warnOnMultiVersion(
                    uassertStatusOK(_clusterStats->getStats(txn.get())));

                Status status = _enforceTagRanges(txn.get());
                if (!status.isOK()) {
                    warning() << "Failed to enforce tag ranges" << causedBy(status);
                } else {
                    LOG(1) << "Done enforcing tag range boundaries.";
                }

                const auto candidateChunks = uassertStatusOK(
                    _chunkSelectionPolicy->selectChunksToMove(txn.get(), _balancedLastTime));

                if (candidateChunks.empty()) {
                    LOG(1) << "no need to move any chunk";
                    _balancedLastTime = 0;
                } else {
                    _balancedLastTime = _moveChunks(txn.get(),
                                                    candidateChunks,
                                                    balancerConfig->getSecondaryThrottle(),
                                                    balancerConfig->waitForDelete());

                    roundDetails.setSucceeded(static_cast<int>(candidateChunks.size()),
                                              _balancedLastTime);

                    grid.catalogManager(txn.get())
                        ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());
                }

                LOG(1) << "*** End of balancing round";
            }

            // Ping again so scripts can determine if we're active without waiting
            _ping(txn.get(), true);

            sleepFor(_balancedLastTime ? kShortBalanceRoundInterval : balanceRoundInterval);
        } catch (const std::exception& e) {
            log() << "caught exception while doing balance: " << e.what();

            // Just to match the opening statement if in log level 1
            LOG(1) << "*** End of balancing round";

            // This round failed, tell the world!
            roundDetails.setFailed(e.what());

            grid.catalogManager(txn.get())
                ->logAction(txn.get(), "balancer.round", "", roundDetails.toBSON());

            // Sleep a fair amount before retrying because of the error
            sleepFor(balanceRoundInterval);
        }
    }
}
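// A minimal, self-contained sketch of the control flow in Balancer::run() above, using only
// the standard library: retry initialization until it succeeds, then loop rounds that refresh
// settings, do the work, and sleep for a shorter interval after a productive round. The
// tryInit(), refreshSettings(), and runRound() helpers and the interval values are
// hypothetical placeholders, not MongoDB APIs.
#include <chrono>
#include <thread>

namespace balancer_sketch {

inline bool tryInit() { return true; }          // hypothetical stand-in for Balancer::_init()
inline bool refreshSettings() { return true; }  // hypothetical stand-in for refreshAndCheck()
inline int runRound() { return 0; }             // hypothetical stand-in for one balancing round

inline void runLoop(const bool& shutdown) {
    using namespace std::chrono_literals;

    // Phase 1: keep retrying initialization, as Balancer::run() does with _init().
    while (!shutdown && !tryInit()) {
        std::this_thread::sleep_for(60s);
    }

    int movedLastRound = 0;
    while (!shutdown) {
        if (!refreshSettings()) {  // analogous to a failed refreshAndCheck()
            std::this_thread::sleep_for(10s);
            continue;
        }

        movedLastRound = runRound();  // analogous to selectChunksToMove() + _moveChunks()

        // Sleep briefly after a productive round, otherwise wait the full interval,
        // mirroring the kShortBalanceRoundInterval / balanceRoundInterval choice above.
        std::this_thread::sleep_for(movedLastRound ? 1s : 10s);
    }
}

}  // namespace balancer_sketch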
StatusWith<vector<MigrateInfo>> Balancer::_getCandidateChunks(OperationContext* txn) {
    vector<CollectionType> collections;

    Status collsStatus =
        grid.catalogManager(txn)->getCollections(txn, nullptr, &collections, nullptr);
    if (!collsStatus.isOK()) {
        return collsStatus;
    }

    if (collections.empty()) {
        return vector<MigrateInfo>();
    }

    // Get a list of all the shards that are participating in this balance round along with any
    // maximum allowed quotas and current utilization. We get the latter by issuing
    // db.serverStatus() (mem.mapped) to all shards.
    //
    // TODO: skip unresponsive shards and mark information as stale.
    auto shardInfoStatus = DistributionStatus::populateShardInfoMap(txn);
    if (!shardInfoStatus.isOK()) {
        return shardInfoStatus.getStatus();
    }

    const ShardInfoMap shardInfo(std::move(shardInfoStatus.getValue()));

    if (shardInfo.size() < 2) {
        return vector<MigrateInfo>();
    }

    OCCASIONALLY warnOnMultiVersion(shardInfo);

    std::vector<MigrateInfo> candidateChunks;

    // For each collection, check if the balancing policy recommends moving anything around.
    for (const auto& coll : collections) {
        // Skip collections for which balancing is disabled
        const NamespaceString& nss = coll.getNs();

        if (!coll.getAllowBalance()) {
            LOG(1) << "Not balancing collection " << nss << "; explicitly disabled.";
            continue;
        }

        std::vector<ChunkType> allNsChunks;
        Status status = grid.catalogManager(txn)->getChunks(txn,
                                                            BSON(ChunkType::ns(nss.ns())),
                                                            BSON(ChunkType::min() << 1),
                                                            boost::none,  // all chunks
                                                            &allNsChunks,
                                                            nullptr);
        if (!status.isOK()) {
            warning() << "failed to load chunks for ns " << nss.ns() << causedBy(status);
            continue;
        }

        set<BSONObj> allChunkMinimums;
        map<string, vector<ChunkType>> shardToChunksMap;

        for (const ChunkType& chunk : allNsChunks) {
            allChunkMinimums.insert(chunk.getMin().getOwned());

            vector<ChunkType>& chunksList = shardToChunksMap[chunk.getShard()];
            chunksList.push_back(chunk);
        }

        if (shardToChunksMap.empty()) {
            LOG(1) << "skipping empty collection (" << nss.ns() << ")";
            continue;
        }

        for (ShardInfoMap::const_iterator i = shardInfo.begin(); i != shardInfo.end(); ++i) {
            // This loop just makes sure there is an entry in shardToChunksMap for every shard
            shardToChunksMap[i->first];
        }

        DistributionStatus distStatus(shardInfo, shardToChunksMap);

        // TODO: TagRange contains all the information from TagsType except for the namespace,
        //       so maybe the two can be merged at some point in order to avoid the
        //       transformation below.
        vector<TagRange> ranges;

        {
            vector<TagsType> collectionTags;
            uassertStatusOK(
                grid.catalogManager(txn)->getTagsForCollection(txn, nss.ns(), &collectionTags));
            for (const auto& tt : collectionTags) {
                ranges.push_back(
                    TagRange(tt.getMinKey().getOwned(), tt.getMaxKey().getOwned(), tt.getTag()));
                uassert(16356,
                        str::stream() << "tag ranges not valid for: " << nss.ns(),
                        distStatus.addTagRange(ranges.back()));
            }
        }

        auto statusGetDb = grid.catalogCache()->getDatabase(txn, nss.db().toString());
        if (!statusGetDb.isOK()) {
            warning() << "could not load db config to balance collection [" << nss.ns()
                      << "]: " << statusGetDb.getStatus();
            continue;
        }

        shared_ptr<DBConfig> cfg = statusGetDb.getValue();

        // This line reloads the chunk manager once if this process doesn't know the collection
        // is sharded yet.
        shared_ptr<ChunkManager> cm = cfg->getChunkManagerIfExists(txn, nss.ns(), true);
        if (!cm) {
            warning() << "could not load chunks to balance " << nss.ns() << " collection";
            continue;
        }

        // Loop through tags to make sure no chunk spans tags. Split on tag min for all chunks.
        bool didAnySplits = false;

        for (const TagRange& range : ranges) {
            BSONObj min =
                cm->getShardKeyPattern().getKeyPattern().extendRangeBound(range.min, false);

            if (allChunkMinimums.count(min) > 0) {
                continue;
            }

            didAnySplits = true;

            log() << "nss: " << nss.ns() << " need to split on " << min
                  << " because there is a range there";

            vector<BSONObj> splitPoints;
            splitPoints.push_back(min);

            shared_ptr<Chunk> c = cm->findIntersectingChunk(txn, min);
            Status status = c->multiSplit(txn, splitPoints, NULL);
            if (!status.isOK()) {
                error() << "split failed: " << status;
            } else {
                LOG(1) << "split worked";
            }

            break;
        }

        if (didAnySplits) {
            // State change, just wait till next round
            continue;
        }

        shared_ptr<MigrateInfo> migrateInfo(
            BalancerPolicy::balance(nss.ns(), distStatus, _balancedLastTime));
        if (migrateInfo) {
            candidateChunks.emplace_back(*migrateInfo);
        }
    }

    return candidateChunks;
}
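// A deliberately simplified sketch of the kind of decision BalancerPolicy::balance() makes
// with the DistributionStatus built above: pick the shard holding the most chunks as a donor
// and the shard holding the fewest as a recipient, and only propose a migration when the
// imbalance exceeds a threshold. Only the standard library is used; ShardChunkCounts,
// ProposedMove, and the threshold are illustrative assumptions, not the actual MongoDB policy.
#include <cstddef>
#include <map>
#include <optional>
#include <string>

namespace balancer_sketch {

using ShardChunkCounts = std::map<std::string, std::size_t>;

struct ProposedMove {
    std::string from;
    std::string to;
};

inline std::optional<ProposedMove> pickMove(const ShardChunkCounts& counts,
                                            std::size_t imbalanceThreshold) {
    if (counts.size() < 2) {
        return std::nullopt;  // nothing to balance with fewer than two shards
    }

    auto donor = counts.begin();      // shard with the most chunks seen so far
    auto recipient = counts.begin();  // shard with the fewest chunks seen so far
    for (auto it = counts.begin(); it != counts.end(); ++it) {
        if (it->second > donor->second)
            donor = it;
        if (it->second < recipient->second)
            recipient = it;
    }

    // Only move a chunk when the spread between the fullest and emptiest shard is large
    // enough to be worth the cost of a migration.
    if (donor->second < recipient->second + imbalanceThreshold) {
        return std::nullopt;
    }
    return ProposedMove{donor->first, recipient->first};
}

}  // namespace balancer_sketch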
void Balancer::_doBalanceRound(DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks) {
    verify(candidateChunks);

    //
    // 1. Check whether there is any sharded collection to be balanced by querying
    // the ShardsNS::collections collection
    //

    auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj());

    if (NULL == cursor.get()) {
        warning() << "could not query " << CollectionType::ConfigNS
                  << " while trying to balance" << endl;
        return;
    }

    vector<string> collections;
    while (cursor->more()) {
        BSONObj col = cursor->nextSafe();

        // sharded collections will have a shard "key".
        if (!col[CollectionType::keyPattern()].eoo() &&
            !col[CollectionType::noBalance()].trueValue()) {
            collections.push_back(col[CollectionType::ns()].String());
        } else if (col[CollectionType::noBalance()].trueValue()) {
            LOG(1) << "not balancing collection " << col[CollectionType::ns()].String()
                   << ", explicitly disabled" << endl;
        }
    }
    cursor.reset();

    if (collections.empty()) {
        LOG(1) << "no collections to balance" << endl;
        return;
    }

    //
    // 2. Get a list of all the shards that are participating in this balance round
    // along with any maximum allowed quotas and current utilization. We get the
    // latter by issuing db.serverStatus() (mem.mapped) to all shards.
    //
    // TODO: skip unresponsive shards and mark information as stale.
    //

    ShardInfoMap shardInfo;
    Status loadStatus = DistributionStatus::populateShardInfoMap(&shardInfo);

    if (!loadStatus.isOK()) {
        warning() << "failed to load shard metadata" << causedBy(loadStatus) << endl;
        return;
    }

    if (shardInfo.size() < 2) {
        LOG(1) << "can't balance without more active shards" << endl;
        return;
    }

    OCCASIONALLY warnOnMultiVersion(shardInfo);

    //
    // 3. For each collection, check if the balancing policy recommends moving anything around.
    //

    for (vector<string>::const_iterator it = collections.begin();
         it != collections.end();
         ++it) {
        const string& ns = *it;

        OwnedPointerMap<string, OwnedPointerVector<ChunkType> > shardToChunksMap;
        cursor = conn.query(ChunkType::ConfigNS,
                            QUERY(ChunkType::ns(ns)).sort(ChunkType::min()));

        set<BSONObj> allChunkMinimums;

        while (cursor->more()) {
            BSONObj chunkDoc = cursor->nextSafe().getOwned();

            auto_ptr<ChunkType> chunk(new ChunkType());
            string errmsg;
            if (!chunk->parseBSON(chunkDoc, &errmsg)) {
                error() << "bad chunk format for " << chunkDoc << ": " << errmsg << endl;
                return;
            }

            allChunkMinimums.insert(chunk->getMin().getOwned());

            OwnedPointerVector<ChunkType>*& chunkList =
                shardToChunksMap.mutableMap()[chunk->getShard()];
            if (chunkList == NULL) {
                chunkList = new OwnedPointerVector<ChunkType>();
            }

            chunkList->mutableVector().push_back(chunk.release());
        }
        cursor.reset();

        if (shardToChunksMap.map().empty()) {
            LOG(1) << "skipping empty collection (" << ns << ")";
            continue;
        }

        for (ShardInfoMap::const_iterator i = shardInfo.begin(); i != shardInfo.end(); ++i) {
            // this just makes sure there is an entry in shardToChunksMap for every shard
            OwnedPointerVector<ChunkType>*& chunkList = shardToChunksMap.mutableMap()[i->first];
            if (chunkList == NULL) {
                chunkList = new OwnedPointerVector<ChunkType>();
            }
        }

        DistributionStatus status(shardInfo, shardToChunksMap.map());

        // load tags
        Status result = clusterCreateIndex(TagsType::ConfigNS,
                                           BSON(TagsType::ns() << 1 << TagsType::min() << 1),
                                           true,  // unique
                                           WriteConcernOptions::AllConfigs,
                                           NULL);

        if (!result.isOK()) {
            warning() << "could not create index tags_1_min_1: " << result.reason() << endl;
            continue;
        }

        cursor = conn.query(TagsType::ConfigNS,
                            QUERY(TagsType::ns(ns)).sort(TagsType::min()));

        vector<TagRange> ranges;

        while (cursor->more()) {
            BSONObj tag = cursor->nextSafe();
            TagRange tr(tag[TagsType::min()].Obj().getOwned(),
                        tag[TagsType::max()].Obj().getOwned(),
                        tag[TagsType::tag()].String());
            ranges.push_back(tr);
            uassert(16356,
                    str::stream() << "tag ranges not valid for: " << ns,
                    status.addTagRange(tr));
        }
        cursor.reset();

        DBConfigPtr cfg = grid.getDBConfig(ns);
        if (!cfg) {
            warning() << "could not load db config to balance " << ns << " collection" << endl;
            continue;
        }

        // This line reloads the chunk manager once if this process doesn't know the collection
        // is sharded yet.
        ChunkManagerPtr cm = cfg->getChunkManagerIfExists(ns, true);
        if (!cm) {
            warning() << "could not load chunks to balance " << ns << " collection" << endl;
            continue;
        }

        // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
        bool didAnySplits = false;
        for (unsigned i = 0; i < ranges.size(); i++) {
            BSONObj min = ranges[i].min;
            min = cm->getShardKey().extendRangeBound(min, false);

            if (allChunkMinimums.count(min) > 0)
                continue;

            didAnySplits = true;

            log() << "ns: " << ns << " need to split on " << min
                  << " because there is a range there" << endl;

            ChunkPtr c = cm->findIntersectingChunk(min);

            vector<BSONObj> splitPoints;
            splitPoints.push_back(min);

            BSONObj res;
            if (!c->multiSplit(splitPoints, res)) {
                error() << "split failed: " << res << endl;
            } else {
                LOG(1) << "split worked: " << res << endl;
            }

            break;
        }

        if (didAnySplits) {
            // state change, just wait till next round
            continue;
        }

        CandidateChunk* p = _policy->balance(ns, status, _balancedLastTime);
        if (p)
            candidateChunks->push_back(CandidateChunkPtr(p));
    }
}
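// A small sketch of the "split on tag min" check used in the loop above, assuming only the
// standard library: every tag range boundary must coincide with an existing chunk boundary,
// so any tag minimum that is not already a chunk minimum becomes a required split point.
// Plain strings stand in for the BSON shard-key values used by the real code.
#include <set>
#include <string>
#include <vector>

namespace balancer_sketch {

inline std::vector<std::string> requiredSplitPoints(
    const std::set<std::string>& allChunkMinimums,
    const std::vector<std::string>& tagRangeMinimums) {
    std::vector<std::string> splitPoints;
    for (const std::string& tagMin : tagRangeMinimums) {
        // If no chunk already starts at this tag boundary, some chunk currently spans it and
        // must be split there before tag-aware balancing can proceed.
        if (allChunkMinimums.count(tagMin) == 0) {
            splitPoints.push_back(tagMin);
        }
    }
    return splitPoints;
}

}  // namespace balancer_sketch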
void Balancer::_doBalanceRound(DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks) {
    verify(candidateChunks);

    //
    // 1. Check whether there is any sharded collection to be balanced by querying
    // the ShardsNS::collections collection
    //

    auto_ptr<DBClientCursor> cursor = conn.query(CollectionType::ConfigNS, BSONObj());

    vector<string> collections;
    while (cursor->more()) {
        BSONObj col = cursor->nextSafe();

        // sharded collections will have a shard "key".
        if (!col[CollectionType::keyPattern()].eoo() &&
            !col[CollectionType::noBalance()].trueValue()) {
            collections.push_back(col[CollectionType::ns()].String());
        } else if (col[CollectionType::noBalance()].trueValue()) {
            LOG(1) << "not balancing collection " << col[CollectionType::ns()].String()
                   << ", explicitly disabled" << endl;
        }
    }
    cursor.reset();

    if (collections.empty()) {
        LOG(1) << "no collections to balance" << endl;
        return;
    }

    //
    // 2. Get a list of all the shards that are participating in this balance round
    // along with any maximum allowed quotas and current utilization. We get the
    // latter by issuing db.serverStatus() (mem.mapped) to all shards.
    //
    // TODO: skip unresponsive shards and mark information as stale.
    //

    vector<Shard> allShards;
    Shard::getAllShards(allShards);
    if (allShards.size() < 2) {
        LOG(1) << "can't balance without more active shards" << endl;
        return;
    }

    ShardInfoMap shardInfo;
    for (vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it) {
        const Shard& s = *it;
        ShardStatus status = s.getStatus();
        shardInfo[s.getName()] = ShardInfo(s.getMaxSize(),
                                           status.mapped(),
                                           s.isDraining(),
                                           status.hasOpsQueued(),
                                           s.tags(),
                                           status.mongoVersion());
    }

    OCCASIONALLY warnOnMultiVersion(shardInfo);

    //
    // 3. For each collection, check if the balancing policy recommends moving anything around.
    //

    for (vector<string>::const_iterator it = collections.begin();
         it != collections.end();
         ++it) {
        const string& ns = *it;

        map<string, vector<BSONObj> > shardToChunksMap;
        cursor = conn.query(ChunkType::ConfigNS,
                            QUERY(ChunkType::ns(ns)).sort(ChunkType::min()));

        set<BSONObj> allChunkMinimums;

        while (cursor->more()) {
            BSONObj chunk = cursor->nextSafe().getOwned();
            vector<BSONObj>& chunks = shardToChunksMap[chunk[ChunkType::shard()].String()];
            allChunkMinimums.insert(chunk[ChunkType::min()].Obj());
            chunks.push_back(chunk);
        }
        cursor.reset();

        if (shardToChunksMap.empty()) {
            LOG(1) << "skipping empty collection (" << ns << ")";
            continue;
        }

        for (vector<Shard>::iterator i = allShards.begin(); i != allShards.end(); ++i) {
            // this just makes sure there is an entry in shardToChunksMap for every shard
            Shard s = *i;
            shardToChunksMap[s.getName()].size();
        }

        DistributionStatus status(shardInfo, shardToChunksMap);

        // load tags
        conn.ensureIndex(TagsType::ConfigNS,
                         BSON(TagsType::ns() << 1 << TagsType::min() << 1),
                         true);

        cursor = conn.query(TagsType::ConfigNS,
                            QUERY(TagsType::ns(ns)).sort(TagsType::min()));

        vector<TagRange> ranges;

        while (cursor->more()) {
            BSONObj tag = cursor->nextSafe();
            TagRange tr(tag[TagsType::min()].Obj().getOwned(),
                        tag[TagsType::max()].Obj().getOwned(),
                        tag[TagsType::tag()].String());
            ranges.push_back(tr);
            uassert(16356,
                    str::stream() << "tag ranges not valid for: " << ns,
                    status.addTagRange(tr));
        }
        cursor.reset();

        DBConfigPtr cfg = grid.getDBConfig(ns);
        if (!cfg) {
            warning() << "could not load db config to balance " << ns << " collection" << endl;
            continue;
        }

        // This line reloads the chunk manager once if this process doesn't know the collection
        // is sharded yet.
        ChunkManagerPtr cm = cfg->getChunkManagerIfExists(ns, true);
        if (!cm) {
            warning() << "could not load chunks to balance " << ns << " collection" << endl;
            continue;
        }

        // loop through tags to make sure no chunk spans tags; splits on tag min. for all chunks
        bool didAnySplits = false;
        for (unsigned i = 0; i < ranges.size(); i++) {
            BSONObj min = ranges[i].min;
            min = cm->getShardKey().extendRangeBound(min, false);

            if (allChunkMinimums.count(min) > 0)
                continue;

            didAnySplits = true;

            log() << "ns: " << ns << " need to split on " << min
                  << " because there is a range there" << endl;

            ChunkPtr c = cm->findIntersectingChunk(min);

            vector<BSONObj> splitPoints;
            splitPoints.push_back(min);

            BSONObj res;
            if (!c->multiSplit(splitPoints, res)) {
                error() << "split failed: " << res << endl;
            } else {
                LOG(1) << "split worked: " << res << endl;
            }

            break;
        }

        if (didAnySplits) {
            // state change, just wait till next round
            continue;
        }

        CandidateChunk* p = _policy->balance(ns, status, _balancedLastTime);
        if (p)
            candidateChunks->push_back(CandidateChunkPtr(p));
    }
}
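// A note on the `shardToChunksMap[s.getName()].size();` idiom in the loop above: std::map's
// operator[] default-constructs a value for a missing key, so the lookup alone guarantees
// that every shard has a (possibly empty) entry before the distribution is built; the
// discarded .size() call only makes the statement's intent explicit. A minimal standalone
// demonstration using the standard library:
#include <cassert>
#include <map>
#include <string>
#include <vector>

namespace balancer_sketch {

inline void demonstrateDefaultConstructedEntry() {
    std::map<std::string, std::vector<int> > shardToChunks;
    shardToChunks["shard0000"].push_back(1);  // shard holding one chunk

    // Touching a missing key inserts an empty vector for it, mirroring how the balancer
    // guarantees an entry for chunk-less shards so they can be considered as recipients.
    shardToChunks["shard0001"];

    assert(shardToChunks.size() == 2);
    assert(shardToChunks["shard0001"].empty());
}

}  // namespace balancer_sketch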