void MoveChunkRequest::appendAsCommand(BSONObjBuilder* builder,
                                       const NamespaceString& nss,
                                       ChunkVersion chunkVersion,
                                       const ConnectionString& configServerConnectionString,
                                       const ShardId& fromShardId,
                                       const ShardId& toShardId,
                                       const ChunkRange& range,
                                       int64_t maxChunkSizeBytes,
                                       const MigrationSecondaryThrottleOptions& secondaryThrottle,
                                       bool waitForDelete) {
    invariant(builder->asTempObj().isEmpty());
    invariant(nss.isValid());

    builder->append(kMoveChunk, nss.ns());
    chunkVersion.appendToCommand(builder);  // 3.4 shard compatibility
    builder->append(kEpoch, chunkVersion.epoch());
    // config connection string is included for 3.4 shard compatibility
    builder->append(kConfigServerConnectionString, configServerConnectionString.toString());
    builder->append(kFromShardId, fromShardId.toString());
    builder->append(kToShardId, toShardId.toString());
    range.append(builder);
    builder->append(kMaxChunkSizeBytes, static_cast<long long>(maxChunkSizeBytes));
    secondaryThrottle.append(builder);
    builder->append(kWaitForDelete, waitForDelete);
    builder->append(kTakeDistLock, false);
}
void MoveChunkRequest::appendAsCommand(BSONObjBuilder* builder,
                                       const NamespaceString& nss,
                                       const ChunkVersion& shardVersion,
                                       const ConnectionString& configServerConnectionString,
                                       const ShardId& fromShardId,
                                       const ShardId& toShardId,
                                       const ChunkRange& range,
                                       int64_t maxChunkSizeBytes,
                                       const MigrationSecondaryThrottleOptions& secondaryThrottle,
                                       bool waitForDelete,
                                       bool takeDistLock) {
    invariant(builder->asTempObj().isEmpty());
    invariant(nss.isValid());

    builder->append(kMoveChunk, nss.ns());
    shardVersion.appendForCommands(builder);
    builder->append(kConfigServerConnectionString, configServerConnectionString.toString());
    builder->append(kFromShardId, fromShardId.toString());
    builder->append(kToShardId, toShardId.toString());
    range.append(builder);
    builder->append(kMaxChunkSizeBytes, static_cast<long long>(maxChunkSizeBytes));
    secondaryThrottle.append(builder);
    builder->append(kWaitForDelete, waitForDelete);
    builder->append(kTakeDistLock, takeDistLock);
}
// Selects a new shard for the specified chunk based on the current cluster statistics and the
// chunk's tag. Returns an empty optional if the chunk is already on the best possible shard.
StatusWith<boost::optional<MigrateInfo>> BalancerChunkSelectionPolicyImpl::selectSpecificChunkToMove(
    OperationContext* txn, const ChunkType& chunk) {
    const NamespaceString nss(chunk.getNS());

    auto scopedCMStatus = ScopedChunkManager::getExisting(txn, nss);
    if (!scopedCMStatus.isOK()) {
        return scopedCMStatus.getStatus();
    }

    auto scopedCM = std::move(scopedCMStatus.getValue());
    ChunkManager* const cm = scopedCM.cm();

    auto tagForChunkStatus =
        Grid::get(txn)->catalogManager(txn)->getTagForChunk(txn, nss.ns(), chunk);
    if (!tagForChunkStatus.isOK()) {
        return tagForChunkStatus.getStatus();
    }

    auto shardStatsStatus = _clusterStats->getStats(txn);
    if (!shardStatsStatus.isOK()) {
        return shardStatsStatus.getStatus();
    }

    auto collInfo = createCollectionDistributionInfo(shardStatsStatus.getValue(), cm);
    ShardToChunksMap shardToChunksMap = std::move(std::get<0>(collInfo));

    DistributionStatus distStatus(shardStatsStatus.getValue(), shardToChunksMap);
    const ShardId newShardId(distStatus.getBestReceieverShard(tagForChunkStatus.getValue()));
    if (newShardId.empty() || newShardId == chunk.getShard()) {
        return boost::optional<MigrateInfo>();
    }

    return boost::optional<MigrateInfo>{MigrateInfo(nss.ns(), newShardId, chunk)};
}
void StartChunkCloneRequest::appendAsCommand(
    BSONObjBuilder* builder,
    const NamespaceString& nss,
    const MigrationSessionId& sessionId,
    const ConnectionString& configServerConnectionString,
    const ConnectionString& fromShardConnectionString,
    const ShardId& fromShardId,
    const ShardId& toShardId,
    const BSONObj& chunkMinKey,
    const BSONObj& chunkMaxKey,
    const BSONObj& shardKeyPattern,
    const MigrationSecondaryThrottleOptions& secondaryThrottle) {
    invariant(builder->asTempObj().isEmpty());
    invariant(nss.isValid());
    invariant(fromShardConnectionString.isValid());

    builder->append(kRecvChunkStart, nss.ns());
    sessionId.append(builder);
    builder->append(kConfigServerConnectionString, configServerConnectionString.toString());
    builder->append(kFromShardConnectionString, fromShardConnectionString.toString());
    builder->append(kFromShardId, fromShardId.toString());
    builder->append(kToShardId, toShardId.toString());
    builder->append(kChunkMinKey, chunkMinKey);
    builder->append(kChunkMaxKey, chunkMaxKey);
    builder->append(kShardKeyPattern, shardKeyPattern);
    secondaryThrottle.append(builder);
}
// Returns a migration suggestion for the specified chunk if a less loaded receiver shard is
// available for the chunk's tag, or an empty optional otherwise.
boost::optional<MigrateInfo> BalancerPolicy::balanceSingleChunk(
    const ChunkType& chunk,
    const ShardStatisticsVector& shardStats,
    const DistributionStatus& distribution) {
    const string tag = distribution.getTagForChunk(chunk);

    ShardId newShardId = _getLeastLoadedReceiverShard(shardStats, distribution, tag, {});
    if (!newShardId.isValid() || newShardId == chunk.getShard()) {
        return boost::optional<MigrateInfo>();
    }

    return MigrateInfo(distribution.nss().ns(), newShardId, chunk);
}
// Returns the names of all databases whose primary shard is the specified shard.
StatusWith<std::vector<std::string>> ShardingCatalogManager::getDatabasesForShard(
    OperationContext* opCtx, const ShardId& shardId) {
    auto findStatus = Grid::get(opCtx)->catalogClient()->_exhaustiveFindOnConfig(
        opCtx,
        kConfigReadSelector,
        repl::ReadConcernLevel::kLocalReadConcern,
        DatabaseType::ConfigNS,
        BSON(DatabaseType::primary(shardId.toString())),
        BSONObj(),
        boost::none);  // no limit
    if (!findStatus.isOK())
        return findStatus.getStatus();

    std::vector<std::string> dbs;
    for (const BSONObj& obj : findStatus.getValue().value) {
        std::string dbName;
        Status status = bsonExtractStringField(obj, DatabaseType::name(), &dbName);
        if (!status.isOK()) {
            return status;
        }

        dbs.push_back(dbName);
    }

    return dbs;
}
void CommitChunkMigrationRequest::appendAsCommand(
    BSONObjBuilder* builder,
    const NamespaceString& nss,
    const ShardId& fromShard,
    const ShardId& toShard,
    const ChunkType& migratedChunkType,
    const boost::optional<ChunkType>& controlChunkType) {
    invariant(builder->asTempObj().isEmpty());
    invariant(nss.isValid());

    builder->append(kConfigSvrCommitChunkMigration, nss.ns());
    builder->append(kFromShard, fromShard.toString());
    builder->append(kToShard, toShard.toString());
    builder->append(kMigratedChunk, migratedChunkType.toBSON());

    if (controlChunkType) {
        builder->append(kControlChunk, controlChunkType->toBSON());
    }
}
// Parses the write concern error from a shard's command response, annotates it with the shard id
// and appends it to the response being built.
void appendWriteConcernErrorToCmdResponse(const ShardId& shardId,
                                          const BSONElement& wcErrorElem,
                                          BSONObjBuilder& responseBuilder) {
    WriteConcernErrorDetail wcError;
    std::string errMsg;
    auto wcErrorObj = wcErrorElem.Obj();
    if (!wcError.parseBSON(wcErrorObj, &errMsg)) {
        wcError.clear();
        wcError.setStatus({ErrorCodes::FailedToParse,
                           "Failed to parse writeConcernError: " + wcErrorObj.toString() +
                               ", Received error: " + errMsg});
    }

    auto status = wcError.toStatus();
    wcError.setStatus(
        status.withReason(str::stream() << status.reason() << " at " << shardId.toString()));

    responseBuilder.append("writeConcernError", wcError.toBSON());
}
bool BalancerPolicy::_singleZoneBalance(const ShardStatisticsVector& shardStats,
                                        const DistributionStatus& distribution,
                                        const string& tag,
                                        size_t imbalanceThreshold,
                                        vector<MigrateInfo>* migrations,
                                        set<ShardId>* usedShards) {
    const ShardId from = _getMostOverloadedShard(shardStats, distribution, tag, *usedShards);
    if (!from.isValid())
        return false;

    const size_t max = distribution.numberOfChunksInShardWithTag(from, tag);
    if (max == 0)
        return false;

    const ShardId to = _getLeastLoadedReceiverShard(shardStats, distribution, tag, *usedShards);
    if (!to.isValid()) {
        if (migrations->empty()) {
            log() << "No available shards to take chunks for tag [" << tag << "]";
        }
        return false;
    }

    const size_t min = distribution.numberOfChunksInShardWithTag(to, tag);
    if (min >= max)
        return false;

    const size_t totalNumberOfChunksWithTag =
        (tag.empty() ? distribution.totalChunks() : distribution.totalChunksWithTag(tag));

    size_t totalNumberOfShardsWithTag = 0;

    for (const auto& stat : shardStats) {
        if (tag.empty() || stat.shardTags.count(tag)) {
            totalNumberOfShardsWithTag++;
        }
    }

    // totalNumberOfShardsWithTag cannot be zero if the to shard is valid
    invariant(totalNumberOfShardsWithTag);
    invariant(totalNumberOfChunksWithTag >= max);

    // The ideal should be at least one per shard
    const size_t idealNumberOfChunksPerShardWithTag =
        (totalNumberOfChunksWithTag < totalNumberOfShardsWithTag)
        ? 1
        : (totalNumberOfChunksWithTag / totalNumberOfShardsWithTag);

    const size_t imbalance = max - idealNumberOfChunksPerShardWithTag;

    LOG(1) << "collection : " << distribution.nss().ns();
    LOG(1) << "zone : " << tag;
    LOG(1) << "donor : " << from << " chunks on " << max;
    LOG(1) << "receiver : " << to << " chunks on " << min;
    LOG(1) << "ideal : " << idealNumberOfChunksPerShardWithTag;
    LOG(1) << "threshold : " << imbalanceThreshold;

    // Check whether it is necessary to balance within this zone
    if (imbalance < imbalanceThreshold)
        return false;

    const vector<ChunkType>& chunks = distribution.getChunks(from);

    unsigned numJumboChunks = 0;

    for (const auto& chunk : chunks) {
        if (distribution.getTagForChunk(chunk) != tag)
            continue;

        if (chunk.getJumbo()) {
            numJumboChunks++;
            continue;
        }

        migrations->emplace_back(distribution.nss().ns(), to, chunk);
        invariant(usedShards->insert(chunk.getShard()).second);
        invariant(usedShards->insert(to).second);
        return true;
    }

    if (numJumboChunks) {
        warning() << "Shard: " << from << ", collection: " << distribution.nss().ns()
                  << " has only jumbo chunks for zone \'" << tag
                  << "\' and cannot be balanced. Jumbo chunks count: " << numJumboChunks;
    }

    return false;
}
vector<MigrateInfo> BalancerPolicy::balance(const ShardStatisticsVector& shardStats,
                                            const DistributionStatus& distribution,
                                            bool shouldAggressivelyBalance) {
    vector<MigrateInfo> migrations;

    // Set of shards, which have already been used for migrations. Used so we don't return multiple
    // migrations for the same shard.
    set<ShardId> usedShards;

    // 1) Check for shards, which are in draining mode and must have chunks moved off of them
    {
        for (const auto& stat : shardStats) {
            if (!stat.isDraining)
                continue;

            const vector<ChunkType>& chunks = distribution.getChunks(stat.shardId);

            if (chunks.empty())
                continue;

            // Now we know we need to move the chunks off this shard, but only if permitted by the
            // tags policy
            unsigned numJumboChunks = 0;

            // Since we have to move all chunks, let's just do it in order
            for (const auto& chunk : chunks) {
                if (chunk.getJumbo()) {
                    numJumboChunks++;
                    continue;
                }

                const string tag = distribution.getTagForChunk(chunk);

                const ShardId to =
                    _getLeastLoadedReceiverShard(shardStats, distribution, tag, usedShards);
                if (!to.isValid()) {
                    if (migrations.empty()) {
                        warning() << "Chunk " << chunk
                                  << " is on a draining shard, but no appropriate recipient found";
                    }
                    continue;
                }

                invariant(to != stat.shardId);
                migrations.emplace_back(distribution.nss().ns(), to, chunk);
                invariant(usedShards.insert(stat.shardId).second);
                invariant(usedShards.insert(to).second);
                break;
            }

            if (migrations.empty()) {
                warning() << "Unable to find any chunk to move from draining shard "
                          << stat.shardId << ". numJumboChunks: " << numJumboChunks;
            }
        }
    }

    // 2) Check for chunks, which are on the wrong shard and must be moved off of it
    if (!distribution.tags().empty()) {
        for (const auto& stat : shardStats) {
            const vector<ChunkType>& chunks = distribution.getChunks(stat.shardId);

            for (const auto& chunk : chunks) {
                const string tag = distribution.getTagForChunk(chunk);

                if (tag.empty())
                    continue;

                if (stat.shardTags.count(tag))
                    continue;

                if (chunk.getJumbo()) {
                    warning() << "chunk " << chunk << " violates tag " << tag
                              << ", but it is jumbo and cannot be moved";
                    continue;
                }

                const ShardId to =
                    _getLeastLoadedReceiverShard(shardStats, distribution, tag, usedShards);
                if (!to.isValid()) {
                    if (migrations.empty()) {
                        warning() << "chunk " << chunk << " violates tag " << tag
                                  << ", but no appropriate recipient found";
                    }
                    continue;
                }

                invariant(to != stat.shardId);
                migrations.emplace_back(distribution.nss().ns(), to, chunk);
                invariant(usedShards.insert(stat.shardId).second);
                invariant(usedShards.insert(to).second);
                break;
            }
        }
    }

    // 3) Balance the chunks within each tag
    const size_t imbalanceThreshold = (shouldAggressivelyBalance || distribution.totalChunks() < 20)
        ? kAggressiveImbalanceThreshold
        : kDefaultImbalanceThreshold;

    vector<string> tagsPlusEmpty(distribution.tags().begin(), distribution.tags().end());
    tagsPlusEmpty.push_back("");

    for (const auto& tag : tagsPlusEmpty) {
        while (_singleZoneBalance(
            shardStats, distribution, tag, imbalanceThreshold, &migrations, &usedShards))
            ;
    }

    return migrations;
}
void ChunkType::setShard(const ShardId& shard) {
    invariant(shard.isValid());
    _shard = shard;
}
StatusWith<boost::optional<ChunkRange>> splitChunkAtMultiplePoints(
    OperationContext* txn,
    const ShardId& shardId,
    const NamespaceString& nss,
    const ShardKeyPattern& shardKeyPattern,
    ChunkVersion collectionVersion,
    const BSONObj& minKey,
    const BSONObj& maxKey,
    const std::vector<BSONObj>& splitPoints) {
    invariant(!splitPoints.empty());
    invariant(minKey.woCompare(maxKey) < 0);

    const size_t kMaxSplitPoints = 8192;

    if (splitPoints.size() > kMaxSplitPoints) {
        return {ErrorCodes::BadValue,
                str::stream() << "Cannot split chunk in more than " << kMaxSplitPoints
                              << " parts at a time."};
    }

    BSONObjBuilder cmd;
    cmd.append("splitChunk", nss.ns());
    cmd.append("configdb",
               Grid::get(txn)->shardRegistry()->getConfigServerConnectionString().toString());
    cmd.append("from", shardId.toString());
    cmd.append("keyPattern", shardKeyPattern.toBSON());
    collectionVersion.appendForCommands(&cmd);
    cmd.append(kMinKey, minKey);
    cmd.append(kMaxKey, maxKey);
    cmd.append("splitKeys", splitPoints);

    BSONObj cmdObj = cmd.obj();

    Status status{ErrorCodes::InternalError, "Uninitialized value"};
    BSONObj cmdResponse;

    auto shard = Grid::get(txn)->shardRegistry()->getShard(txn, shardId);
    if (!shard) {
        status =
            Status(ErrorCodes::ShardNotFound, str::stream() << "shard " << shardId << " not found");
    } else {
        auto cmdStatus = shard->runCommand(txn,
                                           ReadPreferenceSetting{ReadPreference::PrimaryOnly},
                                           "admin",
                                           cmdObj,
                                           Shard::RetryPolicy::kNotIdempotent);
        if (!cmdStatus.isOK()) {
            status = std::move(cmdStatus.getStatus());
        } else {
            status = std::move(cmdStatus.getValue().commandStatus);
            cmdResponse = std::move(cmdStatus.getValue().response);
        }
    }

    if (!status.isOK()) {
        log() << "Split chunk " << redact(cmdObj) << " failed" << causedBy(redact(status));
        return {status.code(), str::stream() << "split failed due to " << status.toString()};
    }

    BSONElement shouldMigrateElement;
    status = bsonExtractTypedField(cmdResponse, kShouldMigrate, Object, &shouldMigrateElement);
    if (status.isOK()) {
        auto chunkRangeStatus = ChunkRange::fromBSON(shouldMigrateElement.embeddedObject());
        if (!chunkRangeStatus.isOK()) {
            return chunkRangeStatus.getStatus();
        }

        return boost::optional<ChunkRange>(std::move(chunkRangeStatus.getValue()));
    } else if (status != ErrorCodes::NoSuchKey) {
        warning()
            << "Chunk migration will be skipped because splitChunk returned invalid response: "
            << redact(cmdResponse) << ". Extracting " << kShouldMigrate << " field failed"
            << causedBy(redact(status));
    }

    return boost::optional<ChunkRange>();
}
StatusWith<boost::optional<ChunkRange>> splitChunkAtMultiplePoints(
    OperationContext* txn,
    const ShardId& shardId,
    const NamespaceString& nss,
    const ShardKeyPattern& shardKeyPattern,
    ChunkVersion collectionVersion,
    const ChunkRange& chunkRange,
    const std::vector<BSONObj>& splitPoints) {
    invariant(!splitPoints.empty());

    const size_t kMaxSplitPoints = 8192;

    if (splitPoints.size() > kMaxSplitPoints) {
        return {ErrorCodes::BadValue,
                str::stream() << "Cannot split chunk in more than " << kMaxSplitPoints
                              << " parts at a time."};
    }

    // Sanity check that we are not attempting to split at the boundaries of the chunk. This check
    // is already performed at chunk split commit time, but we are performing it here for parity
    // with old auto-split code, which might rely on it.
    if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMin() == splitPoints.front())) {
        const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString()
                                            << ", split point " << splitPoints.front()
                                            << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, msg};
    }

    if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMax() == splitPoints.back())) {
        const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString()
                                            << ", split point " << splitPoints.back()
                                            << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, msg};
    }

    BSONObjBuilder cmd;
    cmd.append("splitChunk", nss.ns());
    cmd.append("configdb",
               Grid::get(txn)->shardRegistry()->getConfigServerConnectionString().toString());
    cmd.append("from", shardId.toString());
    cmd.append("keyPattern", shardKeyPattern.toBSON());
    collectionVersion.appendForCommands(&cmd);
    chunkRange.append(&cmd);
    cmd.append("splitKeys", splitPoints);

    BSONObj cmdObj = cmd.obj();

    Status status{ErrorCodes::InternalError, "Uninitialized value"};
    BSONObj cmdResponse;

    auto shardStatus = Grid::get(txn)->shardRegistry()->getShard(txn, shardId);
    if (!shardStatus.isOK()) {
        status = shardStatus.getStatus();
    } else {
        auto cmdStatus = shardStatus.getValue()->runCommandWithFixedRetryAttempts(
            txn,
            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
            "admin",
            cmdObj,
            Shard::RetryPolicy::kNotIdempotent);
        if (!cmdStatus.isOK()) {
            status = std::move(cmdStatus.getStatus());
        } else {
            status = std::move(cmdStatus.getValue().commandStatus);
            cmdResponse = std::move(cmdStatus.getValue().response);
        }
    }

    if (!status.isOK()) {
        log() << "Split chunk " << redact(cmdObj) << " failed" << causedBy(redact(status));
        return {status.code(), str::stream() << "split failed due to " << status.toString()};
    }

    BSONElement shouldMigrateElement;
    status = bsonExtractTypedField(cmdResponse, kShouldMigrate, Object, &shouldMigrateElement);
    if (status.isOK()) {
        auto chunkRangeStatus = ChunkRange::fromBSON(shouldMigrateElement.embeddedObject());
        if (!chunkRangeStatus.isOK()) {
            return chunkRangeStatus.getStatus();
        }

        return boost::optional<ChunkRange>(std::move(chunkRangeStatus.getValue()));
    } else if (status != ErrorCodes::NoSuchKey) {
        warning()
            << "Chunk migration will be skipped because splitChunk returned invalid response: "
            << redact(cmdResponse) << ". Extracting " << kShouldMigrate << " field failed"
            << causedBy(redact(status));
    }

    return boost::optional<ChunkRange>();
}
StatusWith<ShardDrainingStatus> ShardingCatalogManager::removeShard(OperationContext* opCtx,
                                                                    const ShardId& shardId) {
    // Check preconditions for removing the shard
    std::string name = shardId.toString();
    auto countStatus = _runCountCommandOnConfig(
        opCtx,
        ShardType::ConfigNS,
        BSON(ShardType::name() << NE << name << ShardType::draining(true)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    if (countStatus.getValue() > 0) {
        return Status(ErrorCodes::ConflictingOperationInProgress,
                      "Can't have more than one draining shard at a time");
    }

    countStatus =
        _runCountCommandOnConfig(opCtx, ShardType::ConfigNS, BSON(ShardType::name() << NE << name));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    if (countStatus.getValue() == 0) {
        return Status(ErrorCodes::IllegalOperation, "Can't remove last shard");
    }

    // Figure out if shard is already draining
    countStatus = _runCountCommandOnConfig(
        opCtx, ShardType::ConfigNS, BSON(ShardType::name() << name << ShardType::draining(true)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }

    auto* const shardRegistry = Grid::get(opCtx)->shardRegistry();

    if (countStatus.getValue() == 0) {
        log() << "going to start draining shard: " << name;

        // Record start in changelog
        const Status logStatus = Grid::get(opCtx)->catalogClient()->logChangeChecked(
            opCtx,
            "removeShard.start",
            "",
            BSON("shard" << name),
            ShardingCatalogClient::kLocalWriteConcern);
        if (!logStatus.isOK()) {
            return logStatus;
        }

        auto updateStatus = Grid::get(opCtx)->catalogClient()->updateConfigDocument(
            opCtx,
            ShardType::ConfigNS,
            BSON(ShardType::name() << name),
            BSON("$set" << BSON(ShardType::draining(true))),
            false,
            ShardingCatalogClient::kLocalWriteConcern);
        if (!updateStatus.isOK()) {
            log() << "error starting removeShard: " << name
                  << causedBy(redact(updateStatus.getStatus()));
            return updateStatus.getStatus();
        }

        shardRegistry->reload(opCtx);

        return ShardDrainingStatus::STARTED;
    }

    // Draining has already started, now figure out how many chunks and databases are still on the
    // shard.
    countStatus =
        _runCountCommandOnConfig(opCtx, ChunkType::ConfigNS, BSON(ChunkType::shard(name)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    const long long chunkCount = countStatus.getValue();

    countStatus =
        _runCountCommandOnConfig(opCtx, DatabaseType::ConfigNS, BSON(DatabaseType::primary(name)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    const long long databaseCount = countStatus.getValue();

    if (chunkCount > 0 || databaseCount > 0) {
        // Still more draining to do
        LOG(0) << "chunkCount: " << chunkCount;
        LOG(0) << "databaseCount: " << databaseCount;
        return ShardDrainingStatus::ONGOING;
    }

    // Draining is done, now finish removing the shard.
    log() << "going to remove shard: " << name;
    audit::logRemoveShard(opCtx->getClient(), name);

    Status status = Grid::get(opCtx)->catalogClient()->removeConfigDocuments(
        opCtx,
        ShardType::ConfigNS,
        BSON(ShardType::name() << name),
        ShardingCatalogClient::kLocalWriteConcern);
    if (!status.isOK()) {
        log() << "Error concluding removeShard operation on: " << name
              << "; err: " << status.reason();
        return status;
    }

    shardConnectionPool.removeHost(name);
    ReplicaSetMonitor::remove(name);

    shardRegistry->reload(opCtx);

    // Record finish in changelog
    Grid::get(opCtx)->catalogClient()->logChange(
        opCtx,
        "removeShard",
        "",
        BSON("shard" << name),
        ShardingCatalogClient::kLocalWriteConcern);

    return ShardDrainingStatus::COMPLETED;
}
MigrateInfo* BalancerPolicy::balance(const string& ns,
                                     const DistributionStatus& distribution,
                                     int balancedLastTime) {
    // 1) check for shards that policy requires us to move off of: draining only
    // 2) check tag policy violations
    // 3) then we make sure chunks are balanced for each tag
    // ----

    // 1) check things we have to move
    {
        for (const auto& stat : distribution.getStats()) {
            if (!stat.isDraining)
                continue;

            if (distribution.numberOfChunksInShard(stat.shardId) == 0)
                continue;

            // now we know we need to move the chunks off this shard
            // we will if we are allowed
            const vector<ChunkType>& chunks = distribution.getChunks(stat.shardId);
            unsigned numJumboChunks = 0;

            // since we have to move all chunks, let's just do it in order
            for (unsigned i = 0; i < chunks.size(); i++) {
                const ChunkType& chunkToMove = chunks[i];
                if (chunkToMove.getJumbo()) {
                    numJumboChunks++;
                    continue;
                }

                string tag = distribution.getTagForChunk(chunkToMove);
                const ShardId to = distribution.getBestReceieverShard(tag);

                if (to.size() == 0) {
                    warning() << "want to move chunk: " << chunkToMove << "(" << tag << ") "
                              << "from " << stat.shardId << " but can't find anywhere to put it";
                    continue;
                }

                log() << "going to move " << chunkToMove << " from " << stat.shardId << "(" << tag
                      << ")" << " to " << to;

                return new MigrateInfo(ns, to, stat.shardId, chunkToMove);
            }

            warning() << "can't find any chunk to move from: " << stat.shardId
                      << " but we want to. "
                      << " numJumboChunks: " << numJumboChunks;
        }
    }

    // 2) tag violations
    if (distribution.tags().size() > 0) {
        for (const auto& stat : distribution.getStats()) {
            const vector<ChunkType>& chunks = distribution.getChunks(stat.shardId);
            for (unsigned j = 0; j < chunks.size(); j++) {
                const ChunkType& chunk = chunks[j];
                string tag = distribution.getTagForChunk(chunk);

                if (tag.empty() || stat.shardTags.count(tag))
                    continue;

                // uh oh, this chunk is in the wrong place
                log() << "chunk " << chunk << " is not on a shard with the right tag: " << tag;

                if (chunk.getJumbo()) {
                    warning() << "chunk " << chunk << " is jumbo, so cannot be moved";
                    continue;
                }

                const ShardId to = distribution.getBestReceieverShard(tag);
                if (to.size() == 0) {
                    log() << "no where to put it :(";
                    continue;
                }

                invariant(to != stat.shardId);
                log() << " going to move to: " << to;
                return new MigrateInfo(ns, to, stat.shardId, chunk);
            }
        }
    }

    // 3) for each tag balance
    int threshold = 8;
    if (balancedLastTime || distribution.totalChunks() < 20)
        threshold = 2;
    else if (distribution.totalChunks() < 80)
        threshold = 4;

    // randomize the order in which we balance the tags
    // this is so that one bad tag doesn't prevent others from getting balanced
    vector<string> tags;
    {
        set<string> t = distribution.tags();
        for (set<string>::const_iterator i = t.begin(); i != t.end(); ++i)
            tags.push_back(*i);
        tags.push_back("");

        std::random_shuffle(tags.begin(), tags.end());
    }

    for (unsigned i = 0; i < tags.size(); i++) {
        string tag = tags[i];

        const ShardId from = distribution.getMostOverloadedShard(tag);
        if (from.size() == 0)
            continue;

        unsigned max = distribution.numberOfChunksInShardWithTag(from, tag);
        if (max == 0)
            continue;

        string to = distribution.getBestReceieverShard(tag);
        if (to.size() == 0) {
            log() << "no available shards to take chunks for tag [" << tag << "]";
            return NULL;
        }

        unsigned min = distribution.numberOfChunksInShardWithTag(to, tag);

        const int imbalance = max - min;

        LOG(1) << "collection : " << ns;
        LOG(1) << "donor : " << from << " chunks on " << max;
        LOG(1) << "receiver : " << to << " chunks on " << min;
        LOG(1) << "threshold : " << threshold;

        if (imbalance < threshold)
            continue;

        const vector<ChunkType>& chunks = distribution.getChunks(from);
        unsigned numJumboChunks = 0;

        for (unsigned j = 0; j < chunks.size(); j++) {
            const ChunkType& chunk = chunks[j];
            if (distribution.getTagForChunk(chunk) != tag)
                continue;

            if (chunk.getJumbo()) {
                numJumboChunks++;
                continue;
            }

            log() << " ns: " << ns << " going to move " << chunk << " from: " << from
                  << " to: " << to << " tag [" << tag << "]";
            return new MigrateInfo(ns, to, from, chunk);
        }

        if (numJumboChunks) {
            error() << "shard: " << from << " ns: " << ns
                    << " has too many chunks, but they are all jumbo "
                    << " numJumboChunks: " << numJumboChunks;
            continue;
        }

        verify(false);  // should be impossible
    }

    // Everything is balanced here!
    return NULL;
}