Status CollectionShardingState::waitForClean(OperationContext* opCtx, const NamespaceString& nss, OID const& epoch, ChunkRange orphanRange) { while (true) { boost::optional<CleanupNotification> stillScheduled; { AutoGetCollection autoColl(opCtx, nss, MODE_IX); auto css = CollectionShardingState::get(opCtx, nss); { // First, see if collection was dropped, but do it in a separate scope in order to // not hold reference on it, which would make it appear in use auto metadata = css->_metadataManager->getActiveMetadata(css->_metadataManager); if (!metadata || metadata->getCollVersion().epoch() != epoch) { return {ErrorCodes::StaleShardVersion, "Collection being migrated was dropped"}; } } stillScheduled = css->trackOrphanedDataCleanup(orphanRange); if (!stillScheduled) { log() << "Finished deleting " << nss.ns() << " range " << redact(orphanRange.toString()); return Status::OK(); } } log() << "Waiting for deletion of " << nss.ns() << " range " << orphanRange; Status result = stillScheduled->waitStatus(opCtx); if (!result.isOK()) { return result.withContext(str::stream() << "Failed to delete orphaned " << nss.ns() << " range " << orphanRange.toString()); } } MONGO_UNREACHABLE; }
/* static */ Status CollectionShardingState::waitForClean(OperationContext* opCtx, NamespaceString nss, OID const& epoch, ChunkRange orphanRange) { do { auto stillScheduled = boost::optional<CleanupNotification>(); { AutoGetCollection autoColl(opCtx, nss, MODE_IX); // First, see if collection was dropped. auto css = CollectionShardingState::get(opCtx, nss); { auto metadata = css->_metadataManager->getActiveMetadata(css->_metadataManager); if (!metadata || metadata->getCollVersion().epoch() != epoch) { return {ErrorCodes::StaleShardVersion, "Collection being migrated was dropped"}; } } // drop metadata stillScheduled = css->trackOrphanedDataCleanup(orphanRange); if (!stillScheduled) { log() << "Finished deleting " << nss.ns() << " range " << redact(orphanRange.toString()); return Status::OK(); } } // drop collection lock log() << "Waiting for deletion of " << nss.ns() << " range " << orphanRange; Status result = stillScheduled->waitStatus(opCtx); if (!result.isOK()) { return Status{result.code(), str::stream() << "Failed to delete orphaned " << nss.ns() << " range " << orphanRange.toString() << ": " << result.reason()}; } } while (true); MONGO_UNREACHABLE; }
void updateChunkWriteStatsAndSplitIfNeeded(OperationContext* opCtx, ChunkManager* manager, Chunk* chunk, long dataWritten) { // Disable lastError tracking so that any errors, which occur during auto-split do not get // bubbled up on the client connection doing a write. LastError::Disabled d(&LastError::get(cc())); const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration(); const bool minIsInf = (0 == manager->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk->getMin())); const bool maxIsInf = (0 == manager->getShardKeyPattern().getKeyPattern().globalMax().woCompare(chunk->getMax())); const uint64_t chunkBytesWritten = chunk->addBytesWritten(dataWritten); const uint64_t desiredChunkSize = calculateDesiredChunkSize(balancerConfig->getMaxChunkSizeBytes(), manager->numChunks()); if (!chunk->shouldSplit(desiredChunkSize, minIsInf, maxIsInf)) { return; } const NamespaceString nss(manager->getns()); if (!manager->_autoSplitThrottle._splitTickets.tryAcquire()) { LOG(1) << "won't auto split because not enough tickets: " << nss; return; } TicketHolderReleaser releaser(&(manager->_autoSplitThrottle._splitTickets)); const ChunkRange chunkRange(chunk->getMin(), chunk->getMax()); try { // Ensure we have the most up-to-date balancer configuration uassertStatusOK(balancerConfig->refreshAndCheck(opCtx)); if (!balancerConfig->getShouldAutoSplit()) { return; } LOG(1) << "about to initiate autosplit: " << redact(chunk->toString()) << " dataWritten: " << chunkBytesWritten << " desiredChunkSize: " << desiredChunkSize; const uint64_t chunkSizeToUse = [&]() { const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2; if (estNumSplitPoints >= kTooManySplitPoints) { // The current desired chunk size will split the chunk into lots of small chunk and // at the worst case this can result into thousands of chunks. So check and see if a // bigger value can be used. return std::min(chunkBytesWritten, balancerConfig->getMaxChunkSizeBytes()); } else { return desiredChunkSize; } }(); auto splitPoints = uassertStatusOK(shardutil::selectChunkSplitPoints(opCtx, chunk->getShardId(), nss, manager->getShardKeyPattern(), chunkRange, chunkSizeToUse, boost::none)); if (splitPoints.size() <= 1) { // No split points means there isn't enough data to split on; 1 split point means we // have // between half the chunk size to full chunk size so there is no need to split yet chunk->clearBytesWritten(); return; } if (minIsInf || maxIsInf) { // We don't want to reset _dataWritten since we want to check the other side right away } else { // We're splitting, so should wait a bit chunk->clearBytesWritten(); } // We assume that if the chunk being split is the first (or last) one on the collection, // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the // very first (or last) key as a split point. // // This heuristic is skipped for "special" shard key patterns that are not likely to produce // monotonically increasing or decreasing values (e.g. hashed shard keys). if (KeyPattern::isOrderedKeyPattern(manager->getShardKeyPattern().toBSON())) { if (minIsInf) { BSONObj key = findExtremeKeyForShard( opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), true); if (!key.isEmpty()) { splitPoints.front() = key.getOwned(); } } else if (maxIsInf) { BSONObj key = findExtremeKeyForShard( opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), false); if (!key.isEmpty()) { splitPoints.back() = key.getOwned(); } } } const auto suggestedMigrateChunk = uassertStatusOK(shardutil::splitChunkAtMultiplePoints(opCtx, chunk->getShardId(), nss, manager->getShardKeyPattern(), manager->getVersion(), chunkRange, splitPoints)); // Balance the resulting chunks if the option is enabled and if the shard suggested a chunk // to balance const bool shouldBalance = [&]() { if (!balancerConfig->shouldBalanceForAutoSplit()) return false; auto collStatus = Grid::get(opCtx)->catalogClient()->getCollection(opCtx, manager->getns()); if (!collStatus.isOK()) { log() << "Auto-split for " << nss << " failed to load collection metadata" << causedBy(redact(collStatus.getStatus())); return false; } return collStatus.getValue().value.getAllowBalance(); }(); log() << "autosplitted " << nss << " chunk: " << redact(chunk->toString()) << " into " << (splitPoints.size() + 1) << " parts (desiredChunkSize " << desiredChunkSize << ")" << (suggestedMigrateChunk ? "" : (std::string) " (migrate suggested" + (shouldBalance ? ")" : ", but no migrations allowed)")); // Reload the chunk manager after the split auto routingInfo = uassertStatusOK( Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx, nss)); if (!shouldBalance || !suggestedMigrateChunk) { return; } // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot // spot from staying on a single shard. This is based on the assumption that succeeding // inserts will fall on the top chunk. // We need to use the latest chunk manager (after the split) in order to have the most // up-to-date view of the chunk we are about to move auto suggestedChunk = routingInfo.cm()->findIntersectingChunkWithSimpleCollation( suggestedMigrateChunk->getMin()); ChunkType chunkToMove; chunkToMove.setNS(nss.ns()); chunkToMove.setShard(suggestedChunk->getShardId()); chunkToMove.setMin(suggestedChunk->getMin()); chunkToMove.setMax(suggestedChunk->getMax()); chunkToMove.setVersion(suggestedChunk->getLastmod()); uassertStatusOK(configsvr_client::rebalanceChunk(opCtx, chunkToMove)); // Ensure the collection gets reloaded because of the move Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss); } catch (const DBException& ex) { chunk->clearBytesWritten(); if (ErrorCodes::isStaleShardingError(ErrorCodes::Error(ex.getCode()))) { log() << "Unable to auto-split chunk " << redact(chunkRange.toString()) << causedBy(ex) << ", going to invalidate routing table entry for " << nss; Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss); } } }
StatusWith<boost::optional<ChunkRange>> splitChunkAtMultiplePoints( OperationContext* txn, const ShardId& shardId, const NamespaceString& nss, const ShardKeyPattern& shardKeyPattern, ChunkVersion collectionVersion, const ChunkRange& chunkRange, const std::vector<BSONObj>& splitPoints) { invariant(!splitPoints.empty()); const size_t kMaxSplitPoints = 8192; if (splitPoints.size() > kMaxSplitPoints) { return {ErrorCodes::BadValue, str::stream() << "Cannot split chunk in more than " << kMaxSplitPoints << " parts at a time."}; } // Sanity check that we are not attempting to split at the boundaries of the chunk. This check // is already performed at chunk split commit time, but we are performing it here for parity // with old auto-split code, which might rely on it. if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMin() == splitPoints.front())) { const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString() << ", split point " << splitPoints.front() << " is exactly on chunk bounds"); return {ErrorCodes::CannotSplit, msg}; } if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMax() == splitPoints.back())) { const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString() << ", split point " << splitPoints.back() << " is exactly on chunk bounds"); return {ErrorCodes::CannotSplit, msg}; } BSONObjBuilder cmd; cmd.append("splitChunk", nss.ns()); cmd.append("configdb", Grid::get(txn)->shardRegistry()->getConfigServerConnectionString().toString()); cmd.append("from", shardId.toString()); cmd.append("keyPattern", shardKeyPattern.toBSON()); collectionVersion.appendForCommands(&cmd); chunkRange.append(&cmd); cmd.append("splitKeys", splitPoints); BSONObj cmdObj = cmd.obj(); Status status{ErrorCodes::InternalError, "Uninitialized value"}; BSONObj cmdResponse; auto shardStatus = Grid::get(txn)->shardRegistry()->getShard(txn, shardId); if (!shardStatus.isOK()) { status = shardStatus.getStatus(); } else { auto cmdStatus = shardStatus.getValue()->runCommandWithFixedRetryAttempts( txn, ReadPreferenceSetting{ReadPreference::PrimaryOnly}, "admin", cmdObj, Shard::RetryPolicy::kNotIdempotent); if (!cmdStatus.isOK()) { status = std::move(cmdStatus.getStatus()); } else { status = std::move(cmdStatus.getValue().commandStatus); cmdResponse = std::move(cmdStatus.getValue().response); } } if (!status.isOK()) { log() << "Split chunk " << redact(cmdObj) << " failed" << causedBy(redact(status)); return {status.code(), str::stream() << "split failed due to " << status.toString()}; } BSONElement shouldMigrateElement; status = bsonExtractTypedField(cmdResponse, kShouldMigrate, Object, &shouldMigrateElement); if (status.isOK()) { auto chunkRangeStatus = ChunkRange::fromBSON(shouldMigrateElement.embeddedObject()); if (!chunkRangeStatus.isOK()) { return chunkRangeStatus.getStatus(); } return boost::optional<ChunkRange>(std::move(chunkRangeStatus.getValue())); } else if (status != ErrorCodes::NoSuchKey) { warning() << "Chunk migration will be skipped because splitChunk returned invalid response: " << redact(cmdResponse) << ". Extracting " << kShouldMigrate << " field failed" << causedBy(redact(status)); } return boost::optional<ChunkRange>(); }
void MetadataManager::setFilteringMetadata(CollectionMetadata remoteMetadata) { stdx::lock_guard<stdx::mutex> lg(_managerLock); // Collection is becoming sharded if (_metadata.empty()) { LOG(0) << "Marking collection " << _nss.ns() << " as " << remoteMetadata.toStringBasic(); invariant(_receivingChunks.empty()); invariant(_rangesToClean.isEmpty()); _setActiveMetadata(lg, std::move(remoteMetadata)); return; } const auto& activeMetadata = _metadata.back()->metadata; // If the metadata being installed has a different epoch from ours, this means the collection // was dropped and recreated, so we must entirely reset the metadata state if (activeMetadata.getCollVersion().epoch() != remoteMetadata.getCollVersion().epoch()) { LOG(0) << "Updating metadata for collection " << _nss.ns() << " from " << activeMetadata.toStringBasic() << " to " << remoteMetadata.toStringBasic() << " due to epoch change"; _receivingChunks.clear(); _clearAllCleanups(lg); _metadata.clear(); _setActiveMetadata(lg, std::move(remoteMetadata)); return; } // We already have the same or newer version if (activeMetadata.getCollVersion() >= remoteMetadata.getCollVersion()) { LOG(1) << "Ignoring update of active metadata " << activeMetadata.toStringBasic() << " with an older " << remoteMetadata.toStringBasic(); return; } LOG(0) << "Updating metadata for collection " << _nss.ns() << " from " << activeMetadata.toStringBasic() << " to " << remoteMetadata.toStringBasic() << " due to version change"; // Resolve any receiving chunks, which might have completed by now for (auto it = _receivingChunks.begin(); it != _receivingChunks.end();) { const ChunkRange receivingRange(it->first, it->second); if (!remoteMetadata.rangeOverlapsChunk(receivingRange)) { ++it; continue; } // The remote metadata contains a chunk we were earlier in the process of receiving, so we // deem it successfully received LOG(2) << "Verified chunk " << redact(receivingRange.toString()) << " for collection " << _nss.ns() << " has been migrated to this shard earlier"; _receivingChunks.erase(it); it = _receivingChunks.begin(); } _setActiveMetadata(lg, std::move(remoteMetadata)); }