/**
 * Serializes the manager's range bookkeeping into 'builder' while holding the manager lock:
 * first whatever '_rangesToClean' appends, then a "pendingChunks" array built from
 * '_receivingChunks', and finally - only if some metadata is installed - an
 * "activeMetadataRanges" array built from the chunks of the most recent metadata.
 */
void MetadataManager::append(BSONObjBuilder* builder) const {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);

    _rangesToClean.append(builder);

    // Ranges this shard is currently in the process of receiving
    BSONArrayBuilder pendingArr(builder->subarrayStart("pendingChunks"));
    for (auto recvIt = _receivingChunks.begin(); recvIt != _receivingChunks.end(); ++recvIt) {
        BSONObjBuilder rangeObj;
        ChunkRange pendingRange(recvIt->first, recvIt->second);
        pendingRange.append(&rangeObj);
        pendingArr.append(rangeObj.done());
    }
    pendingArr.done();

    // Without any installed metadata there are no active ranges to report
    if (_metadata.empty()) {
        return;
    }

    BSONArrayBuilder activeArr(builder->subarrayStart("activeMetadataRanges"));
    const auto& activeChunks = _metadata.back()->metadata.getChunks();
    for (auto chunkIt = activeChunks.begin(); chunkIt != activeChunks.end(); ++chunkIt) {
        BSONObjBuilder rangeObj;
        ChunkRange activeRange(chunkIt->first, chunkIt->second);
        activeRange.append(&rangeObj);
        activeArr.append(rangeObj.done());
    }
    activeArr.done();
}
/**
 * Builds a single range that spans two existing ranges: the manager and shard id are taken from
 * 'min', the lower bound is min.getMin() and the upper bound is max.getMax().
 *
 * The invariants in the body require that both inputs report the same shard id and the same
 * manager, and that they are adjacent, i.e. 'min' ends exactly where 'max' begins.
 */
ChunkRange::ChunkRange(const ChunkRange& min, const ChunkRange& max) : _manager(min.getManager()), _shardId(min.getShardId()), _min(min.getMin()), _max(max.getMax()) { invariant(min.getShardId() == max.getShardId()); invariant(min.getManager() == max.getManager()); invariant(min.getMax() == max.getMin()); }
void MoveChunkRequest::appendAsCommand(BSONObjBuilder* builder, const NamespaceString& nss, const ChunkVersion& shardVersion, const ConnectionString& configServerConnectionString, const ShardId& fromShardId, const ShardId& toShardId, const ChunkRange& range, int64_t maxChunkSizeBytes, const MigrationSecondaryThrottleOptions& secondaryThrottle, bool waitForDelete, bool takeDistLock) { invariant(builder->asTempObj().isEmpty()); invariant(nss.isValid()); builder->append(kMoveChunk, nss.ns()); shardVersion.appendForCommands(builder); builder->append(kConfigServerConnectionString, configServerConnectionString.toString()); builder->append(kFromShardId, fromShardId.toString()); builder->append(kToShardId, toShardId.toString()); range.append(builder); builder->append(kMaxChunkSizeBytes, static_cast<long long>(maxChunkSizeBytes)); secondaryThrottle.append(builder); builder->append(kWaitForDelete, waitForDelete); builder->append(kTakeDistLock, takeDistLock); }
void MoveChunkRequest::appendAsCommand(BSONObjBuilder* builder, const NamespaceString& nss, ChunkVersion chunkVersion, const ConnectionString& configServerConnectionString, const ShardId& fromShardId, const ShardId& toShardId, const ChunkRange& range, int64_t maxChunkSizeBytes, const MigrationSecondaryThrottleOptions& secondaryThrottle, bool waitForDelete) { invariant(builder->asTempObj().isEmpty()); invariant(nss.isValid()); builder->append(kMoveChunk, nss.ns()); chunkVersion.appendToCommand(builder); // 3.4 shard compatibility builder->append(kEpoch, chunkVersion.epoch()); // config connection string is included for 3.4 shard compatibility builder->append(kConfigServerConnectionString, configServerConnectionString.toString()); builder->append(kFromShardId, fromShardId.toString()); builder->append(kToShardId, toShardId.toString()); range.append(builder); builder->append(kMaxChunkSizeBytes, static_cast<long long>(maxChunkSizeBytes)); secondaryThrottle.append(builder); builder->append(kWaitForDelete, waitForDelete); builder->append(kTakeDistLock, false); }
void MetadataManager::_removeRangeToClean_inlock(const ChunkRange& range) { auto it = _rangesToClean.upper_bound(range.getMin()); // We want our iterator to point at the greatest value // that is still less than or equal to range. if (it != _rangesToClean.begin()) { --it; } for (; it != _rangesToClean.end() && it->first < range.getMax();) { if (it->second <= range.getMin()) { ++it; continue; } // There's overlap between *it and range so we remove *it // and then replace with new ranges. BSONObj oldMin = it->first, oldMax = it->second; _rangesToClean.erase(it++); if (oldMin < range.getMin()) { _addRangeToClean_inlock(ChunkRange(oldMin, range.getMin())); } if (oldMax > range.getMax()) { _addRangeToClean_inlock(ChunkRange(range.getMax(), oldMax)); } } }
/**
 * Blocks until no cleanup overlapping 'orphanRange' in 'nss' remains scheduled.
 *
 * Each iteration takes the collection lock in MODE_IX, verifies that the collection still has
 * active metadata carrying 'epoch', and asks the sharding state whether a cleanup for the range
 * is still scheduled. If so, the lock is released and the cleanup's notification is waited on
 * before checking again.
 *
 * Returns:
 *  - Status::OK() once nothing is scheduled for the range any more;
 *  - ErrorCodes::StaleShardVersion if there is no active metadata or its epoch differs;
 *  - the failed wait status, with added context, if waiting on the notification fails.
 */
Status CollectionShardingState::waitForClean(OperationContext* opCtx,
                                             const NamespaceString& nss,
                                             OID const& epoch,
                                             ChunkRange orphanRange) {
    while (true) {
        boost::optional<CleanupNotification> stillScheduled;

        {
            AutoGetCollection autoColl(opCtx, nss, MODE_IX);
            auto css = CollectionShardingState::get(opCtx, nss);

            {
                // First, see if collection was dropped, but do it in a separate scope in order to
                // not hold reference on it, which would make it appear in use
                auto metadata = css->_metadataManager->getActiveMetadata(css->_metadataManager);
                if (!metadata || metadata->getCollVersion().epoch() != epoch) {
                    return {ErrorCodes::StaleShardVersion, "Collection being migrated was dropped"};
                }
            }

            stillScheduled = css->trackOrphanedDataCleanup(orphanRange);
            if (!stillScheduled) {
                log() << "Finished deleting " << nss.ns() << " range "
                      << redact(orphanRange.toString());
                return Status::OK();
            }
        }

        // The collection lock has been released at this point. The range is redacted here in
        // the same way as in the "Finished deleting" message above, so that both log lines
        // treat the range bounds consistently.
        log() << "Waiting for deletion of " << nss.ns() << " range "
              << redact(orphanRange.toString());

        Status result = stillScheduled->waitStatus(opCtx);
        if (!result.isOK()) {
            return result.withContext(str::stream() << "Failed to delete orphaned " << nss.ns()
                                                    << " range "
                                                    << orphanRange.toString());
        }
    }

    MONGO_UNREACHABLE;
}
/**
 * Serializes the manager's range bookkeeping into 'builder' while holding the manager lock.
 * Three arrays are written, in this order: "rangesToClean", "pendingChunks" and
 * "activeMetadataRanges". Every element is the result of ChunkRange::append for one entry.
 */
void MetadataManager::append(BSONObjBuilder* builder) {
    stdx::lock_guard<stdx::mutex> scopedLock(_managerLock);

    // Ranges scheduled for cleanup
    BSONArrayBuilder cleanArr(builder->subarrayStart("rangesToClean"));
    for (auto cleanIt = _rangesToClean.begin(); cleanIt != _rangesToClean.end(); ++cleanIt) {
        BSONObjBuilder rangeObj;
        ChunkRange cleanRange(cleanIt->first, cleanIt->second);
        cleanRange.append(&rangeObj);
        cleanArr.append(rangeObj.done());
    }
    cleanArr.done();

    // Ranges currently being received
    BSONArrayBuilder pendingArr(builder->subarrayStart("pendingChunks"));
    for (auto recvIt = _receivingChunks.begin(); recvIt != _receivingChunks.end(); ++recvIt) {
        BSONObjBuilder rangeObj;
        ChunkRange pendingRange(recvIt->first, recvIt->second);
        pendingRange.append(&rangeObj);
        pendingArr.append(rangeObj.done());
    }
    pendingArr.done();

    // Chunks of the active metadata
    BSONArrayBuilder activeArr(builder->subarrayStart("activeMetadataRanges"));
    const auto& activeChunks = _activeMetadataTracker->metadata->getChunks();
    for (auto chunkIt = activeChunks.begin(); chunkIt != activeChunks.end(); ++chunkIt) {
        BSONObjBuilder rangeObj;
        ChunkRange activeRange(chunkIt->first, chunkIt->second);
        activeRange.append(&rangeObj);
        activeArr.append(rangeObj.done());
    }
    activeArr.done();
}
void MetadataManager::beginReceive(const ChunkRange& range) { stdx::lock_guard<stdx::mutex> scopedLock(_managerLock); // Collection is not known to be sharded if the active metadata tracker is null invariant(_activeMetadataTracker); // If range is contained within pending chunks, this means a previous migration must have failed // and we need to clean all overlaps RangeVector overlappedChunks; getRangeMapOverlap(_receivingChunks, range.getMin(), range.getMax(), &overlappedChunks); for (const auto& overlapChunkMin : overlappedChunks) { auto itRecv = _receivingChunks.find(overlapChunkMin.first); invariant(itRecv != _receivingChunks.end()); const ChunkRange receivingRange(itRecv->first, itRecv->second); _receivingChunks.erase(itRecv); // Make sure any potentially partially copied chunks are scheduled to be cleaned up _addRangeToClean_inlock(receivingRange); } // Need to ensure that the background range deleter task won't delete the range we are about to // receive _removeRangeToClean_inlock(range); _receivingChunks.insert(std::make_pair(range.getMin().getOwned(), range.getMax().getOwned())); // For compatibility with the current range deleter, update the pending chunks on the collection // metadata to include the chunk being received ChunkType chunk; chunk.setMin(range.getMin()); chunk.setMax(range.getMax()); _setActiveMetadata_inlock(_activeMetadataTracker->metadata->clonePlusPending(chunk)); }
void MetadataManager::forgetReceive(const ChunkRange& range) { stdx::lock_guard<stdx::mutex> scopedLock(_managerLock); { auto it = _receivingChunks.find(range.getMin()); invariant(it != _receivingChunks.end()); // Verify entire ChunkRange is identical, not just the min key. invariant(it->second == range.getMax()); _receivingChunks.erase(it); } // This is potentially a partially received data, which needs to be cleaned up _addRangeToClean_inlock(range); // For compatibility with the current range deleter, update the pending chunks on the collection // metadata to exclude the chunk being received, which was added in beginReceive ChunkType chunk; chunk.setMin(range.getMin()); chunk.setMax(range.getMax()); _setActiveMetadata_inlock(_activeMetadataTracker->metadata->cloneMinusPending(chunk)); }
/**
 * Blocks until no cleanup overlapping 'orphanRange' in 'nss' remains scheduled.
 *
 * Each iteration takes the collection lock in MODE_IX, verifies that the collection still has
 * active metadata carrying 'epoch', and asks the sharding state whether a cleanup for the range
 * is still scheduled. If so, the lock is released and the cleanup's notification is waited on
 * before checking again.
 *
 * Returns:
 *  - Status::OK() once nothing is scheduled for the range any more;
 *  - ErrorCodes::StaleShardVersion if there is no active metadata or its epoch differs;
 *  - a Status with the code of the failed wait and a message naming the namespace and range.
 */
/* static */
Status CollectionShardingState::waitForClean(OperationContext* opCtx,
                                             NamespaceString nss,
                                             OID const& epoch,
                                             ChunkRange orphanRange) {
    do {
        auto stillScheduled = boost::optional<CleanupNotification>();

        {
            AutoGetCollection autoColl(opCtx, nss, MODE_IX);

            // First, see if collection was dropped.
            auto css = CollectionShardingState::get(opCtx, nss);
            {
                auto metadata = css->_metadataManager->getActiveMetadata(css->_metadataManager);
                if (!metadata || metadata->getCollVersion().epoch() != epoch) {
                    return {ErrorCodes::StaleShardVersion, "Collection being migrated was dropped"};
                }
            }  // drop metadata

            stillScheduled = css->trackOrphanedDataCleanup(orphanRange);
            if (!stillScheduled) {
                log() << "Finished deleting " << nss.ns() << " range "
                      << redact(orphanRange.toString());
                return Status::OK();
            }
        }  // drop collection lock

        // The range is redacted here in the same way as in the "Finished deleting" message
        // above, so that both log lines treat the range bounds consistently.
        log() << "Waiting for deletion of " << nss.ns() << " range "
              << redact(orphanRange.toString());

        Status result = stillScheduled->waitStatus(opCtx);
        if (!result.isOK()) {
            return Status{result.code(),
                          str::stream() << "Failed to delete orphaned " << nss.ns() << " range "
                                        << orphanRange.toString()
                                        << ": "
                                        << result.reason()};
        }
    } while (true);

    MONGO_UNREACHABLE;
}
void MetadataManager::_addRangeToClean_inlock(const ChunkRange& range) { invariant(!rangeMapOverlaps(_rangesToClean, range.getMin(), range.getMax())); invariant(!rangeMapOverlaps(_receivingChunks, range.getMin(), range.getMax())); _rangesToClean.insert(std::make_pair(range.getMin().getOwned(), range.getMax().getOwned())); // If _rangesToClean was previously empty, we need to start the collection range deleter if (_rangesToClean.size() == 1UL) { ShardingState::get(_serviceContext)->scheduleCleanup(_nss); } }
/**
 * Abandons the in-progress receipt of 'range'. Metadata must be installed, the range must not
 * overlap a chunk which is in use, and an entry keyed by range.getMin() must exist in
 * '_receivingChunks'. That entry is removed and the range is pushed to the ranges to clean with
 * a default-constructed Date_t; the result of that push is abandoned.
 */
void MetadataManager::forgetReceive(ChunkRange const& range) {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);
    invariant(!_metadata.empty());

    // The chunk may have been received only partially and so needs to be cleaned up. None of
    // its documents are in use, which is why they can go straight to the deletion queue.
    log() << "Abandoning in-migration of " << _nss.ns() << " range " << range
          << "; scheduling deletion of any documents already copied";

    invariant(!_overlapsInUseChunk(lg, range));

    auto receivingIt = _receivingChunks.find(range.getMin());
    invariant(receivingIt != _receivingChunks.end());
    _receivingChunks.erase(receivingIt);

    _pushRangeToClean(lg, range, Date_t{}).abandon();
}
/**
 * Runs the "splitVector" command for 'chunkRange' of 'nss' against the shard 'shardId' and
 * returns an owned copy of each element of the "splitKeys" field of the response.
 *
 * 'maxObjs', when set, is sent as "maxChunkObjects". A failure to find the shard, to run the
 * command, or a non-OK command status is returned as an error.
 */
StatusWith<std::vector<BSONObj>> selectChunkSplitPoints(OperationContext* txn,
                                                        const ShardId& shardId,
                                                        const NamespaceString& nss,
                                                        const ShardKeyPattern& shardKeyPattern,
                                                        const ChunkRange& chunkRange,
                                                        long long chunkSizeBytes,
                                                        boost::optional<int> maxObjs) {
    // Assemble the splitVector request
    BSONObjBuilder request;
    request.append("splitVector", nss.ns());
    request.append("keyPattern", shardKeyPattern.toBSON());
    chunkRange.append(&request);
    request.append("maxChunkSizeBytes", chunkSizeBytes);
    if (maxObjs) {
        request.append("maxChunkObjects", *maxObjs);
    }

    auto swShard = Grid::get(txn)->shardRegistry()->getShard(txn, shardId);
    if (!swShard.isOK()) {
        return swShard.getStatus();
    }

    auto swResponse = swShard.getValue()->runCommandWithFixedRetryAttempts(
        txn,
        ReadPreferenceSetting{ReadPreference::PrimaryPreferred},
        "admin",
        request.obj(),
        Shard::RetryPolicy::kIdempotent);
    if (!swResponse.isOK()) {
        return std::move(swResponse.getStatus());
    }
    if (!swResponse.getValue().commandStatus.isOK()) {
        return std::move(swResponse.getValue().commandStatus);
    }

    const auto reply = std::move(swResponse.getValue().response);

    // Each split key is copied so that it does not reference the response buffer
    std::vector<BSONObj> ownedSplitKeys;
    for (BSONObjIterator keyIt(reply.getObjectField("splitKeys")); keyIt.more();) {
        ownedSplitKeys.push_back(keyIt.next().Obj().getOwned());
    }

    return std::move(ownedSplitKeys);
}
/**
 * Adds 'range' to the ranges scheduled for cleanup and returns the notification obtained from
 * the descriptor created for it. The range must not overlap any range which is already
 * scheduled nor any chunk which is being received. If the added range is the only scheduled
 * one afterwards, a cleanup is scheduled for the namespace.
 */
std::shared_ptr<Notification<Status>> MetadataManager::_addRangeToClean_inlock(
    const ChunkRange& range) {
    const BSONObj& lowerBound = range.getMin();
    const BSONObj& upperBound = range.getMax();

    // The first check works on a copy of the ranges to clean, which is unnecessary but allows
    // the rangeMapOverlaps helper function to be reused.
    invariant(!rangeMapOverlaps(_getCopyOfRangesToClean_inlock(), lowerBound, upperBound));
    invariant(!rangeMapOverlaps(_receivingChunks, lowerBound, upperBound));

    RangeToCleanDescriptor cleanupDescriptor(upperBound.getOwned());
    _rangesToClean.insert(std::make_pair(lowerBound.getOwned(), cleanupDescriptor));

    // A size of exactly one means _rangesToClean was empty before, in which case the collection
    // range deleter needs to be started
    const bool wasPreviouslyEmpty = (_rangesToClean.size() == 1UL);
    if (wasPreviouslyEmpty) {
        ShardingState::get(_serviceContext)->scheduleCleanup(_nss);
    }

    return cleanupDescriptor.getNotification();
}
void MetadataManager::_addRangeToClean_inlock(const ChunkRange& range) { invariant(!rangeMapOverlaps(_rangesToClean, range.getMin(), range.getMax())); invariant(!rangeMapOverlaps(_receivingChunks, range.getMin(), range.getMax())); _rangesToClean.insert(std::make_pair(range.getMin().getOwned(), range.getMax().getOwned())); }
void updateChunkWriteStatsAndSplitIfNeeded(OperationContext* opCtx, ChunkManager* manager, Chunk* chunk, long dataWritten) { // Disable lastError tracking so that any errors, which occur during auto-split do not get // bubbled up on the client connection doing a write. LastError::Disabled d(&LastError::get(cc())); const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration(); const bool minIsInf = (0 == manager->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk->getMin())); const bool maxIsInf = (0 == manager->getShardKeyPattern().getKeyPattern().globalMax().woCompare(chunk->getMax())); const uint64_t chunkBytesWritten = chunk->addBytesWritten(dataWritten); const uint64_t desiredChunkSize = calculateDesiredChunkSize(balancerConfig->getMaxChunkSizeBytes(), manager->numChunks()); if (!chunk->shouldSplit(desiredChunkSize, minIsInf, maxIsInf)) { return; } const NamespaceString nss(manager->getns()); if (!manager->_autoSplitThrottle._splitTickets.tryAcquire()) { LOG(1) << "won't auto split because not enough tickets: " << nss; return; } TicketHolderReleaser releaser(&(manager->_autoSplitThrottle._splitTickets)); const ChunkRange chunkRange(chunk->getMin(), chunk->getMax()); try { // Ensure we have the most up-to-date balancer configuration uassertStatusOK(balancerConfig->refreshAndCheck(opCtx)); if (!balancerConfig->getShouldAutoSplit()) { return; } LOG(1) << "about to initiate autosplit: " << redact(chunk->toString()) << " dataWritten: " << chunkBytesWritten << " desiredChunkSize: " << desiredChunkSize; const uint64_t chunkSizeToUse = [&]() { const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2; if (estNumSplitPoints >= kTooManySplitPoints) { // The current desired chunk size will split the chunk into lots of small chunk and // at the worst case this can result into thousands of chunks. So check and see if a // bigger value can be used. 
return std::min(chunkBytesWritten, balancerConfig->getMaxChunkSizeBytes()); } else { return desiredChunkSize; } }(); auto splitPoints = uassertStatusOK(shardutil::selectChunkSplitPoints(opCtx, chunk->getShardId(), nss, manager->getShardKeyPattern(), chunkRange, chunkSizeToUse, boost::none)); if (splitPoints.size() <= 1) { // No split points means there isn't enough data to split on; 1 split point means we // have // between half the chunk size to full chunk size so there is no need to split yet chunk->clearBytesWritten(); return; } if (minIsInf || maxIsInf) { // We don't want to reset _dataWritten since we want to check the other side right away } else { // We're splitting, so should wait a bit chunk->clearBytesWritten(); } // We assume that if the chunk being split is the first (or last) one on the collection, // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the // very first (or last) key as a split point. // // This heuristic is skipped for "special" shard key patterns that are not likely to produce // monotonically increasing or decreasing values (e.g. hashed shard keys). 
if (KeyPattern::isOrderedKeyPattern(manager->getShardKeyPattern().toBSON())) { if (minIsInf) { BSONObj key = findExtremeKeyForShard( opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), true); if (!key.isEmpty()) { splitPoints.front() = key.getOwned(); } } else if (maxIsInf) { BSONObj key = findExtremeKeyForShard( opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), false); if (!key.isEmpty()) { splitPoints.back() = key.getOwned(); } } } const auto suggestedMigrateChunk = uassertStatusOK(shardutil::splitChunkAtMultiplePoints(opCtx, chunk->getShardId(), nss, manager->getShardKeyPattern(), manager->getVersion(), chunkRange, splitPoints)); // Balance the resulting chunks if the option is enabled and if the shard suggested a chunk // to balance const bool shouldBalance = [&]() { if (!balancerConfig->shouldBalanceForAutoSplit()) return false; auto collStatus = Grid::get(opCtx)->catalogClient()->getCollection(opCtx, manager->getns()); if (!collStatus.isOK()) { log() << "Auto-split for " << nss << " failed to load collection metadata" << causedBy(redact(collStatus.getStatus())); return false; } return collStatus.getValue().value.getAllowBalance(); }(); log() << "autosplitted " << nss << " chunk: " << redact(chunk->toString()) << " into " << (splitPoints.size() + 1) << " parts (desiredChunkSize " << desiredChunkSize << ")" << (suggestedMigrateChunk ? "" : (std::string) " (migrate suggested" + (shouldBalance ? ")" : ", but no migrations allowed)")); // Reload the chunk manager after the split auto routingInfo = uassertStatusOK( Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx, nss)); if (!shouldBalance || !suggestedMigrateChunk) { return; } // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot // spot from staying on a single shard. This is based on the assumption that succeeding // inserts will fall on the top chunk. 
// We need to use the latest chunk manager (after the split) in order to have the most // up-to-date view of the chunk we are about to move auto suggestedChunk = routingInfo.cm()->findIntersectingChunkWithSimpleCollation( suggestedMigrateChunk->getMin()); ChunkType chunkToMove; chunkToMove.setNS(nss.ns()); chunkToMove.setShard(suggestedChunk->getShardId()); chunkToMove.setMin(suggestedChunk->getMin()); chunkToMove.setMax(suggestedChunk->getMax()); chunkToMove.setVersion(suggestedChunk->getLastmod()); uassertStatusOK(configsvr_client::rebalanceChunk(opCtx, chunkToMove)); // Ensure the collection gets reloaded because of the move Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss); } catch (const DBException& ex) { chunk->clearBytesWritten(); if (ErrorCodes::isStaleShardingError(ErrorCodes::Error(ex.getCode()))) { log() << "Unable to auto-split chunk " << redact(chunkRange.toString()) << causedBy(ex) << ", going to invalidate routing table entry for " << nss; Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss); } } }
/**
 * Constructs a chunk description for namespace 'nss' covering [range.getMin(), range.getMax())
 * bounds as reported by 'range', at the given 'version' and owned by 'shardId'.
 *
 * 'nss' and 'shardId' are taken by value and moved into their members, so callers that pass
 * temporaries or std::move their arguments do not pay for an additional copy.
 */
ChunkType::ChunkType(NamespaceString nss, ChunkRange range, ChunkVersion version, ShardId shardId)
    : _nss(std::move(nss)),
      _min(range.getMin()),
      _max(range.getMax()),
      _version(version),
      _shard(std::move(shardId)) {}
bool MetadataManager::isInRangesToClean(const ChunkRange& range) { stdx::lock_guard<stdx::mutex> scopedLock(_managerLock); // For convenience, this line makes an unnecessary copy, to reuse the // rangeMapContains helper function. return rangeMapContains(_getCopyOfRangesToClean_inlock(), range.getMin(), range.getMax()); }
/**
 * Installs 'remoteMetadata' as the active filtering metadata, under the manager lock.
 *
 * Four cases are handled:
 *  - No metadata is installed yet: there must be no receiving chunks and no ranges to clean;
 *    the new metadata is installed as is.
 *  - The epoch differs from that of the active metadata: receiving chunks, cleanups and all
 *    tracked metadata are discarded before the new metadata is installed.
 *  - The active collection version is the same or newer: the update is ignored.
 *  - Otherwise: every receiving chunk which overlaps a chunk of the new metadata is deemed
 *    received and removed from '_receivingChunks', then the new metadata is installed.
 */
void MetadataManager::setFilteringMetadata(CollectionMetadata remoteMetadata) {
    stdx::lock_guard<stdx::mutex> lg(_managerLock);

    // Collection is becoming sharded
    if (_metadata.empty()) {
        LOG(0) << "Marking collection " << _nss.ns() << " as " << remoteMetadata.toStringBasic();

        invariant(_receivingChunks.empty());
        invariant(_rangesToClean.isEmpty());

        _setActiveMetadata(lg, std::move(remoteMetadata));
        return;
    }

    const auto& activeMetadata = _metadata.back()->metadata;

    // If the metadata being installed has a different epoch from ours, this means the collection
    // was dropped and recreated, so we must entirely reset the metadata state
    if (activeMetadata.getCollVersion().epoch() != remoteMetadata.getCollVersion().epoch()) {
        LOG(0) << "Updating metadata for collection " << _nss.ns() << " from "
               << activeMetadata.toStringBasic() << " to " << remoteMetadata.toStringBasic()
               << " due to epoch change";

        _receivingChunks.clear();
        _clearAllCleanups(lg);
        _metadata.clear();

        _setActiveMetadata(lg, std::move(remoteMetadata));
        return;
    }

    // We already have the same or newer version
    if (activeMetadata.getCollVersion() >= remoteMetadata.getCollVersion()) {
        LOG(1) << "Ignoring update of active metadata " << activeMetadata.toStringBasic()
               << " with an older " << remoteMetadata.toStringBasic();
        return;
    }

    LOG(0) << "Updating metadata for collection " << _nss.ns() << " from "
           << activeMetadata.toStringBasic() << " to " << remoteMetadata.toStringBasic()
           << " due to version change";

    // Resolve any receiving chunks, which might have completed by now
    for (auto it = _receivingChunks.begin(); it != _receivingChunks.end();) {
        const ChunkRange receivingRange(it->first, it->second);

        if (!remoteMetadata.rangeOverlapsChunk(receivingRange)) {
            ++it;
            continue;
        }

        // The remote metadata contains a chunk we were earlier in the process of receiving, so we
        // deem it successfully received
        LOG(2) << "Verified chunk " << redact(receivingRange.toString()) << " for collection "
               << _nss.ns() << " has been migrated to this shard earlier";

        // erase() hands back the iterator following the removed entry, so the scan carries on
        // from there. Entries before it were already found not to overlap, so there is no need
        // to restart from begin() and re-check them after every removal.
        it = _receivingChunks.erase(it);
    }

    _setActiveMetadata(lg, std::move(remoteMetadata));
}
/**
 * Constructs a tag document description for namespace 'nss' with the name 'tag'. The namespace
 * and tag are moved into their members; the min and max keys are stored as owned copies
 * (getOwned) of the bounds reported by 'range'.
 */
TagsType::TagsType(NamespaceString nss, std::string tag, ChunkRange range) : _ns(std::move(nss)), _tag(std::move(tag)), _minKey(range.getMin().getOwned()), _maxKey(range.getMax().getOwned()) {}
/**
 * Deletes documents of 'collection' whose shard key falls in 'range', using a delete stage on
 * top of a forward scan of the shard-key-prefixed index. The scan includes the start key of the
 * range and excludes its end key.
 *
 * The loop is a do/while on '++numDeleted < maxToDelete', so one deletion is always attempted
 * and the loop stops after 'maxToDelete' documents, at the end of the scan, or on the first
 * executor failure (which is logged as a warning).
 *
 * Returns the number of documents deleted, or ErrorCodes::InternalError if no index prefixed by
 * 'keyPattern' exists or if the index found can no longer be looked up by name.
 */
StatusWith<int> CollectionRangeDeleter::_doDeletion(OperationContext* opCtx,
                                                    Collection* collection,
                                                    BSONObj const& keyPattern,
                                                    ChunkRange const& range,
                                                    int maxToDelete) {
    invariant(collection != nullptr);
    invariant(!isEmpty());

    auto const& nss = collection->ns();

    // The IndexChunk has a keyPattern that may apply to more than one index - we need to
    // select the index and get the full index keyPattern here.
    auto catalog = collection->getIndexCatalog();
    const IndexDescriptor* idx = catalog->findShardKeyPrefixedIndex(opCtx, keyPattern, false);
    if (!idx) {
        std::string msg = str::stream() << "Unable to find shard key index for "
                                        << keyPattern.toString() << " in " << nss.ns();
        LOG(0) << msg;
        return {ErrorCodes::InternalError, msg};
    }

    // Extend bounds to match the index we found
    const KeyPattern indexKeyPattern(idx->keyPattern());
    const auto extend = [&](const auto& key) {
        return Helpers::toKeyFormat(indexKeyPattern.extendRangeBound(key, false));
    };

    const auto min = extend(range.getMin());
    const auto max = extend(range.getMax());

    // The bounds are redacted here in the same way as in the cursor error warning below, so
    // that every log line of this function treats them consistently.
    LOG(1) << "begin removal of " << redact(min) << " to " << redact(max) << " in " << nss.ns();

    const auto indexName = idx->indexName();
    const IndexDescriptor* descriptor =
        collection->getIndexCatalog()->findIndexByName(opCtx, indexName);
    if (!descriptor) {
        std::string msg = str::stream() << "shard key index with name " << indexName << " on '"
                                        << nss.ns() << "' was dropped";
        LOG(0) << msg;
        return {ErrorCodes::InternalError, msg};
    }

    auto deleteStageParams = std::make_unique<DeleteStageParams>();
    deleteStageParams->fromMigrate = true;
    deleteStageParams->isMulti = true;
    deleteStageParams->returnDeleted = true;

    if (serverGlobalParams.moveParanoia) {
        deleteStageParams->removeSaver =
            std::make_unique<RemoveSaver>("moveChunk", nss.ns(), "cleaning");
    }

    auto exec = InternalPlanner::deleteWithIndexScan(opCtx,
                                                     collection,
                                                     std::move(deleteStageParams),
                                                     descriptor,
                                                     min,
                                                     max,
                                                     BoundInclusion::kIncludeStartKeyOnly,
                                                     PlanExecutor::YIELD_MANUAL,
                                                     InternalPlanner::FORWARD);

    PlanYieldPolicy planYieldPolicy(exec.get(), PlanExecutor::YIELD_MANUAL);

    int numDeleted = 0;
    do {
        BSONObj deletedObj;
        PlanExecutor::ExecState state = exec->getNext(&deletedObj, nullptr);

        if (state == PlanExecutor::IS_EOF) {
            break;
        }

        if (state == PlanExecutor::FAILURE) {
            // The failure is only logged; the count of documents deleted so far is returned
            warning() << PlanExecutor::statestr(state) << " - cursor error while trying to delete "
                      << redact(min) << " to " << redact(max) << " in " << nss
                      << ": FAILURE, stats: " << Explain::getWinningPlanStats(exec.get());
            break;
        }

        invariant(PlanExecutor::ADVANCED == state);
        ShardingStatistics::get(opCtx).countDocsDeletedOnDonor.addAndFetch(1);
    } while (++numDeleted < maxToDelete);

    return numDeleted;
}
/**
 * Sends a "splitChunk" command for 'chunkRange' of 'nss' with the given 'splitPoints' to the
 * shard 'shardId', using a primary-only read preference and a non-idempotent retry policy.
 *
 * 'splitPoints' must not be empty. More than 8192 split points yield BadValue, and a first or
 * last split point equal to the respective chunk bound yields CannotSplit.
 *
 * On success, returns the range parsed from the response's kShouldMigrate field if that field
 * is present as an object, and boost::none otherwise. A field that is present but cannot be
 * extracted is logged as a warning and treated like an absent one.
 */
StatusWith<boost::optional<ChunkRange>> splitChunkAtMultiplePoints(
    OperationContext* txn,
    const ShardId& shardId,
    const NamespaceString& nss,
    const ShardKeyPattern& shardKeyPattern,
    ChunkVersion collectionVersion,
    const ChunkRange& chunkRange,
    const std::vector<BSONObj>& splitPoints) {
    invariant(!splitPoints.empty());

    const size_t kMaxSplitPoints = 8192;

    if (splitPoints.size() > kMaxSplitPoints) {
        return {ErrorCodes::BadValue,
                str::stream() << "Cannot split chunk in more than " << kMaxSplitPoints
                              << " parts at a time."};
    }

    // Sanity check that no split is attempted at the boundaries of the chunk. The same check is
    // performed at chunk split commit time, but it is kept here for parity with the old
    // auto-split code, which might rely on it.
    const bool firstPointOnMinBound =
        SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMin() == splitPoints.front());
    if (firstPointOnMinBound) {
        const std::string reason(str::stream() << "not splitting chunk " << chunkRange.toString()
                                               << ", split point "
                                               << splitPoints.front()
                                               << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, reason};
    }

    const bool lastPointOnMaxBound =
        SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMax() == splitPoints.back());
    if (lastPointOnMaxBound) {
        const std::string reason(str::stream() << "not splitting chunk " << chunkRange.toString()
                                               << ", split point "
                                               << splitPoints.back()
                                               << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, reason};
    }

    // Assemble the splitChunk request
    BSONObjBuilder request;
    request.append("splitChunk", nss.ns());
    request.append("configdb",
                   Grid::get(txn)->shardRegistry()->getConfigServerConnectionString().toString());
    request.append("from", shardId.toString());
    request.append("keyPattern", shardKeyPattern.toBSON());
    collectionVersion.appendForCommands(&request);
    chunkRange.append(&request);
    request.append("splitKeys", splitPoints);

    BSONObj requestObj = request.obj();

    Status splitStatus{ErrorCodes::InternalError, "Uninitialized value"};
    BSONObj splitResponse;

    auto swShard = Grid::get(txn)->shardRegistry()->getShard(txn, shardId);
    if (swShard.isOK()) {
        auto swCommand = swShard.getValue()->runCommandWithFixedRetryAttempts(
            txn,
            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
            "admin",
            requestObj,
            Shard::RetryPolicy::kNotIdempotent);
        if (swCommand.isOK()) {
            splitStatus = std::move(swCommand.getValue().commandStatus);
            splitResponse = std::move(swCommand.getValue().response);
        } else {
            splitStatus = std::move(swCommand.getStatus());
        }
    } else {
        splitStatus = swShard.getStatus();
    }

    if (!splitStatus.isOK()) {
        log() << "Split chunk " << redact(requestObj) << " failed"
              << causedBy(redact(splitStatus));
        return {splitStatus.code(),
                str::stream() << "split failed due to " << splitStatus.toString()};
    }

    // Look for the optional migration suggestion in the response
    BSONElement shouldMigrateElement;
    splitStatus =
        bsonExtractTypedField(splitResponse, kShouldMigrate, Object, &shouldMigrateElement);

    if (splitStatus.isOK()) {
        auto swSuggestedRange = ChunkRange::fromBSON(shouldMigrateElement.embeddedObject());
        if (!swSuggestedRange.isOK()) {
            return swSuggestedRange.getStatus();
        }

        return boost::optional<ChunkRange>(std::move(swSuggestedRange.getValue()));
    }

    if (splitStatus != ErrorCodes::NoSuchKey) {
        warning()
            << "Chunk migration will be skipped because splitChunk returned invalid response: "
            << redact(splitResponse) << ". Extracting " << kShouldMigrate << " field failed"
            << causedBy(redact(splitStatus));
    }

    return boost::optional<ChunkRange>();
}
/**
 * Deletes documents of 'collection' whose shard key falls in 'range', by scanning the
 * shard-key-prefixed index forward and deleting each fetched document in its own write unit of
 * work. The scan includes the start key of the range and excludes its end key.
 *
 * The loop is a do/while on '++numDeleted < maxToDelete', so one deletion is always attempted
 * and the loop stops after 'maxToDelete' documents, at the end of the scan, on an executor
 * failure, or when the executor state cannot be restored after a delete (the latter two are
 * logged as warnings).
 *
 * When serverGlobalParams.moveParanoia is set, each document is handed to a RemoveSaver before
 * it is deleted.
 *
 * Returns the number of documents deleted, or ErrorCodes::InternalError if no index prefixed by
 * 'keyPattern' exists or if the index found can no longer be looked up by name.
 */
StatusWith<int> CollectionRangeDeleter::_doDeletion(OperationContext* opCtx,
                                                    Collection* collection,
                                                    BSONObj const& keyPattern,
                                                    ChunkRange const& range,
                                                    int maxToDelete) {
    invariant(collection != nullptr);
    invariant(!isEmpty());

    auto const& nss = collection->ns();

    // The IndexChunk has a keyPattern that may apply to more than one index - we need to
    // select the index and get the full index keyPattern here.
    auto catalog = collection->getIndexCatalog();
    const IndexDescriptor* idx = catalog->findShardKeyPrefixedIndex(opCtx, keyPattern, false);
    if (!idx) {
        std::string msg = str::stream() << "Unable to find shard key index for "
                                        << keyPattern.toString() << " in " << nss.ns();
        LOG(0) << msg;
        return {ErrorCodes::InternalError, msg};
    }

    // Extend bounds to match the index we found
    const KeyPattern indexKeyPattern(idx->keyPattern());
    const auto extend = [&](const auto& key) {
        return Helpers::toKeyFormat(indexKeyPattern.extendRangeBound(key, false));
    };

    const auto min = extend(range.getMin());
    const auto max = extend(range.getMax());

    // The bounds are redacted here in the same way as in the warnings below, so that every log
    // line of this function treats them consistently.
    LOG(1) << "begin removal of " << redact(min) << " to " << redact(max) << " in " << nss.ns();

    const auto indexName = idx->indexName();
    IndexDescriptor* descriptor = collection->getIndexCatalog()->findIndexByName(opCtx, indexName);
    if (!descriptor) {
        std::string msg = str::stream() << "shard key index with name " << indexName << " on '"
                                        << nss.ns() << "' was dropped";
        LOG(0) << msg;
        return {ErrorCodes::InternalError, msg};
    }

    boost::optional<Helpers::RemoveSaver> saver;
    if (serverGlobalParams.moveParanoia) {
        saver.emplace("moveChunk", nss.ns(), "cleaning");
    }

    auto halfOpen = BoundInclusion::kIncludeStartKeyOnly;
    auto manual = PlanExecutor::YIELD_MANUAL;
    auto forward = InternalPlanner::FORWARD;
    auto fetch = InternalPlanner::IXSCAN_FETCH;

    auto exec = InternalPlanner::indexScan(
        opCtx, collection, descriptor, min, max, halfOpen, manual, forward, fetch);

    int numDeleted = 0;
    do {
        RecordId rloc;
        BSONObj obj;
        PlanExecutor::ExecState state = exec->getNext(&obj, &rloc);

        if (state == PlanExecutor::IS_EOF) {
            break;
        }

        if (state == PlanExecutor::FAILURE || state == PlanExecutor::DEAD) {
            // The failure is only logged; the count of documents deleted so far is returned
            warning() << PlanExecutor::statestr(state) << " - cursor error while trying to delete "
                      << redact(min) << " to " << redact(max) << " in " << nss << ": "
                      << redact(WorkingSetCommon::toStatusString(obj))
                      << ", stats: " << Explain::getWinningPlanStats(exec.get());
            break;
        }

        invariant(PlanExecutor::ADVANCED == state);

        // The executor state is saved around the delete and restored afterwards
        exec->saveState();

        writeConflictRetry(opCtx, "delete range", nss.ns(), [&] {
            WriteUnitOfWork wuow(opCtx);
            if (saver) {
                uassertStatusOK(saver->goingToDelete(obj));
            }
            collection->deleteDocument(opCtx, kUninitializedStmtId, rloc, nullptr, true);
            wuow.commit();
        });

        try {
            exec->restoreState();
        } catch (const DBException& ex) {
            warning() << "error restoring cursor state while trying to delete " << redact(min)
                      << " to " << redact(max) << " in " << nss
                      << ", stats: " << Explain::getWinningPlanStats(exec.get()) << ": "
                      << redact(ex.toStatus());
            break;
        }

        ShardingStatistics::get(opCtx).countDocsDeletedOnDonor.addAndFetch(1);
    } while (++numDeleted < maxToDelete);

    return numDeleted;
}