MigrateInfo MigrationType::toMigrateInfo() const {
    ChunkType chunk;
    chunk.setNS(_nss);
    chunk.setShard(_fromShard);
    chunk.setMin(_min);
    chunk.setMax(_max);
    chunk.setVersion(_chunkVersion);

    return MigrateInfo(_toShard, chunk);
}
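// Hypothetical usage sketch (not in the original source): rehydrating a persisted migration
// document into the in-memory form the balancer consumes. `MigrationType::fromBSON` is assumed
// to exist as the parser for config.migrations documents.
//
//   auto migration = uassertStatusOK(MigrationType::fromBSON(migrationDoc));
//   MigrateInfo info = migration.toMigrateInfo();
//   // info now identifies the donor chunk (ns, shard, bounds, version) plus the recipient shard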
Status ChunkManager::createFirstChunks(OperationContext* txn,
                                       const ShardId& primaryShardId,
                                       const vector<BSONObj>* initPoints,
                                       const set<ShardId>* initShardIds) {
    // TODO distlock?
    // TODO: Race condition if we shard the collection and insert data while we split across
    // the non-primary shard.

    vector<BSONObj> splitPoints;
    vector<ShardId> shardIds;
    calcInitSplitsAndShards(txn, primaryShardId, initPoints, initShardIds, &splitPoints, &shardIds);

    // this is the first chunk; start the versioning from scratch
    ChunkVersion version(1, 0, OID::gen());

    log() << "going to create " << splitPoints.size() + 1 << " chunk(s) for: " << _ns
          << " using new epoch " << version.epoch();

    for (unsigned i = 0; i <= splitPoints.size(); i++) {
        BSONObj min = i == 0 ? _keyPattern.getKeyPattern().globalMin() : splitPoints[i - 1];
        BSONObj max =
            i < splitPoints.size() ? splitPoints[i] : _keyPattern.getKeyPattern().globalMax();

        ChunkType chunk;
        chunk.setName(Chunk::genID(_ns, min));
        chunk.setNS(_ns);
        chunk.setMin(min);
        chunk.setMax(max);
        chunk.setShard(shardIds[i % shardIds.size()]);
        chunk.setVersion(version);

        Status status = grid.catalogManager(txn)->insertConfigDocument(
            txn, ChunkType::ConfigNS, chunk.toBSON());
        if (!status.isOK()) {
            const string errMsg = str::stream() << "Creating first chunks failed: "
                                                << status.reason();
            error() << errMsg;
            return Status(status.code(), errMsg);
        }

        version.incMinor();
    }

    _version = ChunkVersion(0, 0, version.epoch());

    return Status::OK();
}
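// Illustrative sketch (not part of the original source): how N split points produce N + 1
// chunks with round-robin shard assignment and a per-chunk minor version bump, mirroring the
// loop in createFirstChunks above. All names here are hypothetical; shardIds must be non-empty.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

namespace {

void describeInitialChunks(const std::vector<std::string>& splitPoints,
                           const std::vector<std::string>& shardIds) {
    for (std::size_t i = 0; i <= splitPoints.size(); i++) {
        const std::string min = (i == 0) ? std::string("MinKey") : splitPoints[i - 1];
        const std::string max =
            (i < splitPoints.size()) ? splitPoints[i] : std::string("MaxKey");
        // Chunk i covers [min, max), lands on shard i % shardIds.size(), and gets version
        // (1, i, epoch): createFirstChunks starts at (1, 0) and calls incMinor() per insert.
        std::cout << "chunk " << i << ": [" << min << ", " << max << ") on "
                  << shardIds[i % shardIds.size()] << " version 1|" << i << std::endl;
    }
}

}  // namespace

// e.g. describeInitialChunks({"10", "20"}, {"shardA", "shardB"}) prints three chunks,
// alternating between the two shards.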
/**
 * Stores ranges for a particular collection and shard starting from some version
 */
void storeCollectionRanges( const NamespaceString& nss,
                            const string& shardName,
                            const vector<KeyRange>& ranges,
                            const ChunkVersion& startVersion ) {
    // Get key pattern from first range
    ASSERT_GREATER_THAN( ranges.size(), 0u );

    CollectionType coll;
    coll.setNS( nss.ns() );
    coll.setKeyPattern( ranges.begin()->keyPattern );
    coll.setEpoch( startVersion.epoch() );
    coll.setUpdatedAt( 1ULL );

    string errMsg;
    ASSERT( coll.isValid( &errMsg ) );

    DBDirectClient client(&_txn);

    client.update( CollectionType::ConfigNS,
                   BSON( CollectionType::ns( coll.getNS() ) ),
                   coll.toBSON(),
                   true /* upsert */,
                   false /* multi */ );

    ChunkVersion nextVersion = startVersion;
    for ( vector<KeyRange>::const_iterator it = ranges.begin(); it != ranges.end(); ++it ) {
        ChunkType chunk;
        // TODO: We should not rely on the serialized ns, minkey being unique in the future,
        // causes problems since it links string serialization to correctness.
        chunk.setName( Chunk::genID( nss, it->minKey ) );
        chunk.setShard( shardName );
        chunk.setNS( nss.ns() );
        chunk.setVersion( nextVersion );
        chunk.setMin( it->minKey );
        chunk.setMax( it->maxKey );
        nextVersion.incMajor();

        client.insert( ChunkType::ConfigNS, chunk.toBSON() );
    }
}
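// Hypothetical usage sketch (not in the original source): a test seeding two adjacent ranges
// for one shard; storeCollectionRanges then inserts chunks versioned startVersion,
// startVersion.incMajor(), and so on. A KeyRange( ns, min, max, keyPattern ) constructor is
// assumed here.
//
//   const NamespaceString nss( "foo.bar" );
//   const BSONObj keyPattern = BSON( "a" << 1 );
//
//   vector<KeyRange> ranges;
//   ranges.push_back( KeyRange( nss.ns(), BSON( "a" << MINKEY ), BSON( "a" << 10 ), keyPattern ) );
//   ranges.push_back( KeyRange( nss.ns(), BSON( "a" << 10 ), BSON( "a" << MAXKEY ), keyPattern ) );
//
//   storeCollectionRanges( nss, "shard0000", ranges, ChunkVersion( 1, 0, OID::gen() ) );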
BSONObj buildApplyOpsCmd( const OwnedPointerVector<ChunkType>& chunksToMerge,
                          const ChunkVersion& currShardVersion,
                          const ChunkVersion& newMergedVersion ) {
    BSONObjBuilder applyOpsCmdB;
    BSONArrayBuilder updatesB( applyOpsCmdB.subarrayStart( "applyOps" ) );

    // The chunk we'll be "expanding" is the first chunk
    const ChunkType* chunkToMerge = *chunksToMerge.begin();

    // Fill in details not tracked by metadata
    ChunkType mergedChunk;
    chunkToMerge->cloneTo( &mergedChunk );
    mergedChunk.setName( Chunk::genID( chunkToMerge->getNS(), chunkToMerge->getMin() ) );
    mergedChunk.setMax( ( *chunksToMerge.vector().rbegin() )->getMax() );
    mergedChunk.setVersion( newMergedVersion );

    updatesB.append( buildOpMergeChunk( mergedChunk ) );

    // Don't remove chunk we're expanding
    OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
    for ( ++it; it != chunksToMerge.end(); ++it ) {
        ChunkType* chunkToMerge = *it;
        chunkToMerge->setName( Chunk::genID( chunkToMerge->getNS(), chunkToMerge->getMin() ) );
        updatesB.append( buildOpRemoveChunk( *chunkToMerge ) );
    }

    updatesB.done();

    applyOpsCmdB.append( "preCondition",
                         buildOpPrecond( chunkToMerge->getNS(),
                                         chunkToMerge->getShard(),
                                         currShardVersion ) );

    return applyOpsCmdB.obj();
}
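// For reference (an assumption based on the builders above, not verbatim from the source), the
// resulting command has roughly this shape: one update op that widens the first chunk to the
// merged range and bumps its version, one delete op per absorbed chunk, and a precondition
// pinning the current shard version so a concurrent metadata change aborts the applyOps:
//
//   {
//       applyOps: [
//           { op: "u", ns: "config.chunks", o: <merged chunk doc>, ... },
//           { op: "d", ns: "config.chunks", o: { _id: <absorbed chunk name> } },
//           ...
//       ],
//       preCondition: [ <shard version check for the collection> ]
//   }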
void updateChunkWriteStatsAndSplitIfNeeded(OperationContext* opCtx,
                                           ChunkManager* manager,
                                           Chunk* chunk,
                                           long dataWritten) {
    // Disable lastError tracking so that any errors which occur during the auto-split do not
    // get bubbled up on the client connection doing a write.
    LastError::Disabled d(&LastError::get(cc()));

    const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration();

    const bool minIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk->getMin()));
    const bool maxIsInf =
        (0 == manager->getShardKeyPattern().getKeyPattern().globalMax().woCompare(chunk->getMax()));

    const uint64_t chunkBytesWritten = chunk->addBytesWritten(dataWritten);
    const uint64_t desiredChunkSize =
        calculateDesiredChunkSize(balancerConfig->getMaxChunkSizeBytes(), manager->numChunks());

    if (!chunk->shouldSplit(desiredChunkSize, minIsInf, maxIsInf)) {
        return;
    }

    const NamespaceString nss(manager->getns());

    if (!manager->_autoSplitThrottle._splitTickets.tryAcquire()) {
        LOG(1) << "won't auto split because not enough tickets: " << nss;
        return;
    }

    TicketHolderReleaser releaser(&(manager->_autoSplitThrottle._splitTickets));

    const ChunkRange chunkRange(chunk->getMin(), chunk->getMax());

    try {
        // Ensure we have the most up-to-date balancer configuration
        uassertStatusOK(balancerConfig->refreshAndCheck(opCtx));

        if (!balancerConfig->getShouldAutoSplit()) {
            return;
        }

        LOG(1) << "about to initiate autosplit: " << redact(chunk->toString())
               << " dataWritten: " << chunkBytesWritten
               << " desiredChunkSize: " << desiredChunkSize;

        const uint64_t chunkSizeToUse = [&]() {
            const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2;

            if (estNumSplitPoints >= kTooManySplitPoints) {
                // The current desired chunk size would split the chunk into lots of small
                // chunks; in the worst case this can result in thousands of chunks. So check
                // and see if a bigger value can be used.
                return std::min(chunkBytesWritten, balancerConfig->getMaxChunkSizeBytes());
            } else {
                return desiredChunkSize;
            }
        }();

        auto splitPoints =
            uassertStatusOK(shardutil::selectChunkSplitPoints(opCtx,
                                                              chunk->getShardId(),
                                                              nss,
                                                              manager->getShardKeyPattern(),
                                                              chunkRange,
                                                              chunkSizeToUse,
                                                              boost::none));

        if (splitPoints.size() <= 1) {
            // No split points means there isn't enough data to split on; one split point means
            // the chunk is between half and a full chunk size, so there is no need to split yet
            chunk->clearBytesWritten();
            return;
        }

        if (minIsInf || maxIsInf) {
            // We don't want to reset _dataWritten since we want to check the other side right
            // away
        } else {
            // We're splitting, so should wait a bit
            chunk->clearBytesWritten();
        }

        // We assume that if the chunk being split is the first (or last) one on the collection,
        // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use
        // the very first (or last) key as a split point.
        //
        // This heuristic is skipped for "special" shard key patterns that are not likely to
        // produce monotonically increasing or decreasing values (e.g. hashed shard keys).
        if (KeyPattern::isOrderedKeyPattern(manager->getShardKeyPattern().toBSON())) {
            if (minIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), true);
                if (!key.isEmpty()) {
                    splitPoints.front() = key.getOwned();
                }
            } else if (maxIsInf) {
                BSONObj key = findExtremeKeyForShard(
                    opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), false);
                if (!key.isEmpty()) {
                    splitPoints.back() = key.getOwned();
                }
            }
        }

        const auto suggestedMigrateChunk =
            uassertStatusOK(shardutil::splitChunkAtMultiplePoints(opCtx,
                                                                  chunk->getShardId(),
                                                                  nss,
                                                                  manager->getShardKeyPattern(),
                                                                  manager->getVersion(),
                                                                  chunkRange,
                                                                  splitPoints));

        // Balance the resulting chunks if the option is enabled and if the shard suggested a
        // chunk to balance
        const bool shouldBalance = [&]() {
            if (!balancerConfig->shouldBalanceForAutoSplit())
                return false;

            auto collStatus =
                Grid::get(opCtx)->catalogClient()->getCollection(opCtx, manager->getns());
            if (!collStatus.isOK()) {
                log() << "Auto-split for " << nss << " failed to load collection metadata"
                      << causedBy(redact(collStatus.getStatus()));
                return false;
            }

            return collStatus.getValue().value.getAllowBalance();
        }();

        // Note: the original ternary printed the "(migrate suggested" suffix when no migration
        // was suggested; the condition is fixed here so the message matches reality
        log() << "autosplitted " << nss << " chunk: " << redact(chunk->toString()) << " into "
              << (splitPoints.size() + 1) << " parts (desiredChunkSize " << desiredChunkSize << ")"
              << (suggestedMigrateChunk
                      ? (std::string) " (migrate suggested" +
                          (shouldBalance ? ")" : ", but no migrations allowed)")
                      : "");

        // Reload the chunk manager after the split
        auto routingInfo = uassertStatusOK(
            Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx,
                                                                                         nss));

        if (!shouldBalance || !suggestedMigrateChunk) {
            return;
        }

        // Top chunk optimization - try to move the top chunk out of this shard to prevent the
        // hot spot from staying on a single shard. This is based on the assumption that
        // succeeding inserts will fall on the top chunk.
        //
        // We need to use the latest chunk manager (after the split) in order to have the most
        // up-to-date view of the chunk we are about to move
        auto suggestedChunk = routingInfo.cm()->findIntersectingChunkWithSimpleCollation(
            suggestedMigrateChunk->getMin());

        ChunkType chunkToMove;
        chunkToMove.setNS(nss.ns());
        chunkToMove.setShard(suggestedChunk->getShardId());
        chunkToMove.setMin(suggestedChunk->getMin());
        chunkToMove.setMax(suggestedChunk->getMax());
        chunkToMove.setVersion(suggestedChunk->getLastmod());

        uassertStatusOK(configsvr_client::rebalanceChunk(opCtx, chunkToMove));

        // Ensure the collection gets reloaded because of the move
        Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
    } catch (const DBException& ex) {
        chunk->clearBytesWritten();

        if (ErrorCodes::isStaleShardingError(ErrorCodes::Error(ex.getCode()))) {
            log() << "Unable to auto-split chunk " << redact(chunkRange.toString()) << causedBy(ex)
                  << ", going to invalidate routing table entry for " << nss;
            Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss);
        }
    }
}
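// Illustrative sketch (not part of the original source) of the chunkSizeToUse heuristic above:
// with roughly two split points estimated per desired-size interval, a chunk that has absorbed
// far more writes than desiredChunkSize would produce an excessive number of split points, so
// the split size is raised. The constant value here is assumed for illustration; the real one
// lives in the source.
#include <algorithm>
#include <cstdint>

namespace {

const uint64_t kTooManySplitPointsExample = 8192;  // assumed value for illustration

uint64_t chunkSizeToUseFor(uint64_t chunkBytesWritten,
                           uint64_t desiredChunkSize,
                           uint64_t maxChunkSizeBytes) {
    // Two split points per desiredChunkSize-worth of data written
    const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2;
    if (estNumSplitPoints >= kTooManySplitPointsExample) {
        // Splitting at desiredChunkSize would create thousands of chunks; fall back to the
        // largest size the balancer allows
        return std::min(chunkBytesWritten, maxChunkSizeBytes);
    }
    return desiredChunkSize;
}

}  // namespace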
bool Chunk::splitIfShould(OperationContext* txn, long dataWritten) {
    LastError::Disabled d(&LastError::get(cc()));

    try {
        _dataWritten += dataWritten;
        uint64_t splitThreshold = _manager->getCurrentDesiredChunkSize();
        if (_minIsInf() || _maxIsInf()) {
            splitThreshold = static_cast<uint64_t>((double)splitThreshold * 0.9);
        }

        if (_dataWritten < splitThreshold / ChunkManager::SplitHeuristics::splitTestFactor) {
            return false;
        }

        if (!_manager->_splitHeuristics._splitTickets.tryAcquire()) {
            LOG(1) << "won't auto split because not enough tickets: " << _manager->getns();
            return false;
        }

        TicketHolderReleaser releaser(&(_manager->_splitHeuristics._splitTickets));

        const auto balancerConfig = Grid::get(txn)->getBalancerConfiguration();

        Status refreshStatus = balancerConfig->refreshAndCheck(txn);
        if (!refreshStatus.isOK()) {
            warning() << "Unable to refresh balancer settings" << causedBy(refreshStatus);
            return false;
        }

        bool shouldAutoSplit = balancerConfig->getShouldAutoSplit();
        if (!shouldAutoSplit) {
            return false;
        }

        LOG(1) << "about to initiate autosplit: " << *this << " dataWritten: " << _dataWritten
               << " splitThreshold: " << splitThreshold;

        size_t splitCount = 0;
        auto splitStatus = split(txn, Chunk::autoSplitInternal, &splitCount);
        if (!splitStatus.isOK()) {
            // split() will already have logged a message if we got here. This means there
            // wasn't enough data to split, so don't try again until considerably more data has
            // been written
            _dataWritten = 0;
            return false;
        }

        if (_maxIsInf() || _minIsInf()) {
            // we don't want to reset _dataWritten since we kind of want to check the other side
            // right away
        } else {
            // we're splitting, so should wait a bit
            _dataWritten = 0;
        }

        bool shouldBalance = balancerConfig->shouldBalanceForAutoSplit();
        if (shouldBalance) {
            auto collStatus = grid.catalogClient(txn)->getCollection(txn, _manager->getns());
            if (!collStatus.isOK()) {
                warning() << "Auto-split for " << _manager->getns()
                          << " failed to load collection metadata"
                          << causedBy(collStatus.getStatus());
                return false;
            }

            shouldBalance = collStatus.getValue().value.getAllowBalance();
        }

        const auto suggestedMigrateChunk = std::move(splitStatus.getValue());

        // Note: the original ternary printed the "(migrate suggested" suffix when no migration
        // was suggested; the condition is fixed here so the message matches reality
        log() << "autosplitted " << _manager->getns() << " shard: " << toString() << " into "
              << (splitCount + 1) << " (splitThreshold " << splitThreshold << ")"
              << (suggestedMigrateChunk
                      ? (string) " (migrate suggested" +
                          (shouldBalance ? ")" : ", but no migrations allowed)")
                      : "");

        // Top chunk optimization - try to move the top chunk out of this shard to prevent the
        // hot spot from staying on a single shard. This is based on the assumption that
        // succeeding inserts will fall on the top chunk.
        if (suggestedMigrateChunk && shouldBalance) {
            const NamespaceString nss(_manager->getns());

            // We need to use the latest chunk manager (after the split) in order to have the
            // most up-to-date view of the chunk we are about to move
            auto scopedCM = uassertStatusOK(ScopedChunkManager::getExisting(txn, nss));
            auto suggestedChunk =
                scopedCM.cm()->findIntersectingChunk(txn, suggestedMigrateChunk->getMin());

            ChunkType chunkToMove;
            chunkToMove.setNS(nss.ns());
            chunkToMove.setShard(suggestedChunk->getShardId());
            chunkToMove.setMin(suggestedChunk->getMin());
            chunkToMove.setMax(suggestedChunk->getMax());
            chunkToMove.setVersion(suggestedChunk->getLastmod());

            Status rebalanceStatus = Balancer::get(txn)->rebalanceSingleChunk(txn, chunkToMove);
            if (!rebalanceStatus.isOK()) {
                msgassertedNoTraceWithStatus(10412, rebalanceStatus);
            }

            _manager->reload(txn);
        }

        return true;
    } catch (const DBException& e) {
        // TODO: Make this better - there are lots of reasons a split could fail
        //
        // Random so that we don't sync up with other failed splits
        _dataWritten = mkDataWritten();

        // if the collection lock is taken (e.g. we're migrating), it is fine for the split to
        // fail.
        warning() << "could not autosplit collection " << _manager->getns() << causedBy(e);
        return false;
    }
}
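// Illustrative sketch (not part of the original source) of the trigger arithmetic in
// splitIfShould above: a split is even attempted only once the bytes written since the last
// reset exceed splitThreshold / splitTestFactor, and edge chunks (whose min or max is at
// infinity) use a 10% lower threshold since they absorb monotonic inserts. The factor value
// here is assumed for illustration.
#include <cstdint>

namespace {

const int kSplitTestFactorExample = 5;  // assumed value for illustration

bool shouldAttemptSplit(uint64_t dataWritten, uint64_t desiredChunkSize, bool isEdgeChunk) {
    uint64_t splitThreshold = desiredChunkSize;
    if (isEdgeChunk) {
        // Try to split first/last chunks a little earlier
        splitThreshold = static_cast<uint64_t>(splitThreshold * 0.9);
    }
    return dataWritten >= splitThreshold / kSplitTestFactorExample;
}

}  // namespace

// e.g. with a 64 MB desired chunk size, an interior chunk is checked for split points after
// roughly 12.8 MB of writes; an edge chunk after roughly 11.5 MB.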