/**
 * Replaces the cached metadata for 'ns' with the result of CollectionMetadata::cloneMigrate()
 * for the chunk [min, max) at 'version'.
 *
 * Preconditions checked here: the collection 'ns' is locked in MODE_X (invariant) and an entry
 * for 'ns' already exists in the metadata cache (verify).
 *
 * Throws via uassert(16855), carrying the message produced by cloneMigrate(), when cloneMigrate()
 * does not return new metadata.
 */
void ShardingState::donateChunk(OperationContext* txn,
                                const string& ns,
                                const BSONObj& min,
                                const BSONObj& max,
                                ChunkVersion version) {
    invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X));

    stdx::lock_guard<stdx::mutex> lk(_mutex);

    CollectionMetadataMap::const_iterator found = _collMetadata.find(ns);
    verify(found != _collMetadata.end());
    shared_ptr<CollectionMetadata> current = found->second;

    // Empty shards should have version 0: unless more than one chunk is currently tracked, the
    // supplied version is discarded in favour of a zero version carrying the collection's epoch.
    if (!(current->getNumChunks() > 1)) {
        version = ChunkVersion(0, 0, current->getCollVersion().epoch());
    }

    ChunkType donated;
    donated.setMin(min);
    donated.setMax(max);

    string errMsg;
    shared_ptr<CollectionMetadata> updated(current->cloneMigrate(donated, version, &errMsg));

    // uassert to match old behavior, TODO: report errors w/o throwing
    uassert(16855, errMsg, updated.get() != NULL);

    // TODO: a bit dangerous to have two different zero-version states - no-metadata and
    // no-version
    _collMetadata[ns] = updated;
}
void MetadataManager::beginReceive(const ChunkRange& range) { stdx::lock_guard<stdx::mutex> scopedLock(_managerLock); // Collection is not known to be sharded if the active metadata tracker is null invariant(_activeMetadataTracker); // If range is contained within pending chunks, this means a previous migration must have failed // and we need to clean all overlaps RangeVector overlappedChunks; getRangeMapOverlap(_receivingChunks, range.getMin(), range.getMax(), &overlappedChunks); for (const auto& overlapChunkMin : overlappedChunks) { auto itRecv = _receivingChunks.find(overlapChunkMin.first); invariant(itRecv != _receivingChunks.end()); const ChunkRange receivingRange(itRecv->first, itRecv->second); _receivingChunks.erase(itRecv); // Make sure any potentially partially copied chunks are scheduled to be cleaned up _addRangeToClean_inlock(receivingRange); } // Need to ensure that the background range deleter task won't delete the range we are about to // receive _removeRangeToClean_inlock(range); _receivingChunks.insert(std::make_pair(range.getMin().getOwned(), range.getMax().getOwned())); // For compatibility with the current range deleter, update the pending chunks on the collection // metadata to include the chunk being received ChunkType chunk; chunk.setMin(range.getMin()); chunk.setMax(range.getMax()); _setActiveMetadata_inlock(_activeMetadataTracker->metadata->clonePlusPending(chunk)); }
void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ChunkVersion version ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); verify( it != _collMetadata.end() ) ; CollectionMetadataPtr p = it->second; // empty shards should have version 0 version = ( p->getNumChunks() > 1 ) ? version : ChunkVersion( 0, 0, p->getCollVersion().epoch() ); ChunkType chunk; chunk.setMin( min ); chunk.setMax( max ); string errMsg; CollectionMetadataPtr cloned( p->cloneMigrate( chunk, version, &errMsg ) ); // uassert to match old behavior, TODO: report errors w/o throwing uassert( 16855, errMsg, NULL != cloned.get() ); // TODO: a bit dangerous to have two different zero-version states - no-metadata and // no-version _collMetadata[ns] = cloned; }
/**
 * Builds a MigrateInfo from this object's fields: a ChunkType populated with _nss, _fromShard,
 * _min, _max and _chunkVersion, paired with _toShard.
 */
MigrateInfo MigrationType::toMigrateInfo() const {
    ChunkType migratingChunk;

    // Chunk bounds
    migratingChunk.setMin(_min);
    migratingChunk.setMax(_max);

    // Chunk identity
    migratingChunk.setNS(_nss);
    migratingChunk.setShard(_fromShard);
    migratingChunk.setVersion(_chunkVersion);

    return MigrateInfo(_toShard, migratingChunk);
}
/**
 * Writes the initial set of chunk documents for this collection to the config collection.
 *
 * Split points and target shards come from calcInitSplitsAndShards(). One chunk is created per
 * gap between consecutive split points (plus the two open ends), and chunks are assigned to the
 * computed shards round-robin. All chunks share a freshly generated epoch; the minor version is
 * bumped after each successful insert.
 *
 * Returns the failing insert's error code with a descriptive message on the first failure,
 * otherwise Status::OK() after setting _version to (0, 0, <new epoch>).
 */
Status ChunkManager::createFirstChunks(OperationContext* txn,
                                       const ShardId& primaryShardId,
                                       const vector<BSONObj>* initPoints,
                                       const set<ShardId>* initShardIds) {
    // TODO distlock?
    // TODO: Race condition if we shard the collection and insert data while we split across
    // the non-primary shard.

    vector<BSONObj> splitPoints;
    vector<ShardId> shardIds;
    calcInitSplitsAndShards(txn, primaryShardId, initPoints, initShardIds, &splitPoints, &shardIds);

    // this is the first chunk; start the versioning from scratch
    ChunkVersion version(1, 0, OID::gen());

    log() << "going to create " << splitPoints.size() + 1 << " chunk(s) for: " << _ns
          << " using new epoch " << version.epoch();

    for (unsigned idx = 0; idx <= splitPoints.size(); ++idx) {
        // The first chunk starts at the global minimum, every other one at the previous split
        // point.
        BSONObj lowerBound;
        if (idx == 0) {
            lowerBound = _keyPattern.getKeyPattern().globalMin();
        } else {
            lowerBound = splitPoints[idx - 1];
        }

        // The last chunk ends at the global maximum, every other one at its own split point.
        BSONObj upperBound;
        if (idx < splitPoints.size()) {
            upperBound = splitPoints[idx];
        } else {
            upperBound = _keyPattern.getKeyPattern().globalMax();
        }

        ChunkType newChunk;
        newChunk.setName(Chunk::genID(_ns, lowerBound));
        newChunk.setNS(_ns);
        newChunk.setMin(lowerBound);
        newChunk.setMax(upperBound);
        newChunk.setShard(shardIds[idx % shardIds.size()]);
        newChunk.setVersion(version);

        Status insertStatus = grid.catalogManager(txn)->insertConfigDocument(
            txn, ChunkType::ConfigNS, newChunk.toBSON());
        if (insertStatus.isOK()) {
            version.incMinor();
            continue;
        }

        const string errMsg = str::stream() << "Creating first chunks failed: "
                                            << insertStatus.reason();
        error() << errMsg;
        return Status(insertStatus.code(), errMsg);
    }

    _version = ChunkVersion(0, 0, version.epoch());

    return Status::OK();
}
bool ShardingState::notePending(OperationContext* txn, const string& ns, const BSONObj& min, const BSONObj& max, const OID& epoch, string* errMsg) { invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X)); stdx::lock_guard<stdx::mutex> lk(_mutex); CollectionMetadataMap::const_iterator it = _collMetadata.find(ns); if (it == _collMetadata.end()) { *errMsg = str::stream() << "could not note chunk " << "[" << min << "," << max << ")" << " as pending because the local metadata for " << ns << " has changed"; return false; } shared_ptr<CollectionMetadata> metadata = it->second; // This can currently happen because drops aren't synchronized with in-migrations // The idea for checking this here is that in the future we shouldn't have this problem if (metadata->getCollVersion().epoch() != epoch) { *errMsg = str::stream() << "could not note chunk " << "[" << min << "," << max << ")" << " as pending because the epoch for " << ns << " has changed from " << epoch << " to " << metadata->getCollVersion().epoch(); return false; } ChunkType chunk; chunk.setMin(min); chunk.setMax(max); shared_ptr<CollectionMetadata> cloned(metadata->clonePlusPending(chunk, errMsg)); if (!cloned) return false; _collMetadata[ns] = cloned; return true; }
bool ShardingState::forgetPending( const string& ns, const BSONObj& min, const BSONObj& max, const OID& epoch, string* errMsg ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); if ( it == _collMetadata.end() ) { *errMsg = str::stream() << "no need to forget pending chunk " << "[" << min << "," << max << ")" << " because the local metadata for " << ns << " has changed"; return false; } CollectionMetadataPtr metadata = it->second; // This can currently happen because drops aren't synchronized with in-migrations // The idea for checking this here is that in the future we shouldn't have this problem if ( metadata->getCollVersion().epoch() != epoch ) { *errMsg = str::stream() << "no need to forget pending chunk " << "[" << min << "," << max << ")" << " because the epoch for " << ns << " has changed from " << epoch << " to " << metadata->getCollVersion().epoch(); return false; } ChunkType chunk; chunk.setMin( min ); chunk.setMax( max ); CollectionMetadataPtr cloned( metadata->cloneMinusPending( chunk, errMsg ) ); if ( !cloned ) return false; _collMetadata[ns] = cloned; return true; }
void ShardingState::splitChunk( const string& ns, const BSONObj& min, const BSONObj& max, const vector<BSONObj>& splitKeys, ChunkVersion version ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); verify( it != _collMetadata.end() ) ; ChunkType chunk; chunk.setMin( min ); chunk.setMax( max ); string errMsg; CollectionMetadataPtr cloned( it->second->cloneSplit( chunk, splitKeys, version, &errMsg ) ); // uassert to match old behavior, TODO: report errors w/o throwing uassert( 16857, errMsg, NULL != cloned.get() ); _collMetadata[ns] = cloned; }
/** * Stores ranges for a particular collection and shard starting from some version */ void storeCollectionRanges( const NamespaceString& nss, const string& shardName, const vector<KeyRange>& ranges, const ChunkVersion& startVersion ) { // Get key pattern from first range ASSERT_GREATER_THAN( ranges.size(), 0u ); CollectionType coll; coll.setNS( nss.ns() ); coll.setKeyPattern( ranges.begin()->keyPattern ); coll.setEpoch( startVersion.epoch() ); coll.setUpdatedAt( 1ULL ); string errMsg; ASSERT( coll.isValid( &errMsg ) ); DBDirectClient client(&_txn); client.update( CollectionType::ConfigNS, BSON( CollectionType::ns( coll.getNS() ) ), coll.toBSON(), true, false ); ChunkVersion nextVersion = startVersion; for ( vector<KeyRange>::const_iterator it = ranges.begin(); it != ranges.end(); ++it ) { ChunkType chunk; // TODO: We should not rely on the serialized ns, minkey being unique in the future, // causes problems since it links string serialization to correctness. chunk.setName( Chunk::genID( nss, it->minKey ) ); chunk.setShard( shardName ); chunk.setNS( nss.ns() ); chunk.setVersion( nextVersion ); chunk.setMin( it->minKey ); chunk.setMax( it->maxKey ); nextVersion.incMajor(); client.insert( ChunkType::ConfigNS, chunk.toBSON() ); } }
void MetadataManager::forgetReceive(const ChunkRange& range) { stdx::lock_guard<stdx::mutex> scopedLock(_managerLock); { auto it = _receivingChunks.find(range.getMin()); invariant(it != _receivingChunks.end()); // Verify entire ChunkRange is identical, not just the min key. invariant(it->second == range.getMax()); _receivingChunks.erase(it); } // This is potentially a partially received data, which needs to be cleaned up _addRangeToClean_inlock(range); // For compatibility with the current range deleter, update the pending chunks on the collection // metadata to exclude the chunk being received, which was added in beginReceive ChunkType chunk; chunk.setMin(range.getMin()); chunk.setMax(range.getMax()); _setActiveMetadata_inlock(_activeMetadataTracker->metadata->cloneMinusPending(chunk)); }
/**
 * Replaces the cached metadata for 'ns' with the result of CollectionMetadata::cloneSplit() for
 * the chunk [min, max), the given split keys and 'version'.
 *
 * Preconditions checked here: the collection 'ns' is locked in MODE_X (invariant) and an entry
 * for 'ns' already exists in the metadata cache (verify).
 *
 * Throws via uassert(16857), carrying the message produced by cloneSplit(), when no new metadata
 * is returned.
 */
void ShardingState::splitChunk(OperationContext* txn,
                               const string& ns,
                               const BSONObj& min,
                               const BSONObj& max,
                               const vector<BSONObj>& splitKeys,
                               ChunkVersion version) {
    invariant(txn->lockState()->isCollectionLockedForMode(ns, MODE_X));

    stdx::lock_guard<stdx::mutex> lk(_mutex);

    CollectionMetadataMap::const_iterator found = _collMetadata.find(ns);
    verify(found != _collMetadata.end());

    ChunkType original;
    original.setMin(min);
    original.setMax(max);

    string errMsg;
    shared_ptr<CollectionMetadata> updated(
        found->second->cloneSplit(original, splitKeys, version, &errMsg));

    // uassert to match old behavior, TODO: report errors w/o throwing
    uassert(16857, errMsg, updated.get() != NULL);

    _collMetadata[ns] = updated;
}
void DistributionStatus::populateShardToChunksMap(const ShardStatisticsVector& allShards, const ChunkManager& chunkMgr, ShardToChunksMap* shardToChunksMap) { // Makes sure there is an entry in shardToChunksMap for every shard. for (const auto& stat : allShards) { (*shardToChunksMap)[stat.shardId]; } const ChunkMap& chunkMap = chunkMgr.getChunkMap(); for (ChunkMap::const_iterator it = chunkMap.begin(); it != chunkMap.end(); ++it) { const ChunkPtr chunkPtr = it->second; ChunkType chunk; chunk.setNS(chunkMgr.getns()); chunk.setMin(chunkPtr->getMin().getOwned()); chunk.setMax(chunkPtr->getMax().getOwned()); chunk.setJumbo(chunkPtr->isJumbo()); // TODO: is this reliable? const string shardName(chunkPtr->getShardId()); chunk.setShard(shardName); (*shardToChunksMap)[shardName].push_back(chunk); } }
bool Chunk::splitIfShould(long dataWritten) const { dassert(ShouldAutoSplit); LastError::Disabled d(&LastError::get(cc())); try { _dataWritten += dataWritten; int splitThreshold = getManager()->getCurrentDesiredChunkSize(); if (_minIsInf() || _maxIsInf()) { splitThreshold = (int)((double)splitThreshold * .9); } if (_dataWritten < splitThreshold / ChunkManager::SplitHeuristics::splitTestFactor) return false; if (!getManager()->_splitHeuristics._splitTickets.tryAcquire()) { LOG(1) << "won't auto split because not enough tickets: " << getManager()->getns(); return false; } TicketHolderReleaser releaser(&(getManager()->_splitHeuristics._splitTickets)); // this is a bit ugly // we need it so that mongos blocks for the writes to actually be committed // this does mean mongos has more back pressure than mongod alone // since it nots 100% tcp queue bound // this was implicit before since we did a splitVector on the same socket ShardConnection::sync(); LOG(1) << "about to initiate autosplit: " << *this << " dataWritten: " << _dataWritten << " splitThreshold: " << splitThreshold; BSONObj res; size_t splitCount = 0; Status status = split(Chunk::autoSplitInternal, &splitCount, &res); if (!status.isOK()) { // Split would have issued a message if we got here. 
This means there wasn't enough // data to split, so don't want to try again until considerable more data _dataWritten = 0; return false; } if (_maxIsInf() || _minIsInf()) { // we don't want to reset _dataWritten since we kind of want to check the other side // right away } else { // we're splitting, so should wait a bit _dataWritten = 0; } bool shouldBalance = grid.getConfigShouldBalance(); if (shouldBalance) { auto status = grid.catalogManager()->getCollection(_manager->getns()); if (!status.isOK()) { log() << "Auto-split for " << _manager->getns() << " failed to load collection metadata due to " << status.getStatus(); return false; } shouldBalance = status.getValue().getAllowBalance(); } log() << "autosplitted " << _manager->getns() << " shard: " << toString() << " into " << (splitCount + 1) << " (splitThreshold " << splitThreshold << ")" << (res["shouldMigrate"].eoo() ? "" : (string) " (migrate suggested" + (shouldBalance ? ")" : ", but no migrations allowed)")); // Top chunk optimization - try to move the top chunk out of this shard // to prevent the hot spot from staying on a single shard. This is based on // the assumption that succeeding inserts will fall on the top chunk. BSONElement shouldMigrate = res["shouldMigrate"]; // not in mongod < 1.9.1 but that is ok if (!shouldMigrate.eoo() && shouldBalance) { BSONObj range = shouldMigrate.embeddedObject(); ChunkType chunkToMove; { const auto shard = grid.shardRegistry()->getShard(getShardId()); chunkToMove.setShard(shard->toString()); } chunkToMove.setMin(range["min"].embeddedObject()); chunkToMove.setMax(range["max"].embeddedObject()); tryMoveToOtherShard(*_manager, chunkToMove); } return true; } catch (DBException& e) { // TODO: Make this better - there are lots of reasons a split could fail // Random so that we don't sync up with other failed splits _dataWritten = mkDataWritten(); // if the collection lock is taken (e.g. we're migrating), it is fine for the split to fail. 
warning() << "could not autosplit collection " << _manager->getns() << causedBy(e); return false; } }
bool mergeChunks( OperationContext* txn, const NamespaceString& nss, const BSONObj& minKey, const BSONObj& maxKey, const OID& epoch, string* errMsg ) { // // Get sharding state up-to-date // ConnectionString configLoc = ConnectionString::parse( shardingState.getConfigServer(), *errMsg ); if ( !configLoc.isValid() ){ warning() << *errMsg << endl; return false; } // // Get the distributed lock // ScopedDistributedLock collLock( configLoc, nss.ns() ); collLock.setLockMessage( stream() << "merging chunks in " << nss.ns() << " from " << minKey << " to " << maxKey ); Status acquisitionStatus = collLock.tryAcquire(); if (!acquisitionStatus.isOK()) { *errMsg = stream() << "could not acquire collection lock for " << nss.ns() << " to merge chunks in [" << minKey << "," << maxKey << ")" << causedBy(acquisitionStatus); warning() << *errMsg << endl; return false; } // // We now have the collection lock, refresh metadata to latest version and sanity check // ChunkVersion shardVersion; Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion); if ( !status.isOK() ) { *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for " << nss.ns() << causedBy( status.reason() ); warning() << *errMsg << endl; return false; } if ( epoch.isSet() && shardVersion.epoch() != epoch ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed" << " since merge was sent" << "(sent epoch : " << epoch.toString() << ", current epoch : " << shardVersion.epoch().toString() << ")"; warning() << *errMsg << endl; return false; } CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( nss.ns() ); if ( !metadata || metadata->getKeyPattern().isEmpty() ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " is not sharded"; warning() << *errMsg << endl; return false; } dassert( metadata->getShardVersion().equals( shardVersion ) ); if ( !metadata->isValidKey( minKey ) || !metadata->isValidKey( 
maxKey ) ) { *errMsg = stream() << "could not merge chunks, the range " << rangeToString( minKey, maxKey ) << " is not valid" << " for collection " << nss.ns() << " with key pattern " << metadata->getKeyPattern(); warning() << *errMsg << endl; return false; } // // Get merged chunk information // ChunkVersion mergeVersion = metadata->getCollVersion(); mergeVersion.incMinor(); OwnedPointerVector<ChunkType> chunksToMerge; ChunkType itChunk; itChunk.setMin( minKey ); itChunk.setMax( minKey ); itChunk.setNS( nss.ns() ); itChunk.setShard( shardingState.getShardName() ); while ( itChunk.getMax().woCompare( maxKey ) < 0 && metadata->getNextChunk( itChunk.getMax(), &itChunk ) ) { auto_ptr<ChunkType> saved( new ChunkType ); itChunk.cloneTo( saved.get() ); chunksToMerge.mutableVector().push_back( saved.release() ); } if ( chunksToMerge.empty() ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " and ending at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } // // Validate the range starts and ends at chunks and has no holes, error if not valid // BSONObj firstDocMin = ( *chunksToMerge.begin() )->getMin(); BSONObj firstDocMax = ( *chunksToMerge.begin() )->getMax(); // minKey is inclusive bool minKeyInRange = rangeContains( firstDocMin, firstDocMax, minKey ); if ( !minKeyInRange ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } BSONObj lastDocMin = ( *chunksToMerge.rbegin() )->getMin(); BSONObj lastDocMax = ( *chunksToMerge.rbegin() )->getMax(); // maxKey is exclusive bool maxKeyInRange = lastDocMin.woCompare( maxKey ) < 0 && lastDocMax.woCompare( maxKey ) >= 0; if ( !maxKeyInRange ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range ending 
at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } bool validRangeStartKey = firstDocMin.woCompare( minKey ) == 0; bool validRangeEndKey = lastDocMax.woCompare( maxKey ) == 0; if ( !validRangeStartKey || !validRangeEndKey ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " does not contain a chunk " << ( !validRangeStartKey ? "starting at " + minKey.toString() : "" ) << ( !validRangeStartKey && !validRangeEndKey ? " or " : "" ) << ( !validRangeEndKey ? "ending at " + maxKey.toString() : "" ); warning() << *errMsg << endl; return false; } if ( chunksToMerge.size() == 1 ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " already contains chunk for " << rangeToString( minKey, maxKey ); warning() << *errMsg << endl; return false; } bool holeInRange = false; // Look for hole in range ChunkType* prevChunk = *chunksToMerge.begin(); ChunkType* nextChunk = NULL; for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin(); it != chunksToMerge.end(); ++it ) { if ( it == chunksToMerge.begin() ) continue; nextChunk = *it; if ( prevChunk->getMax().woCompare( nextChunk->getMin() ) != 0 ) { holeInRange = true; break; } prevChunk = nextChunk; } if ( holeInRange ) { dassert( NULL != nextChunk ); *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has a hole in the range " << rangeToString( minKey, maxKey ) << " at " << rangeToString( prevChunk->getMax(), nextChunk->getMin() ); warning() << *errMsg << endl; return false; } // // Run apply ops command // BSONObj applyOpsCmd = buildApplyOpsCmd( chunksToMerge, shardVersion, mergeVersion ); bool ok; BSONObj result; try { ScopedDbConnection conn( configLoc, 30.0 ); ok = conn->runCommand( "config", applyOpsCmd, result ); if ( !ok ) *errMsg = result.toString(); conn.done(); } catch( const DBException& ex ) { ok = false; *errMsg = ex.toString(); } if ( !ok ) { *errMsg 
= stream() << "could not merge chunks for " << nss.ns() << ", writing to config failed" << causedBy( errMsg ); warning() << *errMsg << endl; return false; } // // Install merged chunk metadata // { Lock::DBLock writeLk(txn->lockState(), nss.db(), newlm::MODE_X); shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion); } // // Log change // BSONObj mergeLogEntry = buildMergeLogEntry( chunksToMerge, shardVersion, mergeVersion ); configServer.logChange( "merge", nss.ns(), mergeLogEntry ); return true; }
bool mergeChunks(OperationContext* txn, const NamespaceString& nss, const BSONObj& minKey, const BSONObj& maxKey, const OID& epoch, string* errMsg) { // Get the distributed lock string whyMessage = stream() << "merging chunks in " << nss.ns() << " from " << minKey << " to " << maxKey; auto scopedDistLock = grid.catalogManager(txn)->distLock( txn, nss.ns(), whyMessage, DistLockManager::kSingleLockAttemptTimeout); if (!scopedDistLock.isOK()) { *errMsg = stream() << "could not acquire collection lock for " << nss.ns() << " to merge chunks in [" << minKey << "," << maxKey << ")" << causedBy(scopedDistLock.getStatus()); warning() << *errMsg; return false; } ShardingState* shardingState = ShardingState::get(txn); // // We now have the collection lock, refresh metadata to latest version and sanity check // ChunkVersion shardVersion; Status status = shardingState->refreshMetadataNow(txn, nss.ns(), &shardVersion); if (!status.isOK()) { *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for " << nss.ns() << causedBy(status.reason()); warning() << *errMsg; return false; } if (epoch.isSet() && shardVersion.epoch() != epoch) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed" << " since merge was sent" << "(sent epoch : " << epoch.toString() << ", current epoch : " << shardVersion.epoch().toString() << ")"; warning() << *errMsg; return false; } shared_ptr<CollectionMetadata> metadata = shardingState->getCollectionMetadata(nss.ns()); if (!metadata || metadata->getKeyPattern().isEmpty()) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " is not sharded"; warning() << *errMsg; return false; } dassert(metadata->getShardVersion().equals(shardVersion)); if (!metadata->isValidKey(minKey) || !metadata->isValidKey(maxKey)) { *errMsg = stream() << "could not merge chunks, the range " << rangeToString(minKey, maxKey) << " is not valid" << " for collection " << nss.ns() << " with key pattern " 
<< metadata->getKeyPattern(); warning() << *errMsg; return false; } // // Get merged chunk information // ChunkVersion mergeVersion = metadata->getCollVersion(); mergeVersion.incMinor(); std::vector<ChunkType> chunksToMerge; ChunkType itChunk; itChunk.setMin(minKey); itChunk.setMax(minKey); itChunk.setNS(nss.ns()); itChunk.setShard(shardingState->getShardName()); while (itChunk.getMax().woCompare(maxKey) < 0 && metadata->getNextChunk(itChunk.getMax(), &itChunk)) { chunksToMerge.push_back(itChunk); } if (chunksToMerge.empty()) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " and ending at " << maxKey << " does not belong to shard " << shardingState->getShardName(); warning() << *errMsg; return false; } // // Validate the range starts and ends at chunks and has no holes, error if not valid // BSONObj firstDocMin = chunksToMerge.front().getMin(); BSONObj firstDocMax = chunksToMerge.front().getMax(); // minKey is inclusive bool minKeyInRange = rangeContains(firstDocMin, firstDocMax, minKey); if (!minKeyInRange) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " does not belong to shard " << shardingState->getShardName(); warning() << *errMsg; return false; } BSONObj lastDocMin = chunksToMerge.back().getMin(); BSONObj lastDocMax = chunksToMerge.back().getMax(); // maxKey is exclusive bool maxKeyInRange = lastDocMin.woCompare(maxKey) < 0 && lastDocMax.woCompare(maxKey) >= 0; if (!maxKeyInRange) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range ending at " << maxKey << " does not belong to shard " << shardingState->getShardName(); warning() << *errMsg; return false; } bool validRangeStartKey = firstDocMin.woCompare(minKey) == 0; bool validRangeEndKey = lastDocMax.woCompare(maxKey) == 0; if (!validRangeStartKey || !validRangeEndKey) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " 
does not contain a chunk " << (!validRangeStartKey ? "starting at " + minKey.toString() : "") << (!validRangeStartKey && !validRangeEndKey ? " or " : "") << (!validRangeEndKey ? "ending at " + maxKey.toString() : ""); warning() << *errMsg; return false; } if (chunksToMerge.size() == 1) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " already contains chunk for " << rangeToString(minKey, maxKey); warning() << *errMsg; return false; } // Look for hole in range for (size_t i = 1; i < chunksToMerge.size(); ++i) { if (chunksToMerge[i - 1].getMax().woCompare(chunksToMerge[i].getMin()) != 0) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has a hole in the range " << rangeToString(minKey, maxKey) << " at " << rangeToString(chunksToMerge[i - 1].getMax(), chunksToMerge[i].getMin()); warning() << *errMsg; return false; } } // // Run apply ops command // Status applyOpsStatus = runApplyOpsCmd(txn, chunksToMerge, shardVersion, mergeVersion); if (!applyOpsStatus.isOK()) { warning() << applyOpsStatus; return false; } // // Install merged chunk metadata // { ScopedTransaction transaction(txn, MODE_IX); Lock::DBLock writeLk(txn->lockState(), nss.db(), MODE_IX); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X); shardingState->mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion); } // // Log change // BSONObj mergeLogEntry = buildMergeLogEntry(chunksToMerge, shardVersion, mergeVersion); grid.catalogManager(txn)->logChange(txn, "merge", nss.ns(), mergeLogEntry); return true; }
void updateChunkWriteStatsAndSplitIfNeeded(OperationContext* opCtx, ChunkManager* manager, Chunk* chunk, long dataWritten) { // Disable lastError tracking so that any errors, which occur during auto-split do not get // bubbled up on the client connection doing a write. LastError::Disabled d(&LastError::get(cc())); const auto balancerConfig = Grid::get(opCtx)->getBalancerConfiguration(); const bool minIsInf = (0 == manager->getShardKeyPattern().getKeyPattern().globalMin().woCompare(chunk->getMin())); const bool maxIsInf = (0 == manager->getShardKeyPattern().getKeyPattern().globalMax().woCompare(chunk->getMax())); const uint64_t chunkBytesWritten = chunk->addBytesWritten(dataWritten); const uint64_t desiredChunkSize = calculateDesiredChunkSize(balancerConfig->getMaxChunkSizeBytes(), manager->numChunks()); if (!chunk->shouldSplit(desiredChunkSize, minIsInf, maxIsInf)) { return; } const NamespaceString nss(manager->getns()); if (!manager->_autoSplitThrottle._splitTickets.tryAcquire()) { LOG(1) << "won't auto split because not enough tickets: " << nss; return; } TicketHolderReleaser releaser(&(manager->_autoSplitThrottle._splitTickets)); const ChunkRange chunkRange(chunk->getMin(), chunk->getMax()); try { // Ensure we have the most up-to-date balancer configuration uassertStatusOK(balancerConfig->refreshAndCheck(opCtx)); if (!balancerConfig->getShouldAutoSplit()) { return; } LOG(1) << "about to initiate autosplit: " << redact(chunk->toString()) << " dataWritten: " << chunkBytesWritten << " desiredChunkSize: " << desiredChunkSize; const uint64_t chunkSizeToUse = [&]() { const uint64_t estNumSplitPoints = chunkBytesWritten / desiredChunkSize * 2; if (estNumSplitPoints >= kTooManySplitPoints) { // The current desired chunk size will split the chunk into lots of small chunk and // at the worst case this can result into thousands of chunks. So check and see if a // bigger value can be used. 
return std::min(chunkBytesWritten, balancerConfig->getMaxChunkSizeBytes()); } else { return desiredChunkSize; } }(); auto splitPoints = uassertStatusOK(shardutil::selectChunkSplitPoints(opCtx, chunk->getShardId(), nss, manager->getShardKeyPattern(), chunkRange, chunkSizeToUse, boost::none)); if (splitPoints.size() <= 1) { // No split points means there isn't enough data to split on; 1 split point means we // have // between half the chunk size to full chunk size so there is no need to split yet chunk->clearBytesWritten(); return; } if (minIsInf || maxIsInf) { // We don't want to reset _dataWritten since we want to check the other side right away } else { // We're splitting, so should wait a bit chunk->clearBytesWritten(); } // We assume that if the chunk being split is the first (or last) one on the collection, // this chunk is likely to see more insertions. Instead of splitting mid-chunk, we use the // very first (or last) key as a split point. // // This heuristic is skipped for "special" shard key patterns that are not likely to produce // monotonically increasing or decreasing values (e.g. hashed shard keys). 
if (KeyPattern::isOrderedKeyPattern(manager->getShardKeyPattern().toBSON())) { if (minIsInf) { BSONObj key = findExtremeKeyForShard( opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), true); if (!key.isEmpty()) { splitPoints.front() = key.getOwned(); } } else if (maxIsInf) { BSONObj key = findExtremeKeyForShard( opCtx, nss, chunk->getShardId(), manager->getShardKeyPattern(), false); if (!key.isEmpty()) { splitPoints.back() = key.getOwned(); } } } const auto suggestedMigrateChunk = uassertStatusOK(shardutil::splitChunkAtMultiplePoints(opCtx, chunk->getShardId(), nss, manager->getShardKeyPattern(), manager->getVersion(), chunkRange, splitPoints)); // Balance the resulting chunks if the option is enabled and if the shard suggested a chunk // to balance const bool shouldBalance = [&]() { if (!balancerConfig->shouldBalanceForAutoSplit()) return false; auto collStatus = Grid::get(opCtx)->catalogClient()->getCollection(opCtx, manager->getns()); if (!collStatus.isOK()) { log() << "Auto-split for " << nss << " failed to load collection metadata" << causedBy(redact(collStatus.getStatus())); return false; } return collStatus.getValue().value.getAllowBalance(); }(); log() << "autosplitted " << nss << " chunk: " << redact(chunk->toString()) << " into " << (splitPoints.size() + 1) << " parts (desiredChunkSize " << desiredChunkSize << ")" << (suggestedMigrateChunk ? "" : (std::string) " (migrate suggested" + (shouldBalance ? ")" : ", but no migrations allowed)")); // Reload the chunk manager after the split auto routingInfo = uassertStatusOK( Grid::get(opCtx)->catalogCache()->getShardedCollectionRoutingInfoWithRefresh(opCtx, nss)); if (!shouldBalance || !suggestedMigrateChunk) { return; } // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot // spot from staying on a single shard. This is based on the assumption that succeeding // inserts will fall on the top chunk. 
// We need to use the latest chunk manager (after the split) in order to have the most // up-to-date view of the chunk we are about to move auto suggestedChunk = routingInfo.cm()->findIntersectingChunkWithSimpleCollation( suggestedMigrateChunk->getMin()); ChunkType chunkToMove; chunkToMove.setNS(nss.ns()); chunkToMove.setShard(suggestedChunk->getShardId()); chunkToMove.setMin(suggestedChunk->getMin()); chunkToMove.setMax(suggestedChunk->getMax()); chunkToMove.setVersion(suggestedChunk->getLastmod()); uassertStatusOK(configsvr_client::rebalanceChunk(opCtx, chunkToMove)); // Ensure the collection gets reloaded because of the move Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss); } catch (const DBException& ex) { chunk->clearBytesWritten(); if (ErrorCodes::isStaleShardingError(ErrorCodes::Error(ex.getCode()))) { log() << "Unable to auto-split chunk " << redact(chunkRange.toString()) << causedBy(ex) << ", going to invalidate routing table entry for " << nss; Grid::get(opCtx)->catalogCache()->invalidateShardedCollection(nss); } } }
/**
 * Adds 'dataWritten' to this chunk's running write counter and, once the counter reaches a
 * fraction of the collection's desired chunk size, attempts to auto-split the chunk. If the split
 * suggests a chunk to migrate and balancing is allowed, also asks the balancer to move it.
 *
 * Returns true only if a split was performed and any follow-up rebalance did not raise an error.
 * Returns false if no split was attempted, if the split failed, or if a DBException was caught
 * along the way (the exception is logged and swallowed).
 */
bool Chunk::splitIfShould(OperationContext* txn, long dataWritten) {
    // NOTE(review): presumably suppresses last-error recording for the duration of this call -
    // confirm against LastError::Disabled
    LastError::Disabled d(&LastError::get(cc()));

    try {
        _dataWritten += dataWritten;

        // Chunks touching either end of the key space use a slightly lower (90%) threshold
        uint64_t splitThreshold = _manager->getCurrentDesiredChunkSize();
        if (_minIsInf() || _maxIsInf()) {
            splitThreshold = static_cast<uint64_t>(static_cast<double>(splitThreshold) * 0.9);
        }

        if (_dataWritten < splitThreshold / ChunkManager::SplitHeuristics::splitTestFactor) {
            return false;
        }

        if (!_manager->_splitHeuristics._splitTickets.tryAcquire()) {
            LOG(1) << "won't auto split because not enough tickets: " << _manager->getns();
            return false;
        }

        // NOTE(review): presumably hands the ticket back when this scope exits - confirm
        TicketHolderReleaser releaser(&(_manager->_splitHeuristics._splitTickets));

        const auto balancerConfig = Grid::get(txn)->getBalancerConfiguration();

        Status refreshStatus = balancerConfig->refreshAndCheck(txn);
        if (!refreshStatus.isOK()) {
            warning() << "Unable to refresh balancer settings" << causedBy(refreshStatus);
            return false;
        }

        bool shouldAutoSplit = balancerConfig->getShouldAutoSplit();
        if (!shouldAutoSplit) {
            return false;
        }

        LOG(1) << "about to initiate autosplit: " << *this << " dataWritten: " << _dataWritten
               << " splitThreshold: " << splitThreshold;

        size_t splitCount = 0;
        auto splitStatus = split(txn, Chunk::autoSplitInternal, &splitCount);
        if (!splitStatus.isOK()) {
            // Split would have issued a message if we got here. This means there wasn't enough
            // data to split, so don't want to try again until considerable more data
            _dataWritten = 0;
            return false;
        }

        if (_maxIsInf() || _minIsInf()) {
            // we don't want to reset _dataWritten since we kind of want to check the other side
            // right away
        } else {
            // we're splitting, so should wait a bit
            _dataWritten = 0;
        }

        bool shouldBalance = balancerConfig->shouldBalanceForAutoSplit();

        if (shouldBalance) {
            // The per-collection setting can still veto balancing
            auto collStatus = grid.catalogClient(txn)->getCollection(txn, _manager->getns());
            if (!collStatus.isOK()) {
                warning() << "Auto-split for " << _manager->getns()
                          << " failed to load collection metadata"
                          << causedBy(collStatus.getStatus());
                return false;
            }

            shouldBalance = collStatus.getValue().value.getAllowBalance();
        }

        const auto suggestedMigrateChunk = std::move(splitStatus.getValue());

        // Only mention a migration when the split actually suggested a chunk to move. The
        // previous condition was inverted and printed the note when nothing was suggested.
        std::string migrateNote;
        if (suggestedMigrateChunk) {
            migrateNote = shouldBalance ? " (migrate suggested)"
                                        : " (migrate suggested, but no migrations allowed)";
        }

        log() << "autosplitted " << _manager->getns() << " shard: " << toString() << " into "
              << (splitCount + 1) << " (splitThreshold " << splitThreshold << ")" << migrateNote;

        // Top chunk optimization - try to move the top chunk out of this shard to prevent the hot
        // spot from staying on a single shard. This is based on the assumption that succeeding
        // inserts will fall on the top chunk.
        if (suggestedMigrateChunk && shouldBalance) {
            const NamespaceString nss(_manager->getns());

            // We need to use the latest chunk manager (after the split) in order to have the most
            // up-to-date view of the chunk we are about to move
            auto scopedCM = uassertStatusOK(ScopedChunkManager::getExisting(txn, nss));
            auto suggestedChunk =
                scopedCM.cm()->findIntersectingChunk(txn, suggestedMigrateChunk->getMin());

            ChunkType chunkToMove;
            chunkToMove.setNS(nss.ns());
            chunkToMove.setShard(suggestedChunk->getShardId());
            chunkToMove.setMin(suggestedChunk->getMin());
            chunkToMove.setMax(suggestedChunk->getMax());
            chunkToMove.setVersion(suggestedChunk->getLastmod());

            Status rebalanceStatus = Balancer::get(txn)->rebalanceSingleChunk(txn, chunkToMove);
            if (!rebalanceStatus.isOK()) {
                // NOTE(review): presumably throws a DBException handled by the catch below -
                // confirm
                msgassertedNoTraceWithStatus(10412, rebalanceStatus);
            }

            _manager->reload(txn);
        }

        return true;
    } catch (const DBException& e) {
        // TODO: Make this better - there are lots of reasons a split could fail
        // Random so that we don't sync up with other failed splits
        _dataWritten = mkDataWritten();

        // if the collection lock is taken (e.g. we're migrating), it is fine for the split to fail.
        warning() << "could not autosplit collection " << _manager->getns() << causedBy(e);
        return false;
    }
}
/**
 * Commits the donation of the chunk described by _args.
 *
 * Steps, in order:
 *  1. Asks the recipient (through _cloneDriver) to commit the clone.
 *  2. Builds the applyOps precondition and update documents for the donated chunk and, if this
 *     shard keeps other chunks, for one remaining chunk whose version gets bumped.
 *  3. Sends them to the config server through applyChunkOpsDeprecated.
 *  4. On success, installs the new metadata locally. On failure, refreshes the metadata to find
 *     out whether the commit took effect anyway.
 *
 * Must be called with no locks held and with _state == kCriticalSection (both are enforced by
 * invariants).
 *
 * Returns a non-OK status if the clone commit failed, or if applyOps failed and the refreshed
 * metadata shows that this shard still owns the chunk. Several unrecoverable situations are
 * reported through fassertStatusOK rather than through the return value.
 */
Status MigrationSourceManager::commitDonateChunk(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    invariant(_state == kCriticalSection);

    // NOTE(review): presumably runs cleanupOnError on every exit path until Dismiss() is called
    // near the end of this function - confirm against MakeGuard
    auto scopedGuard = MakeGuard([&] { cleanupOnError(txn); });

    // Tell the recipient shard to fetch the latest changes
    Status commitCloneStatus = _cloneDriver->commitClone(txn);

    // The fail point only overrides a successful commit; a real failure is reported unchanged
    if (MONGO_FAIL_POINT(failMigrationCommit) && commitCloneStatus.isOK()) {
        commitCloneStatus = {ErrorCodes::InternalError,
                             "Failing _recvChunkCommit due to failpoint."};
    }

    if (!commitCloneStatus.isOK()) {
        return {commitCloneStatus.code(),
                str::stream() << "commit clone failed due to " << commitCloneStatus.toString()};
    }

    // Generate the next collection version.
    ChunkVersion uncommittedCollVersion = _committedMetadata->getCollVersion();
    uncommittedCollVersion.incMajor();

    // applyOps preparation for reflecting the uncommitted metadata on the config server

    // Preconditions
    // The query sorts this collection's chunks by descending lastmod, and the expected result
    // carries the currently committed collection version.
    BSONArrayBuilder preCond;
    {
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
        b.append("q",
                 BSON("query" << BSON(ChunkType::ns(_args.getNss().ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
        {
            BSONObjBuilder bb(b.subobjStart("res"));

            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(),
                               _committedMetadata->getCollVersion().toLong());
            bb.done();
        }

        preCond.append(b.obj());
    }

    // Update for the chunk which is being donated
    // The chunk document is rewritten with the bumped major version and the TO shard as owner.
    BSONArrayBuilder updates;
    {
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        // "o" holds the new contents of the chunk document
        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        uncommittedCollVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), _args.getMinKey());
        n.append(ChunkType::max(), _args.getMaxKey());
        n.append(ChunkType::shard(), _args.getToShardId());
        n.done();

        // "o2" selects the document to update by its generated chunk id
        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), _args.getMinKey()));
        q.done();

        updates.append(op.obj());
    }

    // Update for the chunk being moved

    // Version at which the next highest lastmod will be set. If the chunk being moved is the last
    // in the shard, nextVersion is that chunk's lastmod otherwise the highest version is from the
    // chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = uncommittedCollVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    if (_committedMetadata->getNumChunks() > 1) {
        ChunkType bumpChunk;
        invariant(_committedMetadata->getDifferentChunk(_args.getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();
        nextVersion.incMinor();

        // The bumped chunk must not be the chunk which is being donated
        dassert(bumpMin.woCompare(_args.getMinKey()) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        // Same document layout as the donated chunk's update, but the owner stays the FROM shard
        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _args.getNss().ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _args.getFromShardId());
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), ChunkType::genID(_args.getNss().ns(), bumpMin));
        q.done();

        updates.append(op.obj());

        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _args.getNss().ns() << "'";
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _args.getNss().ns() << "'";
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeCommitMigration);

    Status applyOpsStatus = grid.catalogClient(txn)->applyChunkOpsDeprecated(
        txn, updates.arr(), preCond.arr(), _args.getNss().ns(), nextVersion);

    // The fail point replaces whatever applyChunkOpsDeprecated returned, so the write may in fact
    // have been applied; the recovery branch below handles both outcomes
    if (MONGO_FAIL_POINT(failCommitMigrationCommand)) {
        applyOpsStatus = Status(ErrorCodes::InternalError,
                                "Failpoint 'failCommitMigrationCommand' generated error");
    }

    if (applyOpsStatus.isOK()) {
        // Now that applyOps succeeded and the new collection version is committed, update the
        // collection metadata to the new collection version and forget the migrated chunk.
        ScopedTransaction scopedXact(txn, MODE_IX);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IX, MODE_X);

        ChunkType migratingChunkToForget;
        migratingChunkToForget.setMin(_args.getMinKey());
        migratingChunkToForget.setMax(_args.getMaxKey());
        _committedMetadata =
            _committedMetadata->cloneMigrate(migratingChunkToForget, uncommittedCollVersion);
        auto css = CollectionShardingState::get(txn, _args.getNss().ns());
        css->setMetadata(_committedMetadata);
    } else {
        // This could be an unrelated error (e.g. network error). Check whether the metadata update
        // succeeded by refreshing the collection metadata from the config server and checking that
        // the original chunks no longer exist.

        warning() << "Migration metadata commit may have failed: refreshing metadata to check"
                  << causedBy(applyOpsStatus);

        // Need to get the latest optime in case the refresh request goes to a secondary --
        // otherwise the read won't wait for the write that applyChunkOpsDeprecated may have done.
        Status status = grid.catalogClient(txn)->logChange(
            txn,
            "moveChunk.validating",
            _args.getNss().ns(),
            BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey() << "from"
                       << _args.getFromShardId()
                       << "to"
                       << _args.getToShardId()));
        if (!status.isOK()) {
            fassertStatusOK(
                40137,
                {status.code(),
                 str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey() << ","
                               << _args.getMaxKey()
                               << ") due to "
                               << causedBy(applyOpsStatus)
                               << ", and updating the optime with a write before refreshing the "
                               << "metadata also failed: "
                               << causedBy(status)});
        }

        ShardingState* const shardingState = ShardingState::get(txn);
        ChunkVersion shardVersion;
        Status refreshStatus =
            shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);

        // The status handed to fassertStatusOK carries refreshStatus' code, so this only fires
        // when the refresh itself failed
        fassertStatusOK(34431,
                        {refreshStatus.code(),
                         str::stream() << "applyOps failed to commit chunk [" << _args.getMinKey()
                                       << ","
                                       << _args.getMaxKey()
                                       << ") due to "
                                       << causedBy(applyOpsStatus)
                                       << ", and refreshing collection metadata failed: "
                                       << causedBy(refreshStatus)});

        {
            ScopedTransaction scopedXact(txn, MODE_IS);
            AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

            auto css = CollectionShardingState::get(txn, _args.getNss());

            // NOTE(review): refreshedMetadata is dereferenced below without a null check - confirm
            // that getMetadata() cannot return null at this point
            std::shared_ptr<CollectionMetadata> refreshedMetadata = css->getMetadata();

            if (refreshedMetadata->keyBelongsToMe(_args.getMinKey())) {
                invariant(refreshedMetadata->getCollVersion() ==
                          _committedMetadata->getCollVersion());

                // After refresh, the collection metadata indicates that the donor shard still owns
                // the chunk, so no migration changes were written to the config server metadata.
                return {applyOpsStatus.code(),
                        str::stream() << "Migration was not committed, applyOps failed: "
                                      << causedBy(applyOpsStatus)};
            }

            ChunkVersion refreshedCollectionVersion = refreshedMetadata->getCollVersion();
            if (!refreshedCollectionVersion.equals(nextVersion)) {
                // The refreshed collection metadata's collection version does not match the control
                // chunk's updated collection version, which should now be the highest. The control
                // chunk was not committed, but the migrated chunk was. This state is not
                // recoverable.
                fassertStatusOK(40138,
                                {applyOpsStatus.code(),
                                 str::stream() << "Migration was partially committed, state is "
                                               << "unrecoverable. applyOps error: "
                                               << causedBy(applyOpsStatus)});
            }
        }
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLeavingCriticalSection);

    // The commit is known to have taken effect from here on, so switch from the error cleanup
    // path to the regular one
    scopedGuard.Dismiss();
    _cleanup(txn);

    // The status returned by this logChange call is not checked
    grid.catalogClient(txn)->logChange(txn,
                                       "moveChunk.commit",
                                       _args.getNss().ns(),
                                       BSON("min" << _args.getMinKey() << "max" << _args.getMaxKey()
                                                  << "from"
                                                  << _args.getFromShardId()
                                                  << "to"
                                                  << _args.getToShardId()));

    return Status::OK();
}
void MetadataManager::refreshActiveMetadata(std::unique_ptr<CollectionMetadata> remoteMetadata) { LOG(1) << "Refreshing the active metadata from " << (_activeMetadataTracker->metadata ? _activeMetadataTracker->metadata->toStringBasic() : "(empty)") << ", to " << (remoteMetadata ? remoteMetadata->toStringBasic() : "(empty)"); stdx::lock_guard<stdx::mutex> scopedLock(_managerLock); // Collection is not sharded anymore if (!remoteMetadata) { log() << "Marking collection as not sharded."; _receivingChunks.clear(); _rangesToClean.clear(); _setActiveMetadata_inlock(nullptr); return; } invariant(!remoteMetadata->getCollVersion().isWriteCompatibleWith(ChunkVersion::UNSHARDED())); invariant(!remoteMetadata->getShardVersion().isWriteCompatibleWith(ChunkVersion::UNSHARDED())); // Collection is not sharded currently if (!_activeMetadataTracker->metadata) { log() << "Marking collection as sharded with version " << remoteMetadata->toStringBasic(); invariant(_receivingChunks.empty()); invariant(_rangesToClean.empty()); _setActiveMetadata_inlock(std::move(remoteMetadata)); return; } // If the metadata being installed has a different epoch from ours, this means the collection // was dropped and recreated, so we must entirely reset the metadata state if (_activeMetadataTracker->metadata->getCollVersion().epoch() != remoteMetadata->getCollVersion().epoch()) { log() << "Overwriting collection metadata due to epoch change."; _receivingChunks.clear(); _rangesToClean.clear(); _setActiveMetadata_inlock(std::move(remoteMetadata)); return; } // We already have newer version if (_activeMetadataTracker->metadata->getCollVersion() >= remoteMetadata->getCollVersion()) { LOG(1) << "Attempted to refresh active metadata " << _activeMetadataTracker->metadata->toStringBasic() << " with an older version " << remoteMetadata->toStringBasic(); return; } // Resolve any receiving chunks, which might have completed by now for (auto it = _receivingChunks.begin(); it != _receivingChunks.end();) { const 
BSONObj min = it->first; const BSONObj max = it->second; // Our pending range overlaps at least one chunk if (rangeMapContains(remoteMetadata->getChunks(), min, max)) { // The remote metadata contains a chunk we were earlier in the process of receiving, so // we deem it successfully received. LOG(2) << "Verified chunk " << ChunkRange(min, max).toString() << " was migrated earlier to this shard"; _receivingChunks.erase(it++); continue; } else if (!rangeMapOverlaps(remoteMetadata->getChunks(), min, max)) { ++it; continue; } // Partial overlap indicates that the earlier migration has failed, but the chunk being // migrated underwent some splits and other migrations and ended up here again. In this // case, we will request full reload of the metadata. Currently this cannot happen, because // all migrations are with the explicit knowledge of the recipient shard. However, we leave // the option open so that chunk splits can do empty chunk move without having to notify the // recipient. RangeVector overlappedChunks; getRangeMapOverlap(remoteMetadata->getChunks(), min, max, &overlappedChunks); for (const auto& overlapChunkMin : overlappedChunks) { auto itRecv = _receivingChunks.find(overlapChunkMin.first); invariant(itRecv != _receivingChunks.end()); const ChunkRange receivingRange(itRecv->first, itRecv->second); _receivingChunks.erase(itRecv); // Make sure any potentially partially copied chunks are scheduled to be cleaned up _addRangeToClean_inlock(receivingRange); } // Need to reset the iterator it = _receivingChunks.begin(); } // For compatibility with the current range deleter, which is driven entirely by the contents of // the CollectionMetadata update the pending chunks for (const auto& receivingChunk : _receivingChunks) { ChunkType chunk; chunk.setMin(receivingChunk.first); chunk.setMax(receivingChunk.second); remoteMetadata = remoteMetadata->clonePlusPending(chunk); } _setActiveMetadata_inlock(std::move(remoteMetadata)); }
/**
 * Computes the migration candidates for the sharded collection 'nss'.
 *
 * Builds the distribution of chunks per shard from the collection's chunk manager, registers the
 * collection's tag ranges with that distribution and asks BalancerPolicy::balance for a migration.
 *
 * Returns a vector with at most one MigrateInfo (empty when the policy suggests nothing), or an
 * error status when:
 *  - the database cannot be loaded (the lookup's error code is propagated);
 *  - the collection is not sharded or has no chunks (NamespaceNotSharded);
 *  - the tags cannot be fetched (the fetch status is returned as-is);
 *  - a tag's lower bound does not coincide with a chunk boundary (IllegalOperation);
 *  - the tag ranges are rejected by the distribution (BadValue).
 */
StatusWith<MigrateInfoVector> BalancerChunkSelectionPolicyImpl::_getMigrateCandidatesForCollection(
    OperationContext* txn,
    const NamespaceString& nss,
    const ShardStatisticsVector& shardStats,
    bool aggressiveBalanceHint) {
    // Ensure the database exists
    auto dbStatus = Grid::get(txn)->catalogCache()->getDatabase(txn, nss.db().toString());
    if (!dbStatus.isOK()) {
        // Report the database name which was looked up, not the full collection namespace
        return {dbStatus.getStatus().code(),
                str::stream() << "Database " << nss.db() << " was not found due to "
                              << dbStatus.getStatus().toString()};
    }

    shared_ptr<DBConfig> db = dbStatus.getValue();
    invariant(db);

    // Ensure that the collection is sharded
    shared_ptr<ChunkManager> cm = db->getChunkManagerIfExists(txn, nss.ns(), true);
    if (!cm) {
        return {ErrorCodes::NamespaceNotSharded,
                str::stream() << "Collection " << nss.ns()
                              << " does not exist or is not sharded."};
    }

    if (cm->getChunkMap().empty()) {
        return {ErrorCodes::NamespaceNotSharded,
                str::stream() << "Collection " << nss.ns() << " does not have any chunks."};
    }

    // Group the chunks by owning shard and remember every chunk's lower bound, so that tag
    // boundaries can be validated against them below
    ShardToChunksMap shardToChunksMap;
    std::set<BSONObj> allChunkMinimums;

    for (const auto& entry : cm->getChunkMap()) {
        const auto& chunkEntry = entry.second;

        ChunkType chunk;
        chunk.setMin(chunkEntry->getMin());
        chunk.setMax(chunkEntry->getMax());
        chunk.setJumbo(chunkEntry->isJumbo());

        shardToChunksMap[chunkEntry->getShardId()].push_back(chunk);
        allChunkMinimums.insert(chunkEntry->getMin());
    }

    for (const auto& stat : shardStats) {
        // This loop just makes sure there is an entry in shardToChunksMap for every shard, which we
        // plan to consider.
        shardToChunksMap[stat.shardId];
    }

    DistributionStatus distStatus(shardStats, shardToChunksMap);
    {
        vector<TagsType> collectionTags;
        Status status =
            grid.catalogManager(txn)->getTagsForCollection(txn, nss.ns(), &collectionTags);
        if (!status.isOK()) {
            return status;
        }

        for (const auto& tagInfo : collectionTags) {
            // Extend the tag's lower bound with the shard key pattern before comparing it against
            // the chunk lower bounds
            BSONObj min = cm->getShardKeyPattern().getKeyPattern().extendRangeBound(
                tagInfo.getMinKey(), false);

            if (!allChunkMinimums.count(min)) {
                // This tag falls somewhere at the middle of a chunk. Therefore we must skip
                // balancing this collection until it is split at the next iteration.
                //
                // TODO: We should be able to just skip chunks, which straddle tags and still make
                // some progress balancing.
                return {ErrorCodes::IllegalOperation,
                        str::stream()
                            << "Tag boundaries "
                            << tagInfo.toString()
                            << " fall in the middle of an existing chunk. Balancing for collection "
                            << nss.ns()
                            << " will be postponed until the chunk is split appropriately."};
            }

            // TODO: TagRange contains all the information from TagsType except for the namespace,
            // so maybe the two can be merged at some point in order to avoid the transformation
            // below.
            if (!distStatus.addTagRange(TagRange(tagInfo.getMinKey().getOwned(),
                                                 tagInfo.getMaxKey().getOwned(),
                                                 tagInfo.getTag()))) {
                return {ErrorCodes::BadValue,
                        str::stream() << "Tag ranges are not valid for collection " << nss.ns()
                                      << ". Balancing for this collection will be skipped until "
                                         "the ranges are fixed."};
            }
        }
    }

    // The unique_ptr takes over the pointer returned by the policy; the MigrateInfo is copied into
    // the returned vector
    unique_ptr<MigrateInfo> migrateInfo(
        BalancerPolicy::balance(nss.ns(), distStatus, aggressiveBalanceHint));
    if (migrateInfo) {
        return MigrateInfoVector{*migrateInfo};
    }

    return MigrateInfoVector{};
}