void ChunkManager::createFirstChunks(OperationContext* txn,
                                     const ShardId& primaryShardId,
                                     const vector<BSONObj>* initPoints,
                                     const set<ShardId>* initShardIds) {
    // TODO distlock?
    // TODO: Race condition if we shard the collection and insert data while we split across
    // the non-primary shard.

    vector<BSONObj> splitPoints;
    vector<ShardId> shardIds;
    calcInitSplitsAndShards(txn, primaryShardId, initPoints, initShardIds, &splitPoints, &shardIds);

    // This is the first chunk; start the versioning from scratch
    ChunkVersion version;
    version.incEpoch();
    version.incMajor();

    log() << "going to create " << splitPoints.size() + 1 << " chunk(s) for: " << _ns
          << " using new epoch " << version.epoch();

    for (unsigned i = 0; i <= splitPoints.size(); i++) {
        BSONObj min = i == 0 ? _keyPattern.getKeyPattern().globalMin() : splitPoints[i - 1];
        BSONObj max =
            i < splitPoints.size() ? splitPoints[i] : _keyPattern.getKeyPattern().globalMax();

        Chunk temp(this, min, max, shardIds[i % shardIds.size()], version);

        BSONObjBuilder chunkBuilder;
        temp.serialize(chunkBuilder);

        BSONObj chunkObj = chunkBuilder.obj();

        Status result = grid.catalogManager(txn)->update(txn,
                                                         ChunkType::ConfigNS,
                                                         BSON(ChunkType::name(temp.genID())),
                                                         chunkObj,
                                                         true,
                                                         false,
                                                         NULL);

        version.incMinor();

        if (!result.isOK()) {
            string ss = str::stream()
                << "creating first chunks failed. result: " << result.reason();
            error() << ss;
            msgasserted(15903, ss);
        }
    }

    _version = ChunkVersion(0, 0, version.epoch());
}
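
// --- Illustrative sketch (not MongoDB source) of the chunk layout produced above:
// N split points yield N+1 chunks, [globalMin, p0), [p0, p1), ..., [pN-1, globalMax),
// assigned round-robin across the available shards. The types here are simplified
// stand-ins for BSONObj bounds and ShardId.
#include <string>
#include <vector>

struct ChunkSketch {
    std::string min, max, shard;
};

std::vector<ChunkSketch> layoutFirstChunks(const std::vector<std::string>& splitPoints,
                                           const std::vector<std::string>& shardIds) {
    std::vector<ChunkSketch> chunks;
    for (size_t i = 0; i <= splitPoints.size(); i++) {
        ChunkSketch c;
        c.min = (i == 0) ? "$minKey" : splitPoints[i - 1];
        c.max = (i < splitPoints.size()) ? splitPoints[i] : "$maxKey";
        c.shard = shardIds[i % shardIds.size()];  // round-robin placement
        chunks.push_back(c);
    }
    return chunks;
}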
void ChunkManagerTargeter::noteStaleResponse(const ShardEndpoint& endpoint,
                                             const StaleConfigInfo& staleInfo) {
    dassert(!_needsTargetingRefresh);

    ChunkVersion remoteShardVersion;
    if (!staleInfo.getVersionWanted()) {
        // If we don't have a vWanted sent, assume the version is higher than our current version.
        remoteShardVersion = getShardVersion(*_routingInfo, endpoint.shardName);
        remoteShardVersion.incMajor();
    } else {
        remoteShardVersion = *staleInfo.getVersionWanted();
    }

    ShardVersionMap::iterator it = _remoteShardVersions.find(endpoint.shardName);
    if (it == _remoteShardVersions.end()) {
        _remoteShardVersions.insert(std::make_pair(endpoint.shardName, remoteShardVersion));
    } else {
        ChunkVersion& previouslyNotedVersion = it->second;
        if (previouslyNotedVersion.epoch() == remoteShardVersion.epoch()) {
            if (previouslyNotedVersion.isOlderThan(remoteShardVersion)) {
                previouslyNotedVersion = remoteShardVersion;
            }
        } else {
            // Epoch changed midway while applying the batch, so set the version to something
            // unique and non-existent to force a reload when refreshIsNeeded is called.
            previouslyNotedVersion = ChunkVersion::IGNORED();
        }
    }
}
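
// --- Illustrative sketch (not MongoDB source) of the bookkeeping above: remember
// the highest version reported per shard; a report carrying a different epoch
// poisons the entry (a stand-in for ChunkVersion::IGNORED()) so the next targeting
// refresh does a full reload. VersionSketch is a simplified stand-in for ChunkVersion.
#include <map>
#include <string>

struct VersionSketch {
    unsigned major = 0, minor = 0;
    std::string epoch;
    bool isOlderThan(const VersionSketch& o) const {
        return major < o.major || (major == o.major && minor < o.minor);
    }
};

void noteVersionSketch(std::map<std::string, VersionSketch>& noted,
                       const std::string& shard,
                       const VersionSketch& reported) {
    auto it = noted.find(shard);
    if (it == noted.end()) {
        noted.emplace(shard, reported);
    } else if (it->second.epoch == reported.epoch) {
        if (it->second.isOlderThan(reported))
            it->second = reported;  // keep the newest report per shard
    } else {
        it->second = VersionSketch{};  // epoch changed: force a reload later
    }
}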
void MoveChunkRequest::appendAsCommand(BSONObjBuilder* builder,
                                       const NamespaceString& nss,
                                       ChunkVersion collectionVersion,
                                       const ConnectionString& configServerConnectionString,
                                       const ShardId& fromShardId,
                                       const ShardId& toShardId,
                                       const ChunkRange& range,
                                       ChunkVersion chunkVersion,
                                       int64_t maxChunkSizeBytes,
                                       const MigrationSecondaryThrottleOptions& secondaryThrottle,
                                       bool waitForDelete,
                                       bool takeDistLock) {
    invariant(builder->asTempObj().isEmpty());
    invariant(nss.isValid());

    builder->append(kMoveChunk, nss.ns());
    collectionVersion.appendForCommands(builder);
    builder->append(kEpoch, collectionVersion.epoch());
    builder->append(kConfigServerConnectionString, configServerConnectionString.toString());
    builder->append(kFromShardId, fromShardId.toString());
    builder->append(kToShardId, toShardId.toString());
    range.append(builder);
    chunkVersion.appendWithFieldForCommands(builder, kChunkVersion);
    builder->append(kMaxChunkSizeBytes, static_cast<long long>(maxChunkSizeBytes));
    secondaryThrottle.append(builder);
    builder->append(kWaitForDelete, waitForDelete);
    builder->append(kTakeDistLock, takeDistLock);
}
CollectionMetadata::CollectionMetadata(const BSONObj& keyPattern, ChunkVersion collectionVersion)
    : _collVersion(collectionVersion),
      _shardVersion(ChunkVersion(0, 0, collectionVersion.epoch())),
      _keyPattern(keyPattern.getOwned()),
      _pendingMap(SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<CachedChunkInfo>()),
      _chunksMap(SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<CachedChunkInfo>()),
      _rangesMap(SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<CachedChunkInfo>()) {}
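
// Usage note (hypothetical values): a shard that owns no chunks of a sharded
// collection starts at shard version 0|0 stamped with the collection's epoch, e.g.
//     CollectionMetadata md(BSON("x" << 1), ChunkVersion(3, 5, epoch));
//     // md.getShardVersion() is ChunkVersion(0, 0, epoch): same epoch, no chunks owned
// Keeping the epoch means version checks can still detect drop/recreate cycles
// even against shards that hold no data.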
void MoveChunkRequest::appendAsCommand(BSONObjBuilder* builder,
                                       const NamespaceString& nss,
                                       ChunkVersion chunkVersion,
                                       const ConnectionString& configServerConnectionString,
                                       const ShardId& fromShardId,
                                       const ShardId& toShardId,
                                       const ChunkRange& range,
                                       int64_t maxChunkSizeBytes,
                                       const MigrationSecondaryThrottleOptions& secondaryThrottle,
                                       bool waitForDelete) {
    invariant(builder->asTempObj().isEmpty());
    invariant(nss.isValid());

    builder->append(kMoveChunk, nss.ns());
    chunkVersion.appendToCommand(builder);
    // 3.4 shard compatibility
    builder->append(kEpoch, chunkVersion.epoch());
    // config connection string is included for 3.4 shard compatibility
    builder->append(kConfigServerConnectionString, configServerConnectionString.toString());
    builder->append(kFromShardId, fromShardId.toString());
    builder->append(kToShardId, toShardId.toString());
    range.append(builder);
    builder->append(kMaxChunkSizeBytes, static_cast<long long>(maxChunkSizeBytes));
    secondaryThrottle.append(builder);
    builder->append(kWaitForDelete, waitForDelete);
    builder->append(kTakeDistLock, false);
}
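
// Illustrative shape of the command document this builder produces. The exact
// field names come from the k* constants above and are not spelled out in this
// snippet; the values shown are hypothetical:
//     {
//         moveChunk: "db.coll",
//         <chunk version fields from appendToCommand(...)>,
//         epoch: ObjectId("..."),              // 3.4 shard compatibility
//         configdb: "cfgRS/cfg1:27019,...",    // 3.4 shard compatibility
//         fromShard: "shard0000",
//         toShard: "shard0001",
//         min: {x: 0}, max: {x: 100},
//         maxChunkSizeBytes: NumberLong(67108864),
//         secondaryThrottle: <options>,
//         waitForDelete: false,
//         takeDistLock: false                  // always false on this code path
//     }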
std::unique_ptr<CollectionMetadata> CollectionMetadata::cloneMigrate(
    const ChunkType& chunk, const ChunkVersion& newCollectionVersion) const {
    invariant(newCollectionVersion.epoch() == _collVersion.epoch());
    invariant(newCollectionVersion > _collVersion);
    invariant(rangeMapContains(_chunksMap, chunk.getMin(), chunk.getMax()));

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->fillKeyPatternFields();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_chunksMap.erase(chunk.getMin());

    metadata->_shardVersion =
        (metadata->_chunksMap.empty() ? ChunkVersion(0, 0, newCollectionVersion.epoch())
                                      : newCollectionVersion);
    metadata->_collVersion = newCollectionVersion;
    metadata->fillRanges();

    invariant(metadata->isValid());
    return metadata;
}
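
// Sketch of the donate-a-chunk rule above, using the VersionSketch stand-in
// defined in an earlier sketch (illustrative, not MongoDB source): after
// migrating a chunk away, a shard left owning nothing drops to shard version
// 0|0 in the same epoch, while the collection version still advances.
VersionSketch shardVersionAfterDonation(bool ownsChunksAfterDonation,
                                        const VersionSketch& newCollVersion) {
    if (!ownsChunksAfterDonation) {
        VersionSketch zero;                 // major/minor stay 0:
        zero.epoch = newCollVersion.epoch;  // "I own nothing, but I am current"
        return zero;
    }
    return newCollVersion;  // otherwise the donor advances to the bumped version
}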
Status onShardVersionMismatch(OperationContext* opCtx,
                              const NamespaceString& nss,
                              ChunkVersion shardVersionReceived,
                              bool forceRefreshFromThisThread) noexcept {
    invariant(!opCtx->lockState()->isLocked());
    invariant(!opCtx->getClient()->isInDirectClient());

    auto const shardingState = ShardingState::get(opCtx);
    invariant(shardingState->canAcceptShardedCommands());

    LOG(2) << "Metadata refresh requested for " << nss.ns() << " at shard version "
           << shardVersionReceived;

    ShardingStatistics::get(opCtx).countStaleConfigErrors.addAndFetch(1);

    // Ensure any ongoing migrations have completed before trying to do the refresh. This wait is
    // just an optimization so that MongoS does not exhaust its maximum number of StaleConfig retry
    // attempts while the migration is being committed.
    try {
        auto& oss = OperationShardingState::get(opCtx);
        oss.waitForMigrationCriticalSectionSignal(opCtx);
    } catch (const DBException& ex) {
        return ex.toStatus();
    }

    const auto currentShardVersion = [&] {
        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
        const auto currentMetadata = CollectionShardingState::get(opCtx, nss)->getMetadata(opCtx);
        if (currentMetadata) {
            return currentMetadata->getShardVersion();
        }
        return ChunkVersion::UNSHARDED();
    }();

    if (currentShardVersion.epoch() == shardVersionReceived.epoch() &&
        currentShardVersion.majorVersion() >= shardVersionReceived.majorVersion()) {
        // Don't need to remotely reload if we're in the same epoch and the requested version is
        // smaller than the one we know about. This means that the remote side is behind.
        return Status::OK();
    }

    try {
        forceShardFilteringMetadataRefresh(opCtx, nss, forceRefreshFromThisThread);
        return Status::OK();
    } catch (const DBException& ex) {
        log() << "Failed to refresh metadata for collection " << nss << causedBy(redact(ex));
        return ex.toStatus();
    }
}
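
// The fast path above in isolation, using the VersionSketch stand-in from the
// earlier sketch (illustrative): a remote refresh is skipped only when the
// epochs match and the locally-known major version is at least the one the
// client sent, i.e. the remote side is the stale party.
bool remoteSideIsBehind(const VersionSketch& current, const VersionSketch& received) {
    return current.epoch == received.epoch && current.major >= received.major;
}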
unique_ptr<CollectionMetadata> CollectionMetadata::clonePlusChunk(
    const BSONObj& minKey, const BSONObj& maxKey, const ChunkVersion& newShardVersion) const {
    invariant(newShardVersion.epoch() == _shardVersion.epoch());
    invariant(newShardVersion.isSet());
    invariant(minKey.woCompare(maxKey) < 0);
    invariant(!rangeMapOverlaps(_chunksMap, minKey, maxKey));

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->fillKeyPatternFields();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_chunksMap.insert(make_pair(minKey.getOwned(), maxKey.getOwned()));
    metadata->_shardVersion = newShardVersion;
    metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : _collVersion;
    metadata->fillRanges();

    invariant(metadata->isValid());
    return metadata;
}
/**
 * Stores ranges for a particular collection and shard starting from some version
 */
void storeCollectionRanges( const NamespaceString& nss,
                            const string& shardName,
                            const vector<KeyRange>& ranges,
                            const ChunkVersion& startVersion ) {
    // Get key pattern from first range
    ASSERT_GREATER_THAN( ranges.size(), 0u );

    CollectionType coll;
    coll.setNS( nss.ns() );
    coll.setKeyPattern( ranges.begin()->keyPattern );
    coll.setEpoch( startVersion.epoch() );
    coll.setUpdatedAt( 1ULL );

    string errMsg;
    ASSERT( coll.isValid( &errMsg ) );

    DBDirectClient client(&_txn);
    client.update( CollectionType::ConfigNS,
                   BSON( CollectionType::ns( coll.getNS() ) ),
                   coll.toBSON(),
                   true,
                   false );

    ChunkVersion nextVersion = startVersion;
    for ( vector<KeyRange>::const_iterator it = ranges.begin(); it != ranges.end(); ++it ) {
        ChunkType chunk;
        // TODO: We should not rely on the serialized ns, minkey being unique in the future,
        // causes problems since it links string serialization to correctness.
        chunk.setName( Chunk::genID( nss, it->minKey ) );
        chunk.setShard( shardName );
        chunk.setNS( nss.ns() );
        chunk.setVersion( nextVersion );
        chunk.setMin( it->minKey );
        chunk.setMax( it->maxKey );
        nextVersion.incMajor();

        client.insert( ChunkType::ConfigNS, chunk.toBSON() );
    }
}
Status ShardingState::refreshMetadataIfNeeded(OperationContext* txn,
                                              const string& ns,
                                              const ChunkVersion& reqShardVersion,
                                              ChunkVersion* latestShardVersion) {
    // The _configServerTickets serializes this process such that only a small number of threads
    // can try to refresh at the same time.

    LOG(2) << "metadata refresh requested for " << ns << " at shard version " << reqShardVersion;

    //
    // Queuing of refresh requests starts here when remote reload is needed. This may take time.
    // TODO: Explicitly expose the queuing discipline.
    //

    _configServerTickets.waitForTicket();
    TicketHolderReleaser needTicketFrom(&_configServerTickets);

    //
    // Fast path - check if the requested version is at a higher version than the current
    // metadata version or a different epoch before verifying against config server.
    //

    shared_ptr<CollectionMetadata> storedMetadata;
    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);
        CollectionMetadataMap::iterator it = _collMetadata.find(ns);
        if (it != _collMetadata.end())
            storedMetadata = it->second;
    }

    ChunkVersion storedShardVersion;
    if (storedMetadata)
        storedShardVersion = storedMetadata->getShardVersion();
    *latestShardVersion = storedShardVersion;

    if (storedShardVersion >= reqShardVersion &&
        storedShardVersion.epoch() == reqShardVersion.epoch()) {
        // Don't need to remotely reload if we're in the same epoch with a >= version
        return Status::OK();
    }

    //
    // Slow path - remotely reload
    //
    // Cases:
    // A) Initial config load and/or secondary take-over.
    // B) Migration TO this shard finished, notified by mongos.
    // C) Dropping a collection, notified (currently) by mongos.
    // D) Stale client wants to reload metadata with a different *epoch*, so we aren't sure.

    if (storedShardVersion.epoch() != reqShardVersion.epoch()) {
        // Need to remotely reload if our epochs aren't the same, to verify
        LOG(1) << "metadata change requested for " << ns << ", from shard version "
               << storedShardVersion << " to " << reqShardVersion
               << ", need to verify with config server";
    } else {
        // Need to remotely reload since our epochs are the same but the requested version
        // is greater than the one we have stored
        LOG(1) << "metadata version update requested for " << ns << ", from shard version "
               << storedShardVersion << " to " << reqShardVersion
               << ", need to verify with config server";
    }

    return doRefreshMetadata(txn, ns, reqShardVersion, true, latestShardVersion);
}
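
// Sketch of the ticket-based throttling used above: a fixed pool of tickets
// bounds how many threads may contact the config server at once; everyone else
// queues on waitForTicket(). A minimal stand-in with a C++20 counting semaphore
// (the real TicketHolder predates std::counting_semaphore; the pool size of 3
// is a made-up value):
#include <semaphore>

std::counting_semaphore<3> configRefreshTickets(3);  // at most 3 refreshes in flight

void throttledRefresh() {
    configRefreshTickets.acquire();  // waitForTicket()
    // ... remotely reload metadata ...
    configRefreshTickets.release();  // TicketHolderReleaser does this on scope exit
}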
bool mergeChunks( OperationContext* txn,
                  const NamespaceString& nss,
                  const BSONObj& minKey,
                  const BSONObj& maxKey,
                  const OID& epoch,
                  string* errMsg ) {
    //
    // Get sharding state up-to-date
    //

    ConnectionString configLoc = ConnectionString::parse( shardingState.getConfigServer(),
                                                          *errMsg );
    if ( !configLoc.isValid() ){
        warning() << *errMsg << endl;
        return false;
    }

    //
    // Get the distributed lock
    //

    ScopedDistributedLock collLock( configLoc, nss.ns() );
    collLock.setLockMessage( stream() << "merging chunks in " << nss.ns()
                                      << " from " << minKey << " to " << maxKey );

    Status acquisitionStatus = collLock.tryAcquire();
    if (!acquisitionStatus.isOK()) {
        *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                           << " to merge chunks in [" << minKey << "," << maxKey << ")"
                           << causedBy(acquisitionStatus);

        warning() << *errMsg << endl;
        return false;
    }

    //
    // We now have the collection lock, refresh metadata to latest version and sanity check
    //

    ChunkVersion shardVersion;
    Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion);

    if ( !status.isOK() ) {
        *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                << nss.ns() << causedBy( status.reason() );

        warning() << *errMsg << endl;
        return false;
    }

    if ( epoch.isSet() && shardVersion.epoch() != epoch ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " has changed" << " since merge was sent"
                           << "(sent epoch : " << epoch.toString()
                           << ", current epoch : " << shardVersion.epoch().toString() << ")";

        warning() << *errMsg << endl;
        return false;
    }

    CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( nss.ns() );

    if ( !metadata || metadata->getKeyPattern().isEmpty() ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " is not sharded";

        warning() << *errMsg << endl;
        return false;
    }

    dassert( metadata->getShardVersion().equals( shardVersion ) );

    if ( !metadata->isValidKey( minKey ) || !metadata->isValidKey( maxKey ) ) {
        *errMsg = stream() << "could not merge chunks, the range "
                           << rangeToString( minKey, maxKey ) << " is not valid"
                           << " for collection " << nss.ns()
                           << " with key pattern " << metadata->getKeyPattern();

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Get merged chunk information
    //

    ChunkVersion mergeVersion = metadata->getCollVersion();
    mergeVersion.incMinor();

    OwnedPointerVector<ChunkType> chunksToMerge;

    ChunkType itChunk;
    itChunk.setMin( minKey );
    itChunk.setMax( minKey );
    itChunk.setNS( nss.ns() );
    itChunk.setShard( shardingState.getShardName() );

    while ( itChunk.getMax().woCompare( maxKey ) < 0 &&
            metadata->getNextChunk( itChunk.getMax(), &itChunk ) ) {
        auto_ptr<ChunkType> saved( new ChunkType );
        itChunk.cloneTo( saved.get() );
        chunksToMerge.mutableVector().push_back( saved.release() );
    }

    if ( chunksToMerge.empty() ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey
                           << " and ending at " << maxKey
                           << " does not belong to shard " << shardingState.getShardName();

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Validate the range starts and ends at chunks and has no holes, error if not valid
    //

    BSONObj firstDocMin = ( *chunksToMerge.begin() )->getMin();
    BSONObj firstDocMax = ( *chunksToMerge.begin() )->getMax();
    // minKey is inclusive
    bool minKeyInRange = rangeContains( firstDocMin, firstDocMax, minKey );

    if ( !minKeyInRange ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey
                           << " does not belong to shard " << shardingState.getShardName();

        warning() << *errMsg << endl;
        return false;
    }

    BSONObj lastDocMin = ( *chunksToMerge.rbegin() )->getMin();
    BSONObj lastDocMax = ( *chunksToMerge.rbegin() )->getMax();
    // maxKey is exclusive
    bool maxKeyInRange = lastDocMin.woCompare( maxKey ) < 0 &&
                         lastDocMax.woCompare( maxKey ) >= 0;

    if ( !maxKeyInRange ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range ending at " << maxKey
                           << " does not belong to shard " << shardingState.getShardName();

        warning() << *errMsg << endl;
        return false;
    }

    bool validRangeStartKey = firstDocMin.woCompare( minKey ) == 0;
    bool validRangeEndKey = lastDocMax.woCompare( maxKey ) == 0;

    if ( !validRangeStartKey || !validRangeEndKey ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " does not contain a chunk "
                           << ( !validRangeStartKey ? "starting at " + minKey.toString() : "" )
                           << ( !validRangeStartKey && !validRangeEndKey ? " or " : "" )
                           << ( !validRangeEndKey ? "ending at " + maxKey.toString() : "" );

        warning() << *errMsg << endl;
        return false;
    }

    if ( chunksToMerge.size() == 1 ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " already contains chunk for " << rangeToString( minKey, maxKey );

        warning() << *errMsg << endl;
        return false;
    }

    bool holeInRange = false;

    // Look for hole in range
    ChunkType* prevChunk = *chunksToMerge.begin();
    ChunkType* nextChunk = NULL;
    for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
          it != chunksToMerge.end(); ++it ) {
        if ( it == chunksToMerge.begin() ) continue;

        nextChunk = *it;
        if ( prevChunk->getMax().woCompare( nextChunk->getMin() ) != 0 ) {
            holeInRange = true;
            break;
        }
        prevChunk = nextChunk;
    }

    if ( holeInRange ) {
        dassert( NULL != nextChunk );
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " has a hole in the range " << rangeToString( minKey, maxKey )
                           << " at " << rangeToString( prevChunk->getMax(),
                                                       nextChunk->getMin() );

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Run apply ops command
    //

    BSONObj applyOpsCmd = buildApplyOpsCmd( chunksToMerge, shardVersion, mergeVersion );

    bool ok;
    BSONObj result;
    try {
        ScopedDbConnection conn( configLoc, 30.0 );
        ok = conn->runCommand( "config", applyOpsCmd, result );
        if ( !ok ) *errMsg = result.toString();
        conn.done();
    }
    catch( const DBException& ex ) {
        ok = false;
        *errMsg = ex.toString();
    }

    if ( !ok ) {
        *errMsg = stream() << "could not merge chunks for " << nss.ns()
                           << ", writing to config failed" << causedBy( errMsg );

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Install merged chunk metadata
    //

    {
        Lock::DBLock writeLk(txn->lockState(), nss.db(), newlm::MODE_X);
        shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
    }

    //
    // Log change
    //

    BSONObj mergeLogEntry = buildMergeLogEntry( chunksToMerge, shardVersion, mergeVersion );

    configServer.logChange( "merge", nss.ns(), mergeLogEntry );

    return true;
}
StatusWith<std::unique_ptr<CollectionMetadata>> CollectionMetadata::cloneMerge(
    const BSONObj& minKey, const BSONObj& maxKey, const ChunkVersion& newShardVersion) const {
    invariant(newShardVersion.epoch() == _shardVersion.epoch());
    invariant(newShardVersion > _shardVersion);

    RangeVector overlap;
    getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap);

    if (overlap.empty() || overlap.size() == 1) {
        return {ErrorCodes::IllegalOperation,
                stream() << "cannot merge range " << rangeToString(minKey, maxKey)
                         << (overlap.empty() ? ", no chunks found in this range"
                                             : ", only one chunk found in this range")};
    }

    bool validStartEnd = true;
    bool validNoHoles = true;

    if (overlap.begin()->first.woCompare(minKey) != 0) {
        // First chunk doesn't start with minKey
        validStartEnd = false;
    } else if (overlap.rbegin()->second.woCompare(maxKey) != 0) {
        // Last chunk doesn't end with maxKey
        validStartEnd = false;
    } else {
        // Check that there are no holes
        BSONObj prevMaxKey = minKey;
        for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) {
            if (it->first.woCompare(prevMaxKey) != 0) {
                validNoHoles = false;
                break;
            }
            prevMaxKey = it->second;
        }
    }

    if (!validStartEnd || !validNoHoles) {
        return {ErrorCodes::IllegalOperation,
                stream() << "cannot merge range " << rangeToString(minKey, maxKey)
                         << ", overlapping chunks " << overlapToString(overlap)
                         << (!validStartEnd ? " do not have the same min and max key"
                                            : " are not all adjacent")};
    }

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->fillKeyPatternFields();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_rangesMap = _rangesMap;
    metadata->_shardVersion = newShardVersion;
    metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion;

    for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) {
        metadata->_chunksMap.erase(it->first);
    }

    metadata->_chunksMap.insert(make_pair(minKey, maxKey));

    invariant(metadata->isValid());
    return std::move(metadata);
}
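
// Worked example of the merge bookkeeping above (hypothetical versions).
// Merging [0, 100) over adjacent chunks {0 -> 25, 25 -> 60, 60 -> 100} with
// newShardVersion = 3|0:
//     chunks after: {0 -> 100}, shard version 3|0
// The overlapping entries are erased and replaced with a single [minKey, maxKey)
// entry; the collection version advances only if newShardVersion exceeds it.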
StatusWith<std::unique_ptr<CollectionMetadata>> CollectionMetadata::cloneSplit(
    const BSONObj& minKey,
    const BSONObj& maxKey,
    const std::vector<BSONObj>& splitKeys,
    const ChunkVersion& newShardVersion) const {
    invariant(newShardVersion.epoch() == _shardVersion.epoch());
    invariant(newShardVersion > _shardVersion);

    // The version required in both resulting chunks could be simply an increment in the
    // minor portion of the current version. However, we are enforcing uniqueness over the
    // attributes <ns, version> of the configdb collection 'chunks'. So in practice, a
    // migrate somewhere may force this split to pick up a version that has the major
    // portion higher than the one that this shard has been using.
    //
    // TODO drop the uniqueness constraint and tighten the check below so that only the
    // minor portion of version changes

    // Check that we have the exact chunk that will be subtracted.
    if (!rangeMapContains(_chunksMap, minKey, maxKey)) {
        stream errMsg;
        errMsg << "cannot split chunk " << rangeToString(minKey, maxKey)
               << ", this shard does not contain the chunk";

        if (rangeMapOverlaps(_chunksMap, minKey, maxKey)) {
            RangeVector overlap;
            getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap);

            errMsg << " and it overlaps " << overlapToString(overlap);
        }

        return {ErrorCodes::IllegalOperation, errMsg};
    }

    unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>());
    metadata->_keyPattern = _keyPattern.getOwned();
    metadata->fillKeyPatternFields();
    metadata->_pendingMap = _pendingMap;
    metadata->_chunksMap = _chunksMap;
    metadata->_shardVersion = newShardVersion;  // will increment 2nd, 3rd,... chunks below

    BSONObj startKey = minKey;
    for (const auto& split : splitKeys) {
        // Check that the split key is valid
        if (!rangeContains(minKey, maxKey, split)) {
            return {ErrorCodes::IllegalOperation,
                    stream() << "cannot split chunk " << rangeToString(minKey, maxKey)
                             << " at key " << split};
        }

        // Check that the split keys are in order
        if (split.woCompare(startKey) <= 0) {
            // The split keys came in out of order, this probably indicates a bug, so fail the
            // operation. Re-iterate splitKeys to build a useful error message including the array
            // of splitKeys in the order received.
            str::stream errMsg;
            errMsg << "Invalid input to splitChunk, split keys must be in order, got: [";
            for (auto it2 = splitKeys.cbegin(); it2 != splitKeys.cend(); ++it2) {
                if (it2 != splitKeys.begin()) {
                    errMsg << ", ";
                }
                errMsg << it2->toString();
            }
            errMsg << "]";
            return {ErrorCodes::IllegalOperation, errMsg};
        }

        metadata->_chunksMap[startKey] = split.getOwned();
        metadata->_chunksMap.insert(make_pair(split.getOwned(), maxKey.getOwned()));
        metadata->_shardVersion.incMinor();
        startKey = split;
    }

    metadata->_collVersion =
        metadata->_shardVersion > _collVersion ? metadata->_shardVersion : _collVersion;

    metadata->fillRanges();

    invariant(metadata->isValid());
    return std::move(metadata);
}
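
// Worked example of the split bookkeeping above (hypothetical versions). With
// newShardVersion = 2|1 and split keys {25, 60} applied to chunk [0, 100):
//     chunks before: {0 -> 100}
//     chunks after : {0 -> 25, 25 -> 60, 60 -> 100}
//     shard version: 2|1 on entry, bumped once per split key, 2|3 on exit
// Each split key truncates the previous chunk's max in place and then inserts a
// new [key, maxKey) entry, so the map stays consistent at every step.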
StatusWith<boost::optional<ChunkRange>> splitChunkAtMultiplePoints(
    OperationContext* opCtx,
    const ShardId& shardId,
    const NamespaceString& nss,
    const ShardKeyPattern& shardKeyPattern,
    ChunkVersion collectionVersion,
    const ChunkRange& chunkRange,
    const std::vector<BSONObj>& splitPoints) {
    invariant(!splitPoints.empty());

    const size_t kMaxSplitPoints = 8192;

    if (splitPoints.size() > kMaxSplitPoints) {
        return {ErrorCodes::BadValue,
                str::stream() << "Cannot split chunk in more than " << kMaxSplitPoints
                              << " parts at a time."};
    }

    // Sanity check that we are not attempting to split at the boundaries of the chunk. This check
    // is already performed at chunk split commit time, but we are performing it here for parity
    // with old auto-split code, which might rely on it.
    if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMin() == splitPoints.front())) {
        const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString()
                                            << ", split point " << splitPoints.front()
                                            << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, msg};
    }

    if (SimpleBSONObjComparator::kInstance.evaluate(chunkRange.getMax() == splitPoints.back())) {
        const std::string msg(str::stream() << "not splitting chunk " << chunkRange.toString()
                                            << ", split point " << splitPoints.back()
                                            << " is exactly on chunk bounds");
        return {ErrorCodes::CannotSplit, msg};
    }

    BSONObjBuilder cmd;
    cmd.append("splitChunk", nss.ns());
    cmd.append("from", shardId.toString());
    cmd.append("keyPattern", shardKeyPattern.toBSON());
    cmd.append("epoch", collectionVersion.epoch());
    collectionVersion.appendForCommands(&cmd);  // backwards compatibility with v3.4
    chunkRange.append(&cmd);
    cmd.append("splitKeys", splitPoints);

    BSONObj cmdObj = cmd.obj();

    Status status{ErrorCodes::InternalError, "Uninitialized value"};
    BSONObj cmdResponse;

    auto shardStatus = Grid::get(opCtx)->shardRegistry()->getShard(opCtx, shardId);
    if (!shardStatus.isOK()) {
        status = shardStatus.getStatus();
    } else {
        auto cmdStatus = shardStatus.getValue()->runCommandWithFixedRetryAttempts(
            opCtx,
            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
            "admin",
            cmdObj,
            Shard::RetryPolicy::kNotIdempotent);
        if (!cmdStatus.isOK()) {
            status = std::move(cmdStatus.getStatus());
        } else {
            status = std::move(cmdStatus.getValue().commandStatus);
            cmdResponse = std::move(cmdStatus.getValue().response);
        }
    }

    if (!status.isOK()) {
        log() << "Split chunk " << redact(cmdObj) << " failed" << causedBy(redact(status));
        return {status.code(), str::stream() << "split failed due to " << status.toString()};
    }

    BSONElement shouldMigrateElement;
    status = bsonExtractTypedField(cmdResponse, kShouldMigrate, Object, &shouldMigrateElement);
    if (status.isOK()) {
        auto chunkRangeStatus = ChunkRange::fromBSON(shouldMigrateElement.embeddedObject());
        if (!chunkRangeStatus.isOK()) {
            return chunkRangeStatus.getStatus();
        }

        return boost::optional<ChunkRange>(std::move(chunkRangeStatus.getValue()));
    } else if (status != ErrorCodes::NoSuchKey) {
        warning()
            << "Chunk migration will be skipped because splitChunk returned invalid response: "
            << redact(cmdResponse) << ". Extracting " << kShouldMigrate << " field failed"
            << causedBy(redact(status));
    }

    return boost::optional<ChunkRange>();
}
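
// Illustrative shape of the splitChunk command assembled above (values are
// hypothetical; the min/max fields come from chunkRange.append and the shard
// version fields from appendForCommands):
//     {
//         splitChunk: "db.coll",
//         from: "shard0000",
//         keyPattern: {x: 1},
//         epoch: ObjectId("..."),             // 3.4 backwards compatibility
//         <shard version fields>,
//         min: {x: 0}, max: {x: 100},
//         splitKeys: [{x: 25}, {x: 60}]
//     }
// On success the recipient may include a "shouldMigrate" sub-document naming a
// range worth rebalancing, which this helper surfaces to the caller.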
Status ShardingState::doRefreshMetadata(OperationContext* txn,
                                        const string& ns,
                                        const ChunkVersion& reqShardVersion,
                                        bool useRequestedVersion,
                                        ChunkVersion* latestShardVersion) {
    // The idea here is that we're going to reload the metadata from the config server, but
    // we need to do so outside any locks. When we get our result back, if the current metadata
    // has changed, we may not be able to install the new metadata.

    //
    // Get the initial metadata
    // No DBLock is needed since the metadata is expected to change during reload.
    //

    shared_ptr<CollectionMetadata> beforeMetadata;

    {
        stdx::lock_guard<stdx::mutex> lk(_mutex);

        // We can't reload if sharding is not enabled - i.e. without a config server location
        if (!_enabled) {
            string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                          << " before sharding has been enabled";

            warning() << errMsg;
            return Status(ErrorCodes::NotYetInitialized, errMsg);
        }

        // We also can't reload if a shard name has not yet been set.
        if (_shardName.empty()) {
            string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                          << " before shard name has been set";

            warning() << errMsg;
            return Status(ErrorCodes::NotYetInitialized, errMsg);
        }

        CollectionMetadataMap::iterator it = _collMetadata.find(ns);
        if (it != _collMetadata.end()) {
            beforeMetadata = it->second;
        }
    }

    ChunkVersion beforeShardVersion;
    ChunkVersion beforeCollVersion;

    if (beforeMetadata) {
        beforeShardVersion = beforeMetadata->getShardVersion();
        beforeCollVersion = beforeMetadata->getCollVersion();
    }

    *latestShardVersion = beforeShardVersion;

    //
    // Determine whether we need to diff or fully reload
    //

    bool fullReload = false;
    if (!beforeMetadata) {
        // We don't have any metadata to reload from
        fullReload = true;
    } else if (useRequestedVersion && reqShardVersion.epoch() != beforeShardVersion.epoch()) {
        // It's not useful to use the metadata as a base because we think the epoch will differ
        fullReload = true;
    }

    //
    // Load the metadata from the remote server, start construction
    //

    LOG(0) << "remotely refreshing metadata for " << ns
           << (useRequestedVersion
                   ? string(" with requested shard version ") + reqShardVersion.toString()
                   : "")
           << (fullReload ? ", current shard version is " : " based on current shard version ")
           << beforeShardVersion << ", current metadata version is " << beforeCollVersion;

    string errMsg;

    MetadataLoader mdLoader;
    CollectionMetadata* remoteMetadataRaw = new CollectionMetadata();
    shared_ptr<CollectionMetadata> remoteMetadata(remoteMetadataRaw);

    Timer refreshTimer;
    Status status = mdLoader.makeCollectionMetadata(grid.catalogManager(),
                                                    ns,
                                                    getShardName(),
                                                    fullReload ? NULL : beforeMetadata.get(),
                                                    remoteMetadataRaw);
    long long refreshMillis = refreshTimer.millis();

    if (status.code() == ErrorCodes::NamespaceNotFound) {
        remoteMetadata.reset();
        remoteMetadataRaw = NULL;
    } else if (!status.isOK()) {
        warning() << "could not remotely refresh metadata for " << ns
                  << causedBy(status.reason());

        return status;
    }

    ChunkVersion remoteShardVersion;
    ChunkVersion remoteCollVersion;
    if (remoteMetadata) {
        remoteShardVersion = remoteMetadata->getShardVersion();
        remoteCollVersion = remoteMetadata->getCollVersion();
    }

    //
    // Get ready to install loaded metadata if needed
    //

    shared_ptr<CollectionMetadata> afterMetadata;
    ChunkVersion afterShardVersion;
    ChunkVersion afterCollVersion;
    ChunkVersion::VersionChoice choice;

    // If we choose to install the new metadata, this describes the kind of install
    enum InstallType {
        InstallType_New,
        InstallType_Update,
        InstallType_Replace,
        InstallType_Drop,
        InstallType_None
    } installType = InstallType_None;  // compiler complains otherwise

    {
        // Exclusive collection lock needed since we're now potentially changing the metadata,
        // and don't want reads/writes to be ongoing.
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock dbLock(txn->lockState(), nsToDatabaseSubstring(ns), MODE_IX);
        Lock::CollectionLock collLock(txn->lockState(), ns, MODE_X);

        //
        // Get the metadata now that the load has completed
        //

        stdx::lock_guard<stdx::mutex> lk(_mutex);

        // Don't reload if our config server has changed or sharding is no longer enabled
        if (!_enabled) {
            string errMsg = str::stream() << "could not refresh metadata for " << ns
                                          << ", sharding is no longer enabled";

            warning() << errMsg;
            return Status(ErrorCodes::NotYetInitialized, errMsg);
        }

        CollectionMetadataMap::iterator it = _collMetadata.find(ns);
        if (it != _collMetadata.end())
            afterMetadata = it->second;

        if (afterMetadata) {
            afterShardVersion = afterMetadata->getShardVersion();
            afterCollVersion = afterMetadata->getCollVersion();
        }

        *latestShardVersion = afterShardVersion;

        //
        // Resolve newer pending chunks with the remote metadata, finish construction
        //

        status = mdLoader.promotePendingChunks(afterMetadata.get(), remoteMetadataRaw);

        if (!status.isOK()) {
            warning() << "remote metadata for " << ns
                      << " is inconsistent with current pending chunks"
                      << causedBy(status.reason());

            return status;
        }

        //
        // Compare the 'before', 'after', and 'remote' versions/epochs and choose newest
        // Zero-epochs (sentinel value for "dropped" collections), are tested by
        // !epoch.isSet().
        //

        choice = ChunkVersion::chooseNewestVersion(
            beforeCollVersion, afterCollVersion, remoteCollVersion);

        if (choice == ChunkVersion::VersionChoice_Remote) {
            dassert(!remoteCollVersion.epoch().isSet() ||
                    remoteShardVersion >= beforeShardVersion);

            if (!afterCollVersion.epoch().isSet()) {
                // First metadata load
                installType = InstallType_New;
                dassert(it == _collMetadata.end());
                _collMetadata.insert(make_pair(ns, remoteMetadata));
            } else if (remoteCollVersion.epoch().isSet() &&
                       remoteCollVersion.epoch() == afterCollVersion.epoch()) {
                // Update to existing metadata
                installType = InstallType_Update;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert(it != _collMetadata.end());
                it->second = remoteMetadata;
            } else if (remoteCollVersion.epoch().isSet()) {
                // New epoch detected, replacing metadata
                installType = InstallType_Replace;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert(it != _collMetadata.end());
                it->second = remoteMetadata;
            } else {
                dassert(!remoteCollVersion.epoch().isSet());

                // Drop detected
                installType = InstallType_Drop;
                _collMetadata.erase(it);
            }

            *latestShardVersion = remoteShardVersion;
        }
    }  // End _mutex
    // End DBWrite

    //
    // Do messaging based on what happened above
    //

    string localShardVersionMsg = beforeShardVersion.epoch() == afterShardVersion.epoch()
        ? afterShardVersion.toString()
        : beforeShardVersion.toString() + " / " + afterShardVersion.toString();

    if (choice == ChunkVersion::VersionChoice_Unknown) {
        string errMsg = str::stream()
            << "need to retry loading metadata for " << ns
            << ", collection may have been dropped or recreated during load"
            << " (loaded shard version : " << remoteShardVersion.toString()
            << ", stored shard versions : " << localShardVersionMsg << ", took "
            << refreshMillis << "ms)";

        warning() << errMsg;
        return Status(ErrorCodes::RemoteChangeDetected, errMsg);
    }

    if (choice == ChunkVersion::VersionChoice_Local) {
        LOG(0) << "metadata of collection " << ns
               << " already up to date (shard version : " << afterShardVersion.toString()
               << ", took " << refreshMillis << "ms)";
        return Status::OK();
    }

    dassert(choice == ChunkVersion::VersionChoice_Remote);

    switch (installType) {
        case InstallType_New:
            LOG(0) << "collection " << ns << " was previously unsharded"
                   << ", new metadata loaded with shard version " << remoteShardVersion;
            break;
        case InstallType_Update:
            LOG(0) << "updating metadata for " << ns << " from shard version "
                   << localShardVersionMsg << " to shard version " << remoteShardVersion;
            break;
        case InstallType_Replace:
            LOG(0) << "replacing metadata for " << ns << " at shard version "
                   << localShardVersionMsg << " with a new epoch (shard version "
                   << remoteShardVersion << ")";
            break;
        case InstallType_Drop:
            LOG(0) << "dropping metadata for " << ns << " at shard version "
                   << localShardVersionMsg << ", took " << refreshMillis << "ms";
            break;
        default:
            verify(false);
            break;
    }

    if (installType != InstallType_Drop) {
        LOG(0) << "collection version was loaded at version " << remoteCollVersion << ", took "
               << refreshMillis << "ms";
    }

    return Status::OK();
}
MigrationSourceManager::MigrationSourceManager(OperationContext* txn, MoveChunkRequest request)
    : _args(std::move(request)), _startTime() {
    invariant(!txn->lockState()->isLocked());

    const auto& oss = OperationShardingState::get(txn);
    if (!oss.hasShardVersion()) {
        uasserted(ErrorCodes::InvalidOptions, "collection version is missing");
    }

    // Even though the moveChunk command transmits a value in the operation's shardVersion field,
    // this value does not actually contain the shard version, but the global collection version.
    const ChunkVersion expectedCollectionVersion = oss.getShardVersion(_args.getNss());

    log() << "Starting chunk migration for "
          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
          << " with expected collection version " << expectedCollectionVersion;

    // Now that the collection is locked, snapshot the metadata and fetch the latest versions
    ShardingState* const shardingState = ShardingState::get(txn);

    ChunkVersion shardVersion;

    Status refreshStatus =
        shardingState->refreshMetadataNow(txn, _args.getNss().ns(), &shardVersion);
    if (!refreshStatus.isOK()) {
        uasserted(refreshStatus.code(),
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " due to " << refreshStatus.toString());
    }

    if (shardVersion.majorVersion() == 0) {
        // If the major version is zero, this means we do not have any chunks locally to migrate in
        // the first place
        uasserted(ErrorCodes::IncompatibleShardingMetadata,
                  str::stream() << "cannot start migrate of chunk "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " with zero shard version");
    }

    // Snapshot the committed metadata from the time the migration starts
    {
        ScopedTransaction scopedXact(txn, MODE_IS);
        AutoGetCollection autoColl(txn, _args.getNss(), MODE_IS);

        auto css = CollectionShardingState::get(txn, _args.getNss());
        _committedMetadata = css->getMetadata();
    }

    const ChunkVersion collectionVersion = _committedMetadata->getCollVersion();

    if (expectedCollectionVersion.epoch() != collectionVersion.epoch()) {
        throw SendStaleConfigException(
            _args.getNss().ns(),
            str::stream() << "cannot move chunk "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " because collection may have been dropped. "
                          << "current epoch: " << collectionVersion.epoch()
                          << ", cmd epoch: " << expectedCollectionVersion.epoch(),
            expectedCollectionVersion,
            collectionVersion);
    }

    // With nonzero shard version, we must have a coll version >= our shard version
    invariant(collectionVersion >= shardVersion);

    // With nonzero shard version, we must have a shard key
    invariant(!_committedMetadata->getKeyPattern().isEmpty());

    ChunkType origChunk;
    if (!_committedMetadata->getNextChunk(_args.getMinKey(), &origChunk)) {
        // If this assertion is hit, it means that whoever called the shard moveChunk command
        // (mongos or the CSRS balancer) did not check whether the chunk actually belongs to this
        // shard. It is a benign error and does not indicate data corruption.
        uasserted(40145,
                  str::stream() << "Chunk with bounds "
                                << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                                << " is not owned by this shard.");
    }

    uassert(40146,
            str::stream() << "Unable to find chunk with the exact bounds "
                          << ChunkRange(_args.getMinKey(), _args.getMaxKey()).toString()
                          << " at collection version " << collectionVersion.toString()
                          << ". This indicates corrupted metadata.",
            origChunk.getMin().woCompare(_args.getMinKey()) == 0 &&
                origChunk.getMax().woCompare(_args.getMaxKey()) == 0);
}
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
    // Steps
    // 1. check basic config
    // 2. extract params from command
    // 3. fast check
    // 4. slow check (LOCKS)

    // step 1
    lastError.disableForCommand();
    ShardedConnectionInfo* info = ShardedConnectionInfo::get( true );

    // make sure we have the mongos id for writebacks
    if ( ! checkMongosID( info , cmdObj["serverID"] , errmsg ) )
        return false;

    bool authoritative = cmdObj.getBoolField( "authoritative" );

    // check config server is ok or enable sharding
    if ( ! checkConfigOrInit( cmdObj["configdb"].valuestrsafe() , authoritative , errmsg , result ) )
        return false;

    // check shard name/hosts are correct
    if ( cmdObj["shard"].type() == String ) {
        shardingState.gotShardName( cmdObj["shard"].String() );
    }

    // Handle initial shard connection
    if( cmdObj["version"].eoo() && cmdObj["init"].trueValue() ){
        result.append( "initialized", true );

        // Send back wire version to let mongos know what protocol we can speak
        result.append( "minWireVersion", minWireVersion );
        result.append( "maxWireVersion", maxWireVersion );

        return true;
    }

    // we can run on a slave up to here
    if ( ! isMaster( "admin" ) ) {
        result.append( "errmsg" , "not master" );
        result.append( "note" , "from post init in setShardVersion" );
        return false;
    }

    // step 2
    string ns = cmdObj["setShardVersion"].valuestrsafe();
    if ( ns.size() == 0 ) {
        errmsg = "need to specify namespace";
        return false;
    }

    if( ! ChunkVersion::canParseBSON( cmdObj, "version" ) ){
        errmsg = "need to specify version";
        return false;
    }

    const ChunkVersion version = ChunkVersion::fromBSON( cmdObj, "version" );

    // step 3
    const ChunkVersion oldVersion = info->getVersion(ns);
    const ChunkVersion globalVersion = shardingState.getVersion(ns);

    oldVersion.addToBSON( result, "oldVersion" );

    if ( globalVersion.isSet() && version.isSet() ) {
        // this means there is no reset going on on either side,
        // so it's safe to make some assumptions
        if ( version.isWriteCompatibleWith( globalVersion ) ) {
            // mongos and mongod agree!
            if ( ! oldVersion.isWriteCompatibleWith( version ) ) {
                if ( oldVersion < globalVersion &&
                     oldVersion.hasCompatibleEpoch(globalVersion) ) {
                    info->setVersion( ns , version );
                }
                else if ( authoritative ) {
                    // this means there was a drop and our version is reset
                    info->setVersion( ns , version );
                }
                else {
                    result.append( "ns" , ns );
                    result.appendBool( "need_authoritative" , true );
                    errmsg = "verifying drop on '" + ns + "'";
                    return false;
                }
            }
            return true;
        }
    }

    // step 4
    // this is because of a weird segfault I saw and I can't see why this should ever be set
    massert( 13647 ,
             str::stream() << "context should be empty here, is: " << cc().getContext()->ns() ,
             cc().getContext() == 0 );

    if ( oldVersion.isSet() && ! globalVersion.isSet() ) {
        // this had been reset
        info->setVersion( ns , ChunkVersion( 0, OID() ) );
    }

    if ( ! version.isSet() && ! globalVersion.isSet() ) {
        // this connection is cleaning itself
        info->setVersion( ns , ChunkVersion( 0, OID() ) );
        return true;
    }

    // Cases below all either return OR fall-through to remote metadata reload.
    if ( version.isSet() || !globalVersion.isSet() ) {
        // Not Dropping

        // TODO: Refactor all of this
        if ( version < oldVersion && version.hasCompatibleEpoch( oldVersion ) ) {
            errmsg = "this connection already had a newer version of collection '" + ns + "'";
            result.append( "ns" , ns );
            version.addToBSON( result, "newVersion" );
            globalVersion.addToBSON( result, "globalVersion" );
            return false;
        }

        // TODO: Refactor all of this
        if ( version < globalVersion && version.hasCompatibleEpoch( globalVersion ) ) {
            while ( shardingState.inCriticalMigrateSection() ) {
                log() << "waiting till out of critical section" << endl;
                shardingState.waitTillNotInCriticalSection( 10 );
            }
            errmsg = "shard global version for collection is higher than trying to set to '"
                     + ns + "'";
            result.append( "ns" , ns );
            version.addToBSON( result, "version" );
            globalVersion.addToBSON( result, "globalVersion" );
            result.appendBool( "reloadConfig" , true );
            return false;
        }

        if ( ! globalVersion.isSet() && ! authoritative ) {
            // Needed b/c when the last chunk is moved off a shard, the version gets reset to
            // zero, which should require a reload.
            while ( shardingState.inCriticalMigrateSection() ) {
                log() << "waiting till out of critical section" << endl;
                shardingState.waitTillNotInCriticalSection( 10 );
            }

            // need authoritative for first look
            result.append( "ns" , ns );
            result.appendBool( "need_authoritative" , true );
            errmsg = "first time for collection '" + ns + "'";
            return false;
        }

        // Fall through to metadata reload below
    }
    else {
        // Dropping

        if ( ! authoritative ) {
            result.appendBool( "need_authoritative" , true );
            result.append( "ns" , ns );
            globalVersion.addToBSON( result, "globalVersion" );
            errmsg = "dropping needs to be authoritative";
            return false;
        }

        // Fall through to metadata reload below
    }

    ChunkVersion currVersion;
    Status status = shardingState.refreshMetadataIfNeeded( ns, version, &currVersion );

    if (!status.isOK()) {
        // The reload itself was interrupted or confused here
        errmsg = str::stream() << "could not refresh metadata for " << ns
                               << " with requested shard version " << version.toString()
                               << ", stored shard version is " << currVersion.toString()
                               << causedBy( status.reason() );

        warning() << errmsg << endl;

        result.append( "ns" , ns );
        version.addToBSON( result, "version" );
        currVersion.addToBSON( result, "globalVersion" );
        result.appendBool( "reloadConfig", true );

        return false;
    }
    else if ( !version.isWriteCompatibleWith( currVersion ) ) {
        // We reloaded a version that doesn't match the version mongos was trying to
        // set.
        errmsg = str::stream() << "requested shard version differs from"
                               << " config shard version for " << ns
                               << ", requested version is " << version.toString()
                               << " but found version " << currVersion.toString();

        OCCASIONALLY warning() << errmsg << endl;

        // WARNING: the exact fields below are important for compatibility with mongos
        // version reload.

        result.append( "ns" , ns );
        currVersion.addToBSON( result, "globalVersion" );

        // If this was a reset of a collection or the last chunk moved out, inform mongos to
        // do a full reload.
        if (currVersion.epoch() != version.epoch() || !currVersion.isSet() ) {
            result.appendBool( "reloadConfig", true );
            // Zero-version also needed to trigger full mongos reload, sadly
            // TODO: Make this saner, and less impactful (full reload on last chunk is bad)
            ChunkVersion( 0, 0, OID() ).addToBSON( result, "version" );
            // For debugging
            version.addToBSON( result, "origVersion" );
        }
        else {
            version.addToBSON( result, "version" );
        }

        return false;
    }

    info->setVersion( ns , version );
    return true;
}
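
// The checks above hinge on ChunkVersion::isWriteCompatibleWith. An illustrative
// stand-in (using VersionSketch from the earlier sketch, and assuming the
// historical semantics: same epoch and same major version, because minor-only
// differences produced by splits do not change which shard owns a key):
bool writeCompatible(const VersionSketch& a, const VersionSketch& b) {
    return a.epoch == b.epoch && a.major == b.major;
}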
Status ShardingState::doRefreshMetadata( const string& ns,
                                         const ChunkVersion& reqShardVersion,
                                         bool useRequestedVersion,
                                         ChunkVersion* latestShardVersion )
{
    // The idea here is that we're going to reload the metadata from the config server, but
    // we need to do so outside any locks. When we get our result back, if the current metadata
    // has changed, we may not be able to install the new metadata.

    //
    // Get the initial metadata
    // No DBLock is needed since the metadata is expected to change during reload.
    //

    CollectionMetadataPtr beforeMetadata;
    string shardName;
    {
        scoped_lock lk( _mutex );
        CollectionMetadataMap::iterator it = _collMetadata.find( ns );
        if ( it != _collMetadata.end() ) beforeMetadata = it->second;
        shardName = _shardName;
    }

    ChunkVersion beforeShardVersion;
    ChunkVersion beforeCollVersion;
    if ( beforeMetadata ) {
        beforeShardVersion = beforeMetadata->getShardVersion();
        beforeCollVersion = beforeMetadata->getCollVersion();
    }

    *latestShardVersion = beforeShardVersion;

    // We can't reload without a shard name.  Must check here before loading, since shard name
    // may have changed if we checked it earlier and released the _mutex.
    if ( shardName.empty() ) {
        string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                      << " before shard name has been set";

        LOG( 0 ) << errMsg << endl;
        return Status( ErrorCodes::IllegalOperation, errMsg );
    }

    //
    // Determine whether we need to diff or fully reload
    //

    bool fullReload = false;
    if ( !beforeMetadata ) {
        // We don't have any metadata to reload from
        fullReload = true;
    }
    else if ( useRequestedVersion && reqShardVersion.epoch() != beforeShardVersion.epoch() ) {
        // It's not useful to use the metadata as a base because we think the epoch will differ
        fullReload = true;
    }

    //
    // Load the metadata from the remote server, start construction
    //

    LOG( 0 ) << "remotely refreshing metadata for " << ns
             << ( useRequestedVersion ?
                      string( " with requested shard version " ) + reqShardVersion.toString() :
                      "" )
             << ( fullReload ?
                      ", current shard version is " : " based on current shard version " )
             << beforeShardVersion
             << ", current metadata version is " << beforeCollVersion << endl;

    string errMsg;
    ConnectionString configServerLoc = ConnectionString::parse( _configServer, errMsg );
    MetadataLoader mdLoader( configServerLoc );
    CollectionMetadata* remoteMetadataRaw = new CollectionMetadata();
    CollectionMetadataPtr remoteMetadata( remoteMetadataRaw );

    Timer refreshTimer;
    Status status = mdLoader.makeCollectionMetadata( ns,
                                                     shardName,
                                                     ( fullReload ? NULL : beforeMetadata.get() ),
                                                     remoteMetadataRaw );
    long long refreshMillis = refreshTimer.millis();

    if ( status.code() == ErrorCodes::NamespaceNotFound ) {
        remoteMetadata.reset();
        remoteMetadataRaw = NULL;
    }
    else if ( !status.isOK() ) {
        warning() << "could not remotely refresh metadata for " << ns
                  << causedBy( status.reason() ) << endl;

        return status;
    }

    ChunkVersion remoteShardVersion;
    ChunkVersion remoteCollVersion;
    if ( remoteMetadata ) {
        remoteShardVersion = remoteMetadata->getShardVersion();
        remoteCollVersion = remoteMetadata->getCollVersion();
    }

    //
    // Get ready to install loaded metadata if needed
    //

    CollectionMetadataPtr afterMetadata;
    ChunkVersion afterShardVersion;
    ChunkVersion afterCollVersion;
    ChunkVersion::VersionChoice choice;

    // If we choose to install the new metadata, this describes the kind of install
    enum InstallType {
        InstallType_New, InstallType_Update, InstallType_Replace, InstallType_Drop,
        InstallType_None
    } installType = InstallType_None; // compiler complains otherwise

    {
        // DBLock needed since we're now potentially changing the metadata, and don't want
        // reads/writes to be ongoing.
        Lock::DBWrite writeLk( ns );

        //
        // Get the metadata now that the load has completed
        //

        scoped_lock lk( _mutex );
        CollectionMetadataMap::iterator it = _collMetadata.find( ns );
        if ( it != _collMetadata.end() ) afterMetadata = it->second;

        if ( afterMetadata ) {
            afterShardVersion = afterMetadata->getShardVersion();
            afterCollVersion = afterMetadata->getCollVersion();
        }

        *latestShardVersion = afterShardVersion;

        //
        // Resolve newer pending chunks with the remote metadata, finish construction
        //

        status = mdLoader.promotePendingChunks( afterMetadata.get(), remoteMetadataRaw );

        if ( !status.isOK() ) {
            warning() << "remote metadata for " << ns
                      << " is inconsistent with current pending chunks"
                      << causedBy( status.reason() ) << endl;

            return status;
        }

        //
        // Compare the 'before', 'after', and 'remote' versions/epochs and choose newest
        // Zero-epochs (sentinel value for "dropped" collections), are tested by
        // !epoch.isSet().
        //

        choice = ChunkVersion::chooseNewestVersion( beforeCollVersion,
                                                    afterCollVersion,
                                                    remoteCollVersion );

        if ( choice == ChunkVersion::VersionChoice_Remote ) {
            dassert(!remoteCollVersion.epoch().isSet() ||
                    remoteShardVersion >= beforeShardVersion);

            if ( !afterCollVersion.epoch().isSet() ) {
                // First metadata load
                installType = InstallType_New;
                dassert( it == _collMetadata.end() );
                _collMetadata.insert( make_pair( ns, remoteMetadata ) );
            }
            else if ( remoteCollVersion.epoch().isSet() &&
                      remoteCollVersion.epoch() == afterCollVersion.epoch() ) {
                // Update to existing metadata
                installType = InstallType_Update;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert( it != _collMetadata.end() );
                it->second = remoteMetadata;
            }
            else if ( remoteCollVersion.epoch().isSet() ) {
                // New epoch detected, replacing metadata
                installType = InstallType_Replace;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert( it != _collMetadata.end() );
                it->second = remoteMetadata;
            }
            else {
                dassert( !remoteCollVersion.epoch().isSet() );

                // Drop detected
                installType = InstallType_Drop;
                _collMetadata.erase( it );
            }

            *latestShardVersion = remoteShardVersion;
        }
    } // End _mutex
    // End DBWrite

    //
    // Do messaging based on what happened above
    //

    string versionMsg = str::stream()
        << " (loaded metadata version : " << remoteCollVersion.toString()
        << ( beforeCollVersion.epoch() == afterCollVersion.epoch() ?
                 string( ", stored version : " ) + afterCollVersion.toString() :
                 string( ", stored versions : " ) + beforeCollVersion.toString()
                     + " / " + afterCollVersion.toString() )
        << ", took " << refreshMillis << "ms)";

    if ( choice == ChunkVersion::VersionChoice_Unknown ) {
        string errMsg = str::stream()
            << "need to retry loading metadata for " << ns
            << ", collection may have been dropped or recreated during load" << versionMsg;

        warning() << errMsg << endl;
        return Status( ErrorCodes::RemoteChangeDetected, errMsg );
    }

    if ( choice == ChunkVersion::VersionChoice_Local ) {
        LOG( 0 ) << "newer metadata not found for " << ns << versionMsg << endl;
        return Status::OK();
    }

    dassert( choice == ChunkVersion::VersionChoice_Remote );

    switch( installType ) {
    case InstallType_New:
        LOG( 0 ) << "loaded new metadata for " << ns << versionMsg << endl;
        break;
    case InstallType_Update:
        LOG( 0 ) << "loaded newer metadata for " << ns << versionMsg << endl;
        break;
    case InstallType_Replace:
        LOG( 0 ) << "replacing metadata for " << ns << versionMsg << endl;
        break;
    case InstallType_Drop:
        LOG( 0 ) << "dropping metadata for " << ns << versionMsg << endl;
        break;
    default:
        verify( false );
        break;
    }

    return Status::OK();
}
bool mergeChunks(OperationContext* txn,
                 const NamespaceString& nss,
                 const BSONObj& minKey,
                 const BSONObj& maxKey,
                 const OID& epoch,
                 string* errMsg) {
    // Get the distributed lock
    string whyMessage = stream() << "merging chunks in " << nss.ns() << " from " << minKey
                                 << " to " << maxKey;
    auto scopedDistLock = grid.catalogManager(txn)->distLock(
        txn, nss.ns(), whyMessage, DistLockManager::kSingleLockAttemptTimeout);

    if (!scopedDistLock.isOK()) {
        *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                           << " to merge chunks in [" << minKey << "," << maxKey << ")"
                           << causedBy(scopedDistLock.getStatus());

        warning() << *errMsg;
        return false;
    }

    ShardingState* shardingState = ShardingState::get(txn);

    //
    // We now have the collection lock, refresh metadata to latest version and sanity check
    //

    ChunkVersion shardVersion;
    Status status = shardingState->refreshMetadataNow(txn, nss.ns(), &shardVersion);

    if (!status.isOK()) {
        *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                << nss.ns() << causedBy(status.reason());

        warning() << *errMsg;
        return false;
    }

    if (epoch.isSet() && shardVersion.epoch() != epoch) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " has changed" << " since merge was sent"
                           << "(sent epoch : " << epoch.toString()
                           << ", current epoch : " << shardVersion.epoch().toString() << ")";

        warning() << *errMsg;
        return false;
    }

    shared_ptr<CollectionMetadata> metadata = shardingState->getCollectionMetadata(nss.ns());

    if (!metadata || metadata->getKeyPattern().isEmpty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " is not sharded";

        warning() << *errMsg;
        return false;
    }

    dassert(metadata->getShardVersion().equals(shardVersion));

    if (!metadata->isValidKey(minKey) || !metadata->isValidKey(maxKey)) {
        *errMsg = stream() << "could not merge chunks, the range "
                           << rangeToString(minKey, maxKey) << " is not valid"
                           << " for collection " << nss.ns() << " with key pattern "
                           << metadata->getKeyPattern();

        warning() << *errMsg;
        return false;
    }

    //
    // Get merged chunk information
    //

    ChunkVersion mergeVersion = metadata->getCollVersion();
    mergeVersion.incMinor();

    std::vector<ChunkType> chunksToMerge;

    ChunkType itChunk;
    itChunk.setMin(minKey);
    itChunk.setMax(minKey);
    itChunk.setNS(nss.ns());
    itChunk.setShard(shardingState->getShardName());

    while (itChunk.getMax().woCompare(maxKey) < 0 &&
           metadata->getNextChunk(itChunk.getMax(), &itChunk)) {
        chunksToMerge.push_back(itChunk);
    }

    if (chunksToMerge.empty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " and ending at " << maxKey
                           << " does not belong to shard " << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    //
    // Validate the range starts and ends at chunks and has no holes, error if not valid
    //

    BSONObj firstDocMin = chunksToMerge.front().getMin();
    BSONObj firstDocMax = chunksToMerge.front().getMax();
    // minKey is inclusive
    bool minKeyInRange = rangeContains(firstDocMin, firstDocMax, minKey);

    if (!minKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " does not belong to shard "
                           << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    BSONObj lastDocMin = chunksToMerge.back().getMin();
    BSONObj lastDocMax = chunksToMerge.back().getMax();
    // maxKey is exclusive
    bool maxKeyInRange = lastDocMin.woCompare(maxKey) < 0 && lastDocMax.woCompare(maxKey) >= 0;

    if (!maxKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range ending at " << maxKey << " does not belong to shard "
                           << shardingState->getShardName();

        warning() << *errMsg;
        return false;
    }

    bool validRangeStartKey = firstDocMin.woCompare(minKey) == 0;
    bool validRangeEndKey = lastDocMax.woCompare(maxKey) == 0;

    if (!validRangeStartKey || !validRangeEndKey) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " does not contain a chunk "
                           << (!validRangeStartKey ? "starting at " + minKey.toString() : "")
                           << (!validRangeStartKey && !validRangeEndKey ? " or " : "")
                           << (!validRangeEndKey ? "ending at " + maxKey.toString() : "");

        warning() << *errMsg;
        return false;
    }

    if (chunksToMerge.size() == 1) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " already contains chunk for " << rangeToString(minKey, maxKey);

        warning() << *errMsg;
        return false;
    }

    // Look for hole in range
    for (size_t i = 1; i < chunksToMerge.size(); ++i) {
        if (chunksToMerge[i - 1].getMax().woCompare(chunksToMerge[i].getMin()) != 0) {
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has a hole in the range " << rangeToString(minKey, maxKey)
                               << " at "
                               << rangeToString(chunksToMerge[i - 1].getMax(),
                                                chunksToMerge[i].getMin());

            warning() << *errMsg;
            return false;
        }
    }

    //
    // Run apply ops command
    //

    Status applyOpsStatus = runApplyOpsCmd(txn, chunksToMerge, shardVersion, mergeVersion);
    if (!applyOpsStatus.isOK()) {
        warning() << applyOpsStatus;
        return false;
    }

    //
    // Install merged chunk metadata
    //

    {
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock writeLk(txn->lockState(), nss.db(), MODE_IX);
        Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X);

        shardingState->mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
    }

    //
    // Log change
    //

    BSONObj mergeLogEntry = buildMergeLogEntry(chunksToMerge, shardVersion, mergeVersion);

    grid.catalogManager(txn)->logChange(txn, "merge", nss.ns(), mergeLogEntry);

    return true;
}