static bool checkShardVersion( ShardingState* shardingState,
                               const BatchedCommandRequest& request,
                               WriteErrorDetail** error ) {

    const NamespaceString nss( request.getTargetingNS() );
    Lock::assertWriteLocked( nss.ns() );

    ChunkVersion requestShardVersion =
        request.isMetadataSet() && request.getMetadata()->isShardVersionSet() ?
            request.getMetadata()->getShardVersion() : ChunkVersion::IGNORED();

    if ( shardingState->enabled() ) {

        CollectionMetadataPtr metadata = shardingState->getCollectionMetadata( nss.ns() );

        if ( !ChunkVersion::isIgnoredVersion( requestShardVersion ) ) {

            ChunkVersion shardVersion =
                metadata ? metadata->getShardVersion() : ChunkVersion::UNSHARDED();

            if ( !requestShardVersion.isWriteCompatibleWith( shardVersion ) ) {
                *error = new WriteErrorDetail;
                buildStaleError( requestShardVersion, shardVersion, *error );
                return false;
            }
        }
    }

    return true;
}
static bool checkShardVersion(OperationContext* txn,
                              ShardingState* shardingState,
                              const BatchedCommandRequest& request,
                              WriteOpResult* result) {

    const NamespaceString nss( request.getTargetingNS() );
    txn->lockState()->assertWriteLocked( nss.ns() );

    ChunkVersion requestShardVersion =
        request.isMetadataSet() && request.getMetadata()->isShardVersionSet() ?
            request.getMetadata()->getShardVersion() : ChunkVersion::IGNORED();

    if ( shardingState->enabled() ) {

        CollectionMetadataPtr metadata = shardingState->getCollectionMetadata( nss.ns() );

        if ( !ChunkVersion::isIgnoredVersion( requestShardVersion ) ) {

            ChunkVersion shardVersion =
                metadata ? metadata->getShardVersion() : ChunkVersion::UNSHARDED();

            if ( !requestShardVersion.isWriteCompatibleWith( shardVersion ) ) {
                result->setError(new WriteErrorDetail);
                buildStaleError(requestShardVersion, shardVersion, result->getError());
                return false;
            }
        }
    }

    return true;
}
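// Illustrative caller for the newer overload above -- a sketch, not code from
// the tree. Assumes `txn`, `request`, and a ShardingState* `shardingState` are
// already in scope in the executing write path.
//
//     WriteOpResult result;
//     if (!checkShardVersion(txn, shardingState, request, &result)) {
//         // Stale shard version: result now owns the WriteErrorDetail built
//         // by buildStaleError(); the caller reports it and skips this write.
//         return;
//     }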
bool ShardingState::hasVersion( const string& ns , ChunkVersion& version ) {
    scoped_lock lk(_mutex);

    CollectionMetadataMap::const_iterator it = _collMetadata.find(ns);
    if ( it == _collMetadata.end() )
        return false;

    CollectionMetadataPtr p = it->second;
    version = p->getShardVersion();
    return true;
}
TEST_F(MergeChunkTests, CompoundMerge) {
    const NamespaceString nss("foo.bar");
    const BSONObj kp = BSON("x" << 1 << "y" << 1);
    const OID epoch = OID::gen();
    vector<KeyRange> ranges;

    // Setup chunk metadata
    ranges.push_back(
        KeyRange(nss.ns(), BSON("x" << 0 << "y" << 1), BSON("x" << 1 << "y" << 0), kp));
    ranges.push_back(
        KeyRange(nss.ns(), BSON("x" << 1 << "y" << 0), BSON("x" << 2 << "y" << 1), kp));
    storeCollectionRanges(nss, shardName(), ranges, ChunkVersion(1, 0, epoch));

    // Get latest version
    ChunkVersion latestVersion;
    ShardingState::get(getGlobalServiceContext())
        ->refreshMetadataNow(&_txn, nss.ns(), &latestVersion);
    ShardingState::get(getGlobalServiceContext())->resetMetadata(nss.ns());

    // Do merge
    string errMsg;
    bool result = mergeChunks(
        &_txn, nss, BSON("x" << 0 << "y" << 1), BSON("x" << 2 << "y" << 1), epoch, &errMsg);
    ASSERT_EQUALS(errMsg, "");
    ASSERT(result);

    // Verify result
    CollectionMetadataPtr metadata =
        ShardingState::get(getGlobalServiceContext())->getCollectionMetadata(nss.ns());
    ChunkType chunk;
    ASSERT(metadata->getNextChunk(BSON("x" << 0 << "y" << 1), &chunk));
    ASSERT(chunk.getMin().woCompare(BSON("x" << 0 << "y" << 1)) == 0);
    ASSERT(chunk.getMax().woCompare(BSON("x" << 2 << "y" << 1)) == 0);
    ASSERT_EQUALS(metadata->getNumChunks(), 1u);

    ASSERT_EQUALS(metadata->getShardVersion().majorVersion(), latestVersion.majorVersion());
    ASSERT_GREATER_THAN(metadata->getShardVersion().minorVersion(),
                        latestVersion.minorVersion());

    assertWrittenAsMerged(ranges);
}
const ChunkVersion ShardingState::getVersion( const string& ns ) const {
    scoped_lock lk(_mutex);

    CollectionMetadataMap::const_iterator it = _collMetadata.find( ns );
    if ( it != _collMetadata.end() ) {
        CollectionMetadataPtr p = it->second;
        return p->getShardVersion();
    }
    else {
        return ChunkVersion( 0, OID() );
    }
}
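// A minimal usage sketch (namespace string assumed) contrasting the two
// accessors above: hasVersion() reports whether any metadata exists, while
// getVersion() folds the missing case into the sentinel ChunkVersion(0, OID()).
//
//     ChunkVersion v;
//     if (shardingState.hasVersion("test.users", v)) {
//         // v is the current shard version for test.users
//     }
//     ChunkVersion always = shardingState.getVersion("test.users"); // 0|0 if absent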
void ShardingState::appendInfo( BSONObjBuilder& b ) {
    b.appendBool( "enabled" , _enabled );
    if ( ! _enabled )
        return;

    b.append( "configServer" , _configServer );
    b.append( "shardName" , _shardName );

    {
        BSONObjBuilder bb( b.subobjStart( "versions" ) );

        scoped_lock lk(_mutex);

        for ( CollectionMetadataMap::iterator it = _collMetadata.begin();
              it != _collMetadata.end();
              ++it ) {
            CollectionMetadataPtr p = it->second;
            bb.appendTimestamp( it->first , p->getShardVersion().toLong() );
        }

        bb.done();
    }
}
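// Illustrative shape of the document appendInfo() builds (field values are
// invented for the example). Each entry under "versions" is the collection's
// shard version packed into a timestamp via ChunkVersion::toLong():
//
//   {
//     enabled: true,
//     configServer: "cfg1:27019,cfg2:27019,cfg3:27019",
//     shardName: "shard0000",
//     versions: { "test.users": Timestamp(3, 2) }
//   }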
bool WriteBatchExecutor::doWrite( const string& ns,
                                  const BatchItemRef& itemRef,
                                  CurOp* currentOp,
                                  WriteStats* stats,
                                  BSONObj* upsertedID,
                                  BatchedErrorDetail* error ) {

    const BatchedCommandRequest& request = *itemRef.getRequest();
    int index = itemRef.getItemIndex();

    //
    // Check our shard version if we need to (must be in the write lock)
    //

    CollectionMetadataPtr metadata;
    if ( shardingState.enabled() ) {

        // Index inserts make the namespace nontrivial for versioning
        string targetingNS = itemRef.getRequest()->getTargetingNS();
        Lock::assertWriteLocked( targetingNS );
        metadata = shardingState.getCollectionMetadata( targetingNS );

        if ( request.isShardVersionSet()
             && !ChunkVersion::isIgnoredVersion( request.getShardVersion() ) ) {

            ChunkVersion shardVersion =
                metadata ? metadata->getShardVersion() : ChunkVersion::UNSHARDED();

            if ( !request.getShardVersion().isWriteCompatibleWith( shardVersion ) ) {
                buildStaleError( request.getShardVersion(), shardVersion, error );
                return false;
            }
        }
    }

    //
    // Not stale, do the actual write
    //

    if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert ) {

        // Need to check for unique index problems
        if ( metadata && request.isUniqueIndexRequest() ) {
            if ( !isUniqueIndexCompatible( metadata->getKeyPattern(),
                                           request.getIndexKeyPattern() ) ) {
                buildUniqueIndexError( metadata->getKeyPattern(),
                                       request.getIndexKeyPattern(),
                                       error );
                return false;
            }
        }

        // Insert
        return doInsert( ns,
                         request.getInsertRequest()->getDocumentsAt( index ),
                         currentOp,
                         stats,
                         error );
    }
    else if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) {

        // TODO: Pass down immutable shard key fields

        // Update
        return doUpdate( ns,
                         *request.getUpdateRequest()->getUpdatesAt( index ),
                         currentOp,
                         stats,
                         upsertedID,
                         error );
    }
    else {
        dassert( request.getBatchType() == BatchedCommandRequest::BatchType_Delete );

        // Delete
        return doDelete( ns,
                         *request.getDeleteRequest()->getDeletesAt( index ),
                         currentOp,
                         stats,
                         error );
    }
}
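// Sketch of the compatibility rule the isUniqueIndexCompatible() check above
// enforces, under the usual sharding constraint (an assumption here, not taken
// from this file): uniqueness can only be guaranteed per-shard when the shard
// key is a prefix of the unique index key, e.g.
//
//   shard key { a: 1 },  unique index { a: 1, b: 1 }  -> compatible
//   shard key { a: 1 },  unique index { b: 1 }        -> incompatible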
bool mergeChunks( OperationContext* txn,
                  const NamespaceString& nss,
                  const BSONObj& minKey,
                  const BSONObj& maxKey,
                  const OID& epoch,
                  string* errMsg ) {

    //
    // Get sharding state up-to-date
    //

    ConnectionString configLoc = ConnectionString::parse( shardingState.getConfigServer(),
                                                          *errMsg );
    if ( !configLoc.isValid() ){
        warning() << *errMsg << endl;
        return false;
    }

    //
    // Get the distributed lock
    //

    ScopedDistributedLock collLock( configLoc, nss.ns() );
    collLock.setLockMessage( stream() << "merging chunks in " << nss.ns() << " from "
                                      << minKey << " to " << maxKey );

    Status acquisitionStatus = collLock.tryAcquire();
    if (!acquisitionStatus.isOK()) {
        *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                           << " to merge chunks in [" << minKey << "," << maxKey << ")"
                           << causedBy(acquisitionStatus);

        warning() << *errMsg << endl;
        return false;
    }

    //
    // We now have the collection lock, refresh metadata to latest version and sanity check
    //

    ChunkVersion shardVersion;
    Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion);

    if ( !status.isOK() ) {
        *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                << nss.ns() << causedBy( status.reason() );

        warning() << *errMsg << endl;
        return false;
    }

    if ( epoch.isSet() && shardVersion.epoch() != epoch ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " has changed since merge was sent (sent epoch : "
                           << epoch.toString()
                           << ", current epoch : " << shardVersion.epoch().toString() << ")";

        warning() << *errMsg << endl;
        return false;
    }

    CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( nss.ns() );

    if ( !metadata || metadata->getKeyPattern().isEmpty() ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " is not sharded";

        warning() << *errMsg << endl;
        return false;
    }

    dassert( metadata->getShardVersion().equals( shardVersion ) );

    if ( !metadata->isValidKey( minKey ) || !metadata->isValidKey( maxKey ) ) {
        *errMsg = stream() << "could not merge chunks, the range "
                           << rangeToString( minKey, maxKey ) << " is not valid"
                           << " for collection " << nss.ns() << " with key pattern "
                           << metadata->getKeyPattern();

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Get merged chunk information
    //

    ChunkVersion mergeVersion = metadata->getCollVersion();
    mergeVersion.incMinor();

    OwnedPointerVector<ChunkType> chunksToMerge;

    ChunkType itChunk;
    itChunk.setMin( minKey );
    itChunk.setMax( minKey );
    itChunk.setNS( nss.ns() );
    itChunk.setShard( shardingState.getShardName() );

    while ( itChunk.getMax().woCompare( maxKey ) < 0
            && metadata->getNextChunk( itChunk.getMax(), &itChunk ) ) {
        auto_ptr<ChunkType> saved( new ChunkType );
        itChunk.cloneTo( saved.get() );
        chunksToMerge.mutableVector().push_back( saved.release() );
    }

    if ( chunksToMerge.empty() ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " and ending at " << maxKey
                           << " does not belong to shard " << shardingState.getShardName();

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Validate the range starts and ends at chunks and has no holes, error if not valid
    //

    BSONObj firstDocMin = ( *chunksToMerge.begin() )->getMin();
    BSONObj firstDocMax = ( *chunksToMerge.begin() )->getMax();
    // minKey is inclusive
    bool minKeyInRange = rangeContains( firstDocMin, firstDocMax, minKey );

    if ( !minKeyInRange ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " does not belong to shard "
                           << shardingState.getShardName();

        warning() << *errMsg << endl;
        return false;
    }

    BSONObj lastDocMin = ( *chunksToMerge.rbegin() )->getMin();
    BSONObj lastDocMax = ( *chunksToMerge.rbegin() )->getMax();
    // maxKey is exclusive
    bool maxKeyInRange = lastDocMin.woCompare( maxKey ) < 0
                         && lastDocMax.woCompare( maxKey ) >= 0;

    if ( !maxKeyInRange ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range ending at " << maxKey << " does not belong to shard "
                           << shardingState.getShardName();

        warning() << *errMsg << endl;
        return false;
    }

    bool validRangeStartKey = firstDocMin.woCompare( minKey ) == 0;
    bool validRangeEndKey = lastDocMax.woCompare( maxKey ) == 0;

    if ( !validRangeStartKey || !validRangeEndKey ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " does not contain a chunk "
                           << ( !validRangeStartKey ? "starting at " + minKey.toString() : "" )
                           << ( !validRangeStartKey && !validRangeEndKey ? " or " : "" )
                           << ( !validRangeEndKey ? "ending at " + maxKey.toString() : "" );

        warning() << *errMsg << endl;
        return false;
    }

    if ( chunksToMerge.size() == 1 ) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " already contains chunk for " << rangeToString( minKey, maxKey );

        warning() << *errMsg << endl;
        return false;
    }

    bool holeInRange = false;

    // Look for hole in range
    ChunkType* prevChunk = *chunksToMerge.begin();
    ChunkType* nextChunk = NULL;
    for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin();
          it != chunksToMerge.end();
          ++it ) {
        if ( it == chunksToMerge.begin() )
            continue;

        nextChunk = *it;
        if ( prevChunk->getMax().woCompare( nextChunk->getMin() ) != 0 ) {
            holeInRange = true;
            break;
        }
        prevChunk = nextChunk;
    }

    if ( holeInRange ) {
        dassert( NULL != nextChunk );
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " has a hole in the range " << rangeToString( minKey, maxKey )
                           << " at " << rangeToString( prevChunk->getMax(),
                                                       nextChunk->getMin() );

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Run apply ops command
    //

    BSONObj applyOpsCmd = buildApplyOpsCmd( chunksToMerge, shardVersion, mergeVersion );

    bool ok;
    BSONObj result;
    try {
        ScopedDbConnection conn( configLoc, 30.0 );
        ok = conn->runCommand( "config", applyOpsCmd, result );
        if ( !ok ) *errMsg = result.toString();
        conn.done();
    }
    catch( const DBException& ex ) {
        ok = false;
        *errMsg = ex.toString();
    }

    if ( !ok ) {
        *errMsg = stream() << "could not merge chunks for " << nss.ns()
                           << ", writing to config failed" << causedBy( errMsg );

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Install merged chunk metadata
    //

    {
        Lock::DBLock writeLk(txn->lockState(), nss.db(), newlm::MODE_X);
        shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
    }

    //
    // Log change
    //

    BSONObj mergeLogEntry = buildMergeLogEntry( chunksToMerge, shardVersion, mergeVersion );

    configServer.logChange( "merge", nss.ns(), mergeLogEntry );

    return true;
}
Status ShardingState::doRefreshMetadata( const string& ns,
                                         const ChunkVersion& reqShardVersion,
                                         bool useRequestedVersion,
                                         ChunkVersion* latestShardVersion ) {

    // The idea here is that we're going to reload the metadata from the config server, but
    // we need to do so outside any locks. When we get our result back, if the current
    // metadata has changed, we may not be able to install the new metadata.

    //
    // Get the initial metadata
    // No DBLock is needed since the metadata is expected to change during reload.
    //

    CollectionMetadataPtr beforeMetadata;
    string shardName;
    {
        scoped_lock lk( _mutex );
        CollectionMetadataMap::iterator it = _collMetadata.find( ns );
        if ( it != _collMetadata.end() ) beforeMetadata = it->second;
        shardName = _shardName;
    }

    ChunkVersion beforeShardVersion;
    ChunkVersion beforeCollVersion;
    if ( beforeMetadata ) {
        beforeShardVersion = beforeMetadata->getShardVersion();
        beforeCollVersion = beforeMetadata->getCollVersion();
    }

    *latestShardVersion = beforeShardVersion;

    // We can't reload without a shard name. Must check here before loading, since shard
    // name may have changed if we checked it earlier and released the _mutex.
    if ( shardName.empty() ) {

        string errMsg = str::stream() << "cannot refresh metadata for " << ns
                                      << " before shard name has been set";

        LOG( 0 ) << errMsg << endl;
        return Status( ErrorCodes::IllegalOperation, errMsg );
    }

    //
    // Determine whether we need to diff or fully reload
    //

    bool fullReload = false;
    if ( !beforeMetadata ) {
        // We don't have any metadata to reload from
        fullReload = true;
    }
    else if ( useRequestedVersion && reqShardVersion.epoch() != beforeShardVersion.epoch() ) {
        // It's not useful to use the metadata as a base because we think the epoch will
        // differ
        fullReload = true;
    }

    //
    // Load the metadata from the remote server, start construction
    //

    LOG( 0 ) << "remotely refreshing metadata for " << ns
             << ( useRequestedVersion ?
                      string( " with requested shard version " ) + reqShardVersion.toString() :
                      "" )
             << ( fullReload ?
                      ", current shard version is " :
                      " based on current shard version " ) << beforeShardVersion
             << ", current metadata version is " << beforeCollVersion << endl;

    string errMsg;
    ConnectionString configServerLoc = ConnectionString::parse( _configServer, errMsg );
    MetadataLoader mdLoader( configServerLoc );
    CollectionMetadata* remoteMetadataRaw = new CollectionMetadata();
    CollectionMetadataPtr remoteMetadata( remoteMetadataRaw );

    Timer refreshTimer;
    Status status = mdLoader.makeCollectionMetadata( ns,
                                                     shardName,
                                                     ( fullReload ? NULL : beforeMetadata.get() ),
                                                     remoteMetadataRaw );
    long long refreshMillis = refreshTimer.millis();

    if ( status.code() == ErrorCodes::NamespaceNotFound ) {
        remoteMetadata.reset();
        remoteMetadataRaw = NULL;
    }
    else if ( !status.isOK() ) {

        warning() << "could not remotely refresh metadata for " << ns
                  << causedBy( status.reason() ) << endl;

        return status;
    }

    ChunkVersion remoteShardVersion;
    ChunkVersion remoteCollVersion;
    if ( remoteMetadata ) {
        remoteShardVersion = remoteMetadata->getShardVersion();
        remoteCollVersion = remoteMetadata->getCollVersion();
    }

    //
    // Get ready to install loaded metadata if needed
    //

    CollectionMetadataPtr afterMetadata;
    ChunkVersion afterShardVersion;
    ChunkVersion afterCollVersion;
    ChunkVersion::VersionChoice choice;

    // If we choose to install the new metadata, this describes the kind of install
    enum InstallType {
        InstallType_New, InstallType_Update, InstallType_Replace, InstallType_Drop,
        InstallType_None
    } installType = InstallType_None; // compiler complains otherwise

    {
        // DBLock needed since we're now potentially changing the metadata, and don't want
        // reads/writes to be ongoing.
        Lock::DBWrite writeLk( ns );

        //
        // Get the metadata now that the load has completed
        //

        scoped_lock lk( _mutex );

        CollectionMetadataMap::iterator it = _collMetadata.find( ns );
        if ( it != _collMetadata.end() ) afterMetadata = it->second;

        if ( afterMetadata ) {
            afterShardVersion = afterMetadata->getShardVersion();
            afterCollVersion = afterMetadata->getCollVersion();
        }

        *latestShardVersion = afterShardVersion;

        //
        // Resolve newer pending chunks with the remote metadata, finish construction
        //

        status = mdLoader.promotePendingChunks( afterMetadata.get(), remoteMetadataRaw );

        if ( !status.isOK() ) {

            warning() << "remote metadata for " << ns
                      << " is inconsistent with current pending chunks"
                      << causedBy( status.reason() ) << endl;

            return status;
        }

        //
        // Compare the 'before', 'after', and 'remote' versions/epochs and choose newest.
        // Zero-epochs (sentinel value for "dropped" collections) are tested by
        // !epoch.isSet().
        //

        choice = ChunkVersion::chooseNewestVersion( beforeCollVersion,
                                                    afterCollVersion,
                                                    remoteCollVersion );

        if ( choice == ChunkVersion::VersionChoice_Remote ) {
            dassert( !remoteCollVersion.epoch().isSet()
                     || remoteShardVersion >= beforeShardVersion );

            if ( !afterCollVersion.epoch().isSet() ) {

                // First metadata load
                installType = InstallType_New;
                dassert( it == _collMetadata.end() );
                _collMetadata.insert( make_pair( ns, remoteMetadata ) );
            }
            else if ( remoteCollVersion.epoch().isSet()
                      && remoteCollVersion.epoch() == afterCollVersion.epoch() ) {

                // Update to existing metadata
                installType = InstallType_Update;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert( it != _collMetadata.end() );
                it->second = remoteMetadata;
            }
            else if ( remoteCollVersion.epoch().isSet() ) {

                // New epoch detected, replacing metadata
                installType = InstallType_Replace;

                // Invariant: If CollMetadata was not found, version should have been 0.
                dassert( it != _collMetadata.end() );
                it->second = remoteMetadata;
            }
            else {
                dassert( !remoteCollVersion.epoch().isSet() );

                // Drop detected
                installType = InstallType_Drop;
                _collMetadata.erase( it );
            }

            *latestShardVersion = remoteShardVersion;
        }
    } // End _mutex
      // End DBWrite

    //
    // Do messaging based on what happened above
    //

    string versionMsg = str::stream()
        << " (loaded metadata version : " << remoteCollVersion.toString()
        << ( beforeCollVersion.epoch() == afterCollVersion.epoch() ?
                 string( ", stored version : " ) + afterCollVersion.toString() :
                 string( ", stored versions : " ) + beforeCollVersion.toString()
                     + " / " + afterCollVersion.toString() )
        << ", took " << refreshMillis << "ms)";

    if ( choice == ChunkVersion::VersionChoice_Unknown ) {

        string errMsg = str::stream()
            << "need to retry loading metadata for " << ns
            << ", collection may have been dropped or recreated during load"
            << versionMsg;

        warning() << errMsg << endl;
        return Status( ErrorCodes::RemoteChangeDetected, errMsg );
    }

    if ( choice == ChunkVersion::VersionChoice_Local ) {

        LOG( 0 ) << "newer metadata not found for " << ns << versionMsg << endl;
        return Status::OK();
    }

    dassert( choice == ChunkVersion::VersionChoice_Remote );

    switch( installType ) {
    case InstallType_New:
        LOG( 0 ) << "loaded new metadata for " << ns << versionMsg << endl;
        break;
    case InstallType_Update:
        LOG( 0 ) << "loaded newer metadata for " << ns << versionMsg << endl;
        break;
    case InstallType_Replace:
        LOG( 0 ) << "replacing metadata for " << ns << versionMsg << endl;
        break;
    case InstallType_Drop:
        LOG( 0 ) << "dropping metadata for " << ns << versionMsg << endl;
        break;
    default:
        // Unreachable: choosing the remote version always sets an install type
        verify( false );
        break;
    }

    return Status::OK();
}
Status ShardingState::refreshMetadataIfNeeded( const string& ns,
                                               const ChunkVersion& reqShardVersion,
                                               ChunkVersion* latestShardVersion ) {

    // The _configServerTickets serializes this process such that only a small number of
    // threads can try to refresh at the same time.

    LOG( 2 ) << "metadata refresh requested for " << ns << " at shard version "
             << reqShardVersion << endl;

    //
    // Queuing of refresh requests starts here when remote reload is needed. This may take
    // time.
    // TODO: Explicitly expose the queuing discipline.
    //

    _configServerTickets.waitForTicket();
    TicketHolderReleaser needTicketFrom( &_configServerTickets );

    //
    // Fast path - check if the requested version is at a higher version than the current
    // metadata version or a different epoch before verifying against config server.
    //

    CollectionMetadataPtr storedMetadata;
    {
        scoped_lock lk( _mutex );
        CollectionMetadataMap::iterator it = _collMetadata.find( ns );
        if ( it != _collMetadata.end() ) storedMetadata = it->second;
    }

    ChunkVersion storedShardVersion;
    if ( storedMetadata ) storedShardVersion = storedMetadata->getShardVersion();
    *latestShardVersion = storedShardVersion;

    if ( storedShardVersion >= reqShardVersion
         && storedShardVersion.epoch() == reqShardVersion.epoch() ) {

        // Don't need to remotely reload if we're in the same epoch with a >= version
        return Status::OK();
    }

    //
    // Slow path - remotely reload
    //
    // Cases:
    // A) Initial config load and/or secondary take-over.
    // B) Migration TO this shard finished, notified by mongos.
    // C) Dropping a collection, notified (currently) by mongos.
    // D) Stale client wants to reload metadata with a different *epoch*, so we aren't sure.

    if ( storedShardVersion.epoch() != reqShardVersion.epoch() ) {
        // Need to remotely reload if our epochs aren't the same, to verify
        LOG( 1 ) << "metadata change requested for " << ns << ", from shard version "
                 << storedShardVersion << " to " << reqShardVersion
                 << ", need to verify with config server" << endl;
    }
    else {
        // Need to remotely reload since our epochs are the same but the requested version
        // is greater than the stored version
        LOG( 1 ) << "metadata version update requested for " << ns
                 << ", from shard version " << storedShardVersion << " to "
                 << reqShardVersion << ", need to verify with config server" << endl;
    }

    return doRefreshMetadata( ns, reqShardVersion, true, latestShardVersion );
}
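// Toy illustration (values assumed) of the fast-path test above. A request in
// the same epoch at a lower-or-equal version is already covered by the stored
// metadata, so no remote reload is needed; a differing epoch always forces one.
//
//     OID sharedEpoch = OID::gen();
//     ChunkVersion stored(3, 4, sharedEpoch);
//     ChunkVersion requested(3, 0, sharedEpoch);
//     bool needsRemoteReload = !(stored >= requested &&
//                                stored.epoch() == requested.epoch()); // false here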
bool WriteBatchExecutor::doWrite( const string& ns,
                                  const BatchItemRef& itemRef,
                                  CurOp* currentOp,
                                  WriteStats* stats,
                                  BSONObj* upsertedID,
                                  BatchedErrorDetail* error ) {

    const BatchedCommandRequest& request = *itemRef.getRequest();
    int index = itemRef.getItemIndex();

    //
    // Check our shard version if we need to (must be in the write lock)
    //

    if ( shardingState.enabled() && request.isShardVersionSet()
         && !ChunkVersion::isIgnoredVersion( request.getShardVersion() ) ) {

        Lock::assertWriteLocked( ns );
        CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( ns );
        ChunkVersion shardVersion =
            metadata ? metadata->getShardVersion() : ChunkVersion::UNSHARDED();

        if ( !request.getShardVersion().isWriteCompatibleWith( shardVersion ) ) {

            // Write stale error to results
            error->setErrCode( ErrorCodes::StaleShardVersion );

            BSONObjBuilder infoB;
            shardVersion.addToBSON( infoB, "vWanted" );
            error->setErrInfo( infoB.obj() );

            string errMsg = mongoutils::str::stream()
                << "stale shard version detected before write, received "
                << request.getShardVersion().toString() << " but local version is "
                << shardVersion.toString();
            error->setErrMessage( errMsg );

            return false;
        }
    }

    //
    // Not stale, do the actual write
    //

    if ( request.getBatchType() == BatchedCommandRequest::BatchType_Insert ) {
        // Insert
        return doInsert( ns,
                         request.getInsertRequest()->getDocumentsAt( index ),
                         currentOp,
                         stats,
                         error );
    }
    else if ( request.getBatchType() == BatchedCommandRequest::BatchType_Update ) {
        // Update
        return doUpdate( ns,
                         *request.getUpdateRequest()->getUpdatesAt( index ),
                         currentOp,
                         stats,
                         upsertedID,
                         error );
    }
    else {
        dassert( request.getBatchType() == BatchedCommandRequest::BatchType_Delete );
        // Delete
        return doDelete( ns,
                         *request.getDeleteRequest()->getDeletesAt( index ),
                         currentOp,
                         stats,
                         error );
    }
}
bool mergeChunks(OperationContext* txn,
                 const NamespaceString& nss,
                 const BSONObj& minKey,
                 const BSONObj& maxKey,
                 const OID& epoch,
                 string* errMsg) {
    //
    // Get sharding state up-to-date
    //
    ConnectionString configLoc = ConnectionString::parse(shardingState.getConfigServer(),
                                                         *errMsg);
    if (!configLoc.isValid()) {
        warning() << *errMsg << endl;
        return false;
    }

    //
    // Get the distributed lock
    //
    string whyMessage = stream() << "merging chunks in " << nss.ns() << " from " << minKey
                                 << " to " << maxKey;
    auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(nss.ns(),
                                                                            whyMessage);

    if (!scopedDistLock.isOK()) {
        *errMsg = stream() << "could not acquire collection lock for " << nss.ns()
                           << " to merge chunks in [" << minKey << "," << maxKey << ")"
                           << causedBy(scopedDistLock.getStatus());

        warning() << *errMsg << endl;
        return false;
    }

    //
    // We now have the collection lock, refresh metadata to latest version and sanity check
    //
    ChunkVersion shardVersion;
    Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion);

    if (!status.isOK()) {
        *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for "
                                << nss.ns() << causedBy(status.reason());

        warning() << *errMsg << endl;
        return false;
    }

    if (epoch.isSet() && shardVersion.epoch() != epoch) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " has changed since merge was sent (sent epoch : "
                           << epoch.toString()
                           << ", current epoch : " << shardVersion.epoch().toString() << ")";

        warning() << *errMsg << endl;
        return false;
    }

    CollectionMetadataPtr metadata = shardingState.getCollectionMetadata(nss.ns());

    if (!metadata || metadata->getKeyPattern().isEmpty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " is not sharded";

        warning() << *errMsg << endl;
        return false;
    }

    dassert(metadata->getShardVersion().equals(shardVersion));

    if (!metadata->isValidKey(minKey) || !metadata->isValidKey(maxKey)) {
        *errMsg = stream() << "could not merge chunks, the range "
                           << rangeToString(minKey, maxKey) << " is not valid"
                           << " for collection " << nss.ns() << " with key pattern "
                           << metadata->getKeyPattern();

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Get merged chunk information
    //
    ChunkVersion mergeVersion = metadata->getCollVersion();
    mergeVersion.incMinor();

    std::vector<ChunkType> chunksToMerge;

    ChunkType itChunk;
    itChunk.setMin(minKey);
    itChunk.setMax(minKey);
    itChunk.setNS(nss.ns());
    itChunk.setShard(shardingState.getShardName());

    while (itChunk.getMax().woCompare(maxKey) < 0 &&
           metadata->getNextChunk(itChunk.getMax(), &itChunk)) {
        chunksToMerge.push_back(itChunk);
    }

    if (chunksToMerge.empty()) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " and ending at " << maxKey
                           << " does not belong to shard " << shardingState.getShardName();

        warning() << *errMsg << endl;
        return false;
    }

    //
    // Validate the range starts and ends at chunks and has no holes, error if not valid
    //

    BSONObj firstDocMin = chunksToMerge.front().getMin();
    BSONObj firstDocMax = chunksToMerge.front().getMax();
    // minKey is inclusive
    bool minKeyInRange = rangeContains(firstDocMin, firstDocMax, minKey);

    if (!minKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range starting at " << minKey << " does not belong to shard "
                           << shardingState.getShardName();

        warning() << *errMsg << endl;
        return false;
    }

    BSONObj lastDocMin = chunksToMerge.back().getMin();
    BSONObj lastDocMax = chunksToMerge.back().getMax();
    // maxKey is exclusive
    bool maxKeyInRange = lastDocMin.woCompare(maxKey) < 0 && lastDocMax.woCompare(maxKey) >= 0;

    if (!maxKeyInRange) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " range ending at " << maxKey << " does not belong to shard "
                           << shardingState.getShardName();

        warning() << *errMsg << endl;
        return false;
    }

    bool validRangeStartKey = firstDocMin.woCompare(minKey) == 0;
    bool validRangeEndKey = lastDocMax.woCompare(maxKey) == 0;

    if (!validRangeStartKey || !validRangeEndKey) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " does not contain a chunk "
                           << (!validRangeStartKey ? "starting at " + minKey.toString() : "")
                           << (!validRangeStartKey && !validRangeEndKey ? " or " : "")
                           << (!validRangeEndKey ? "ending at " + maxKey.toString() : "");

        warning() << *errMsg << endl;
        return false;
    }

    if (chunksToMerge.size() == 1) {
        *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                           << " already contains chunk for " << rangeToString(minKey, maxKey);

        warning() << *errMsg << endl;
        return false;
    }

    // Look for hole in range
    for (size_t i = 1; i < chunksToMerge.size(); ++i) {
        if (chunksToMerge[i - 1].getMax().woCompare(chunksToMerge[i].getMin()) != 0) {
            *errMsg = stream() << "could not merge chunks, collection " << nss.ns()
                               << " has a hole in the range " << rangeToString(minKey, maxKey)
                               << " at " << rangeToString(chunksToMerge[i - 1].getMax(),
                                                          chunksToMerge[i].getMin());

            warning() << *errMsg << endl;
            return false;
        }
    }

    //
    // Run apply ops command
    //
    Status applyOpsStatus = runApplyOpsCmd(chunksToMerge, shardVersion, mergeVersion);
    if (!applyOpsStatus.isOK()) {
        warning() << applyOpsStatus;
        return false;
    }

    //
    // Install merged chunk metadata
    //
    {
        ScopedTransaction transaction(txn, MODE_IX);
        Lock::DBLock writeLk(txn->lockState(), nss.db(), MODE_IX);
        Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X);
        shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion);
    }

    //
    // Log change
    //
    BSONObj mergeLogEntry = buildMergeLogEntry(chunksToMerge, shardVersion, mergeVersion);

    grid.catalogManager()->logChange(
        txn->getClient()->clientAddress(true), "merge", nss.ns(), mergeLogEntry);

    return true;
}
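// The metadata install above follows the newer lock hierarchy: global intent
// (ScopedTransaction, MODE_IX) -> database intent (DBLock, MODE_IX) ->
// collection exclusive (CollectionLock, MODE_X). A minimal sketch of the same
// pattern for any per-collection metadata mutation (the mutation body is a
// placeholder):
//
//     {
//         ScopedTransaction transaction(txn, MODE_IX);
//         Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IX);
//         Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X);
//         // ... mutate sharding metadata for nss here ...
//     }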