void ShardingState::donateChunk( const string& ns , const BSONObj& min , const BSONObj& max , ChunkVersion version ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); verify( it != _collMetadata.end() ) ; CollectionMetadataPtr p = it->second; // empty shards should have version 0 version = ( p->getNumChunks() > 1 ) ? version : ChunkVersion( 0, 0, p->getCollVersion().epoch() ); ChunkType chunk; chunk.setMin( min ); chunk.setMax( max ); string errMsg; CollectionMetadataPtr cloned( p->cloneMigrate( chunk, version, &errMsg ) ); // uassert to match old behavior, TODO: report errors w/o throwing uassert( 16855, errMsg, NULL != cloned.get() ); // TODO: a bit dangerous to have two different zero-version states - no-metadata and // no-version _collMetadata[ns] = cloned; }
bool ShardingState::forgetPending( const string& ns, const BSONObj& min, const BSONObj& max, const OID& epoch, string* errMsg ) { scoped_lock lk( _mutex ); CollectionMetadataMap::const_iterator it = _collMetadata.find( ns ); if ( it == _collMetadata.end() ) { *errMsg = str::stream() << "no need to forget pending chunk " << "[" << min << "," << max << ")" << " because the local metadata for " << ns << " has changed"; return false; } CollectionMetadataPtr metadata = it->second; // This can currently happen because drops aren't synchronized with in-migrations // The idea for checking this here is that in the future we shouldn't have this problem if ( metadata->getCollVersion().epoch() != epoch ) { *errMsg = str::stream() << "no need to forget pending chunk " << "[" << min << "," << max << ")" << " because the epoch for " << ns << " has changed from " << epoch << " to " << metadata->getCollVersion().epoch(); return false; } ChunkType chunk; chunk.setMin( min ); chunk.setMax( max ); CollectionMetadataPtr cloned( metadata->cloneMinusPending( chunk, errMsg ) ); if ( !cloned ) return false; _collMetadata[ns] = cloned; return true; }
bool mergeChunks( OperationContext* txn, const NamespaceString& nss, const BSONObj& minKey, const BSONObj& maxKey, const OID& epoch, string* errMsg ) { // // Get sharding state up-to-date // ConnectionString configLoc = ConnectionString::parse( shardingState.getConfigServer(), *errMsg ); if ( !configLoc.isValid() ){ warning() << *errMsg << endl; return false; } // // Get the distributed lock // ScopedDistributedLock collLock( configLoc, nss.ns() ); collLock.setLockMessage( stream() << "merging chunks in " << nss.ns() << " from " << minKey << " to " << maxKey ); Status acquisitionStatus = collLock.tryAcquire(); if (!acquisitionStatus.isOK()) { *errMsg = stream() << "could not acquire collection lock for " << nss.ns() << " to merge chunks in [" << minKey << "," << maxKey << ")" << causedBy(acquisitionStatus); warning() << *errMsg << endl; return false; } // // We now have the collection lock, refresh metadata to latest version and sanity check // ChunkVersion shardVersion; Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion); if ( !status.isOK() ) { *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for " << nss.ns() << causedBy( status.reason() ); warning() << *errMsg << endl; return false; } if ( epoch.isSet() && shardVersion.epoch() != epoch ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed" << " since merge was sent" << "(sent epoch : " << epoch.toString() << ", current epoch : " << shardVersion.epoch().toString() << ")"; warning() << *errMsg << endl; return false; } CollectionMetadataPtr metadata = shardingState.getCollectionMetadata( nss.ns() ); if ( !metadata || metadata->getKeyPattern().isEmpty() ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " is not sharded"; warning() << *errMsg << endl; return false; } dassert( metadata->getShardVersion().equals( shardVersion ) ); if ( !metadata->isValidKey( minKey ) || !metadata->isValidKey( maxKey ) ) { *errMsg = stream() << "could not merge chunks, the range " << rangeToString( minKey, maxKey ) << " is not valid" << " for collection " << nss.ns() << " with key pattern " << metadata->getKeyPattern(); warning() << *errMsg << endl; return false; } // // Get merged chunk information // ChunkVersion mergeVersion = metadata->getCollVersion(); mergeVersion.incMinor(); OwnedPointerVector<ChunkType> chunksToMerge; ChunkType itChunk; itChunk.setMin( minKey ); itChunk.setMax( minKey ); itChunk.setNS( nss.ns() ); itChunk.setShard( shardingState.getShardName() ); while ( itChunk.getMax().woCompare( maxKey ) < 0 && metadata->getNextChunk( itChunk.getMax(), &itChunk ) ) { auto_ptr<ChunkType> saved( new ChunkType ); itChunk.cloneTo( saved.get() ); chunksToMerge.mutableVector().push_back( saved.release() ); } if ( chunksToMerge.empty() ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " and ending at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } // // Validate the range starts and ends at chunks and has no holes, error if not valid // BSONObj firstDocMin = ( *chunksToMerge.begin() )->getMin(); BSONObj firstDocMax = ( *chunksToMerge.begin() )->getMax(); // minKey is inclusive bool minKeyInRange = rangeContains( firstDocMin, firstDocMax, minKey ); if ( !minKeyInRange ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } BSONObj lastDocMin = ( *chunksToMerge.rbegin() )->getMin(); BSONObj lastDocMax = ( *chunksToMerge.rbegin() )->getMax(); // maxKey is exclusive bool maxKeyInRange = lastDocMin.woCompare( maxKey ) < 0 && lastDocMax.woCompare( maxKey ) >= 0; if ( !maxKeyInRange ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range ending at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } bool validRangeStartKey = firstDocMin.woCompare( minKey ) == 0; bool validRangeEndKey = lastDocMax.woCompare( maxKey ) == 0; if ( !validRangeStartKey || !validRangeEndKey ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " does not contain a chunk " << ( !validRangeStartKey ? "starting at " + minKey.toString() : "" ) << ( !validRangeStartKey && !validRangeEndKey ? " or " : "" ) << ( !validRangeEndKey ? "ending at " + maxKey.toString() : "" ); warning() << *errMsg << endl; return false; } if ( chunksToMerge.size() == 1 ) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " already contains chunk for " << rangeToString( minKey, maxKey ); warning() << *errMsg << endl; return false; } bool holeInRange = false; // Look for hole in range ChunkType* prevChunk = *chunksToMerge.begin(); ChunkType* nextChunk = NULL; for ( OwnedPointerVector<ChunkType>::const_iterator it = chunksToMerge.begin(); it != chunksToMerge.end(); ++it ) { if ( it == chunksToMerge.begin() ) continue; nextChunk = *it; if ( prevChunk->getMax().woCompare( nextChunk->getMin() ) != 0 ) { holeInRange = true; break; } prevChunk = nextChunk; } if ( holeInRange ) { dassert( NULL != nextChunk ); *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has a hole in the range " << rangeToString( minKey, maxKey ) << " at " << rangeToString( prevChunk->getMax(), nextChunk->getMin() ); warning() << *errMsg << endl; return false; } // // Run apply ops command // BSONObj applyOpsCmd = buildApplyOpsCmd( chunksToMerge, shardVersion, mergeVersion ); bool ok; BSONObj result; try { ScopedDbConnection conn( configLoc, 30.0 ); ok = conn->runCommand( "config", applyOpsCmd, result ); if ( !ok ) *errMsg = result.toString(); conn.done(); } catch( const DBException& ex ) { ok = false; *errMsg = ex.toString(); } if ( !ok ) { *errMsg = stream() << "could not merge chunks for " << nss.ns() << ", writing to config failed" << causedBy( errMsg ); warning() << *errMsg << endl; return false; } // // Install merged chunk metadata // { Lock::DBLock writeLk(txn->lockState(), nss.db(), newlm::MODE_X); shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion); } // // Log change // BSONObj mergeLogEntry = buildMergeLogEntry( chunksToMerge, shardVersion, mergeVersion ); configServer.logChange( "merge", nss.ns(), mergeLogEntry ); return true; }
Status ShardingState::doRefreshMetadata( const string& ns, const ChunkVersion& reqShardVersion, bool useRequestedVersion, ChunkVersion* latestShardVersion ) { // The idea here is that we're going to reload the metadata from the config server, but // we need to do so outside any locks. When we get our result back, if the current metadata // has changed, we may not be able to install the new metadata. // // Get the initial metadata // No DBLock is needed since the metadata is expected to change during reload. // CollectionMetadataPtr beforeMetadata; string shardName; { scoped_lock lk( _mutex ); CollectionMetadataMap::iterator it = _collMetadata.find( ns ); if ( it != _collMetadata.end() ) beforeMetadata = it->second; shardName = _shardName; } ChunkVersion beforeShardVersion; ChunkVersion beforeCollVersion; if ( beforeMetadata ) { beforeShardVersion = beforeMetadata->getShardVersion(); beforeCollVersion = beforeMetadata->getCollVersion(); } *latestShardVersion = beforeShardVersion; // We can't reload without a shard name. Must check here before loading, since shard name // may have changed if we checked it earlier and released the _mutex. if ( shardName.empty() ) { string errMsg = str::stream() << "cannot refresh metadata for " << ns << " before shard name has been set"; LOG( 0 ) << errMsg << endl; return Status( ErrorCodes::IllegalOperation, errMsg ); } // // Determine whether we need to diff or fully reload // bool fullReload = false; if ( !beforeMetadata ) { // We don't have any metadata to reload from fullReload = true; } else if ( useRequestedVersion && reqShardVersion.epoch() != beforeShardVersion.epoch() ) { // It's not useful to use the metadata as a base because we think the epoch will differ fullReload = true; } // // Load the metadata from the remote server, start construction // LOG( 0 ) << "remotely refreshing metadata for " << ns << ( useRequestedVersion ? string( " with requested shard version " ) + reqShardVersion.toString() : "" ) << ( fullReload ? ", current shard version is " : " based on current shard version " ) << beforeShardVersion << ", current metadata version is " << beforeCollVersion << endl; string errMsg; ConnectionString configServerLoc = ConnectionString::parse( _configServer, errMsg ); MetadataLoader mdLoader( configServerLoc ); CollectionMetadata* remoteMetadataRaw = new CollectionMetadata(); CollectionMetadataPtr remoteMetadata( remoteMetadataRaw ); Timer refreshTimer; Status status = mdLoader.makeCollectionMetadata( ns, shardName, ( fullReload ? NULL : beforeMetadata.get() ), remoteMetadataRaw ); long long refreshMillis = refreshTimer.millis(); if ( status.code() == ErrorCodes::NamespaceNotFound ) { remoteMetadata.reset(); remoteMetadataRaw = NULL; } else if ( !status.isOK() ) { warning() << "could not remotely refresh metadata for " << ns << causedBy( status.reason() ) << endl; return status; } ChunkVersion remoteShardVersion; ChunkVersion remoteCollVersion; if ( remoteMetadata ) { remoteShardVersion = remoteMetadata->getShardVersion(); remoteCollVersion = remoteMetadata->getCollVersion(); } // // Get ready to install loaded metadata if needed // CollectionMetadataPtr afterMetadata; ChunkVersion afterShardVersion; ChunkVersion afterCollVersion; ChunkVersion::VersionChoice choice; // If we choose to install the new metadata, this describes the kind of install enum InstallType { InstallType_New, InstallType_Update, InstallType_Replace, InstallType_Drop, InstallType_None } installType = InstallType_None; // compiler complains otherwise { // DBLock needed since we're now potentially changing the metadata, and don't want // reads/writes to be ongoing. Lock::DBWrite writeLk( ns ); // // Get the metadata now that the load has completed // scoped_lock lk( _mutex ); CollectionMetadataMap::iterator it = _collMetadata.find( ns ); if ( it != _collMetadata.end() ) afterMetadata = it->second; if ( afterMetadata ) { afterShardVersion = afterMetadata->getShardVersion(); afterCollVersion = afterMetadata->getCollVersion(); } *latestShardVersion = afterShardVersion; // // Resolve newer pending chunks with the remote metadata, finish construction // status = mdLoader.promotePendingChunks( afterMetadata.get(), remoteMetadataRaw ); if ( !status.isOK() ) { warning() << "remote metadata for " << ns << " is inconsistent with current pending chunks" << causedBy( status.reason() ) << endl; return status; } // // Compare the 'before', 'after', and 'remote' versions/epochs and choose newest // Zero-epochs (sentinel value for "dropped" collections), are tested by // !epoch.isSet(). // choice = ChunkVersion::chooseNewestVersion( beforeCollVersion, afterCollVersion, remoteCollVersion ); if ( choice == ChunkVersion::VersionChoice_Remote ) { dassert(!remoteCollVersion.epoch().isSet() || remoteShardVersion >= beforeShardVersion); if ( !afterCollVersion.epoch().isSet() ) { // First metadata load installType = InstallType_New; dassert( it == _collMetadata.end() ); _collMetadata.insert( make_pair( ns, remoteMetadata ) ); } else if ( remoteCollVersion.epoch().isSet() && remoteCollVersion.epoch() == afterCollVersion.epoch() ) { // Update to existing metadata installType = InstallType_Update; // Invariant: If CollMetadata was not found, version should be have been 0. dassert( it != _collMetadata.end() ); it->second = remoteMetadata; } else if ( remoteCollVersion.epoch().isSet() ) { // New epoch detected, replacing metadata installType = InstallType_Replace; // Invariant: If CollMetadata was not found, version should be have been 0. dassert( it != _collMetadata.end() ); it->second = remoteMetadata; } else { dassert( !remoteCollVersion.epoch().isSet() ); // Drop detected installType = InstallType_Drop; _collMetadata.erase( it ); } *latestShardVersion = remoteShardVersion; } } // End _mutex // End DBWrite // // Do messaging based on what happened above // string versionMsg = str::stream() << " (loaded metadata version : " << remoteCollVersion.toString() << ( beforeCollVersion.epoch() == afterCollVersion.epoch() ? string( ", stored version : " ) + afterCollVersion.toString() : string( ", stored versions : " ) + beforeCollVersion.toString() + " / " + afterCollVersion.toString() ) << ", took " << refreshMillis << "ms)"; if ( choice == ChunkVersion::VersionChoice_Unknown ) { string errMsg = str::stream() << "need to retry loading metadata for " << ns << ", collection may have been dropped or recreated during load" << versionMsg; warning() << errMsg << endl; return Status( ErrorCodes::RemoteChangeDetected, errMsg ); } if ( choice == ChunkVersion::VersionChoice_Local ) { LOG( 0 ) << "newer metadata not found for " << ns << versionMsg << endl; return Status::OK(); } dassert( choice == ChunkVersion::VersionChoice_Remote ); switch( installType ) { case InstallType_New: LOG( 0 ) << "loaded new metadata for " << ns << versionMsg << endl; break; case InstallType_Update: LOG( 0 ) << "loaded newer metadata for " << ns << versionMsg << endl; break; case InstallType_Replace: LOG( 0 ) << "replacing metadata for " << ns << versionMsg << endl; break; case InstallType_Drop: LOG( 0 ) << "dropping metadata for " << ns << versionMsg << endl; break; default: verify( false ); break; } return Status::OK(); }
bool mergeChunks(OperationContext* txn, const NamespaceString& nss, const BSONObj& minKey, const BSONObj& maxKey, const OID& epoch, string* errMsg) { // // Get sharding state up-to-date // ConnectionString configLoc = ConnectionString::parse(shardingState.getConfigServer(), *errMsg); if (!configLoc.isValid()) { warning() << *errMsg << endl; return false; } // // Get the distributed lock // string whyMessage = stream() << "merging chunks in " << nss.ns() << " from " << minKey << " to " << maxKey; auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(nss.ns(), whyMessage); if (!scopedDistLock.isOK()) { *errMsg = stream() << "could not acquire collection lock for " << nss.ns() << " to merge chunks in [" << minKey << "," << maxKey << ")" << causedBy(scopedDistLock.getStatus()); warning() << *errMsg << endl; return false; } // // We now have the collection lock, refresh metadata to latest version and sanity check // ChunkVersion shardVersion; Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion); if (!status.isOK()) { *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for " << nss.ns() << causedBy(status.reason()); warning() << *errMsg << endl; return false; } if (epoch.isSet() && shardVersion.epoch() != epoch) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed" << " since merge was sent" << "(sent epoch : " << epoch.toString() << ", current epoch : " << shardVersion.epoch().toString() << ")"; warning() << *errMsg << endl; return false; } CollectionMetadataPtr metadata = shardingState.getCollectionMetadata(nss.ns()); if (!metadata || metadata->getKeyPattern().isEmpty()) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " is not sharded"; warning() << *errMsg << endl; return false; } dassert(metadata->getShardVersion().equals(shardVersion)); if (!metadata->isValidKey(minKey) || !metadata->isValidKey(maxKey)) { *errMsg = stream() << "could not merge chunks, the range " << rangeToString(minKey, maxKey) << " is not valid" << " for collection " << nss.ns() << " with key pattern " << metadata->getKeyPattern(); warning() << *errMsg << endl; return false; } // // Get merged chunk information // ChunkVersion mergeVersion = metadata->getCollVersion(); mergeVersion.incMinor(); std::vector<ChunkType> chunksToMerge; ChunkType itChunk; itChunk.setMin(minKey); itChunk.setMax(minKey); itChunk.setNS(nss.ns()); itChunk.setShard(shardingState.getShardName()); while (itChunk.getMax().woCompare(maxKey) < 0 && metadata->getNextChunk(itChunk.getMax(), &itChunk)) { chunksToMerge.push_back(itChunk); } if (chunksToMerge.empty()) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " and ending at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } // // Validate the range starts and ends at chunks and has no holes, error if not valid // BSONObj firstDocMin = chunksToMerge.front().getMin(); BSONObj firstDocMax = chunksToMerge.front().getMax(); // minKey is inclusive bool minKeyInRange = rangeContains(firstDocMin, firstDocMax, minKey); if (!minKeyInRange) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } BSONObj lastDocMin = chunksToMerge.back().getMin(); BSONObj lastDocMax = chunksToMerge.back().getMax(); // maxKey is exclusive bool maxKeyInRange = lastDocMin.woCompare(maxKey) < 0 && lastDocMax.woCompare(maxKey) >= 0; if (!maxKeyInRange) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range ending at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } bool validRangeStartKey = firstDocMin.woCompare(minKey) == 0; bool validRangeEndKey = lastDocMax.woCompare(maxKey) == 0; if (!validRangeStartKey || !validRangeEndKey) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " does not contain a chunk " << (!validRangeStartKey ? "starting at " + minKey.toString() : "") << (!validRangeStartKey && !validRangeEndKey ? " or " : "") << (!validRangeEndKey ? "ending at " + maxKey.toString() : ""); warning() << *errMsg << endl; return false; } if (chunksToMerge.size() == 1) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " already contains chunk for " << rangeToString(minKey, maxKey); warning() << *errMsg << endl; return false; } // Look for hole in range for (size_t i = 1; i < chunksToMerge.size(); ++i) { if (chunksToMerge[i - 1].getMax().woCompare(chunksToMerge[i].getMin()) != 0) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has a hole in the range " << rangeToString(minKey, maxKey) << " at " << rangeToString(chunksToMerge[i - 1].getMax(), chunksToMerge[i].getMin()); warning() << *errMsg << endl; return false; } } // // Run apply ops command // Status applyOpsStatus = runApplyOpsCmd(chunksToMerge, shardVersion, mergeVersion); if (!applyOpsStatus.isOK()) { warning() << applyOpsStatus; return false; } // // Install merged chunk metadata // { ScopedTransaction transaction(txn, MODE_IX); Lock::DBLock writeLk(txn->lockState(), nss.db(), MODE_IX); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X); shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion); } // // Log change // BSONObj mergeLogEntry = buildMergeLogEntry(chunksToMerge, shardVersion, mergeVersion); grid.catalogManager()->logChange( txn->getClient()->clientAddress(true), "merge", nss.ns(), mergeLogEntry); return true; }