StatusWith<repl::OpTimeWith<CollectionType>> ShardingCatalogClientImpl::getCollection(
    OperationContext* opCtx, const NamespaceString& nss, repl::ReadConcernLevel readConcernLevel) {
    auto statusFind = _exhaustiveFindOnConfig(opCtx,
                                              kConfigReadSelector,
                                              readConcernLevel,
                                              CollectionType::ConfigNS,
                                              BSON(CollectionType::fullNs(nss.ns())),
                                              BSONObj(),
                                              1);
    if (!statusFind.isOK()) {
        return statusFind.getStatus();
    }

    const auto& retOpTimePair = statusFind.getValue();
    const auto& retVal = retOpTimePair.value;
    if (retVal.empty()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      stream() << "collection " << nss.ns() << " not found");
    }

    invariant(retVal.size() == 1);

    auto parseStatus = CollectionType::fromBSON(retVal.front());
    if (!parseStatus.isOK()) {
        return parseStatus.getStatus();
    }

    auto collType = parseStatus.getValue();
    if (collType.getDropped()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      stream() << "collection " << nss.ns() << " was dropped");
    }

    return repl::OpTimeWith<CollectionType>(collType, retOpTimePair.opTime);
}
StatusWith<repl::OpTimeWith<std::vector<ShardType>>> ShardingCatalogClientImpl::getAllShards(
    OperationContext* opCtx, repl::ReadConcernLevel readConcern) {
    std::vector<ShardType> shards;
    auto findStatus = _exhaustiveFindOnConfig(opCtx,
                                              kConfigReadSelector,
                                              readConcern,
                                              ShardType::ConfigNS,
                                              BSONObj(),     // no query filter
                                              BSONObj(),     // no sort
                                              boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    for (const BSONObj& doc : findStatus.getValue().value) {
        auto shardRes = ShardType::fromBSON(doc);
        if (!shardRes.isOK()) {
            return shardRes.getStatus().withContext(stream()
                                                    << "Failed to parse shard document " << doc);
        }

        Status validateStatus = shardRes.getValue().validate();
        if (!validateStatus.isOK()) {
            return validateStatus.withContext(stream()
                                              << "Failed to validate shard document " << doc);
        }

        shards.push_back(shardRes.getValue());
    }

    return repl::OpTimeWith<std::vector<ShardType>>{std::move(shards),
                                                    findStatus.getValue().opTime};
}
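// --- Illustrative sketch (not part of the original source): a hypothetical caller of
// getAllShards() above. It only relies on names visible in this file plus an assumed
// `catalogClient` pointer and `opCtx`; the error handling mirrors the StatusWith pattern
// used throughout.
Status logKnownShards(ShardingCatalogClientImpl* catalogClient, OperationContext* opCtx) {
    auto swShards =
        catalogClient->getAllShards(opCtx, repl::ReadConcernLevel::kMajorityReadConcern);
    if (!swShards.isOK()) {
        return swShards.getStatus();
    }

    // OpTimeWith<T> carries the shard list in `value` and the config opTime alongside it.
    for (const ShardType& shard : swShards.getValue().value) {
        log() << "known shard: " << shard.getName();
    }

    return Status::OK();
}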
void MigrationManager::_schedule_inlock(OperationContext* txn, const HostAndPort& targetHost, Migration migration) { executor::TaskExecutor* const executor = Grid::get(txn)->getExecutorPool()->getFixedExecutor(); const NamespaceString nss(migration.nss); auto it = _activeMigrations.find(nss); if (it == _activeMigrations.end()) { const std::string whyMessage(stream() << "Migrating chunk(s) in collection " << nss.ns()); // Acquire the collection distributed lock (blocking call) auto statusWithDistLockHandle = Grid::get(txn)->catalogClient(txn)->getDistLockManager()->lockWithSessionID( txn, nss.ns(), whyMessage, _lockSessionID, DistLockManager::kSingleLockAttemptTimeout); if (!statusWithDistLockHandle.isOK()) { migration.completionNotification->set( Status(statusWithDistLockHandle.getStatus().code(), stream() << "Could not acquire collection lock for " << nss.ns() << " to migrate chunks, due to " << statusWithDistLockHandle.getStatus().reason())); return; } it = _activeMigrations.insert(std::make_pair(nss, MigrationsList())).first; } auto migrations = &it->second; // Add ourselves to the list of migrations on this collection migrations->push_front(std::move(migration)); auto itMigration = migrations->begin(); const RemoteCommandRequest remoteRequest( targetHost, NamespaceString::kAdminDb.toString(), itMigration->moveChunkCmdObj, txn); StatusWith<executor::TaskExecutor::CallbackHandle> callbackHandleWithStatus = executor->scheduleRemoteCommand( remoteRequest, [this, itMigration](const executor::TaskExecutor::RemoteCommandCallbackArgs& args) { Client::initThread(getThreadName().c_str()); ON_BLOCK_EXIT([&] { Client::destroy(); }); auto txn = cc().makeOperationContext(); stdx::lock_guard<stdx::mutex> lock(_mutex); _complete_inlock(txn.get(), itMigration, args.response); }); if (callbackHandleWithStatus.isOK()) { itMigration->callbackHandle = std::move(callbackHandleWithStatus.getValue()); return; } _complete_inlock(txn, itMigration, std::move(callbackHandleWithStatus.getStatus())); }
Status ChunkMoveOperationState::initialize(const BSONObj& cmdObj) { // Make sure we're as up-to-date as possible with shard information. This catches the case where // we might have changed a shard's host by removing/adding a shard with the same name. grid.shardRegistry()->reload(_txn); _fromShard = cmdObj["fromShard"].str(); if (_fromShard.empty()) { return {ErrorCodes::InvalidOptions, "need to specify shard to move chunk from"}; } _toShard = cmdObj["toShard"].str(); if (_toShard.empty()) { return {ErrorCodes::InvalidOptions, "need to specify shard to move chunk to"}; } Status epochStatus = bsonExtractOIDField(cmdObj, "epoch", &_collectionEpoch); if (!epochStatus.isOK()) { return epochStatus; } _minKey = cmdObj["min"].Obj(); if (_minKey.isEmpty()) { return {ErrorCodes::InvalidOptions, "need to specify a min"}; } _maxKey = cmdObj["max"].Obj(); if (_maxKey.isEmpty()) { return {ErrorCodes::InvalidOptions, "need to specify a max"}; } { std::shared_ptr<Shard> fromShard = grid.shardRegistry()->getShard(_txn, _fromShard); if (!fromShard) { return {ErrorCodes::ShardNotFound, stream() << "Source shard " << _fromShard << " is missing. This indicates metadata corruption."}; } _fromShardCS = fromShard->getConnString(); } { std::shared_ptr<Shard> toShard = grid.shardRegistry()->getShard(_txn, _toShard); if (!toShard) { return {ErrorCodes::ShardNotFound, stream() << "Destination shard " << _toShard << " is missing. This indicates metadata corruption."}; } _toShardCS = toShard->getConnString(); } return Status::OK(); }
/** * Returns the config version of the cluster pointed at by the connection string. * * @return OK if version found successfully, error status if something bad happened. */ Status getConfigVersion(CatalogManager* catalogManager, VersionType* versionInfo) { try { versionInfo->clear(); ScopedDbConnection conn(grid.shardRegistry()->getConfigServerConnectionString(), 30); unique_ptr<DBClientCursor> cursor(_safeCursor(conn->query("config.version", BSONObj()))); bool hasConfigData = conn->count(ShardType::ConfigNS) || conn->count(DatabaseType::ConfigNS) || conn->count(CollectionType::ConfigNS); if (!cursor->more()) { // Version is 1 if we have data, 0 if we're completely empty if (hasConfigData) { versionInfo->setMinCompatibleVersion(UpgradeHistory_UnreportedVersion); versionInfo->setCurrentVersion(UpgradeHistory_UnreportedVersion); } else { versionInfo->setMinCompatibleVersion(UpgradeHistory_EmptyVersion); versionInfo->setCurrentVersion(UpgradeHistory_EmptyVersion); } conn.done(); return Status::OK(); } BSONObj versionDoc = cursor->next(); auto versionInfoResult = VersionType::fromBSON(versionDoc); if (!versionInfoResult.isOK()) { conn.done(); return Status(ErrorCodes::UnsupportedFormat, stream() << "invalid config version document " << versionDoc << versionInfoResult.getStatus().toString()); } *versionInfo = versionInfoResult.getValue(); if (cursor->more()) { conn.done(); return Status(ErrorCodes::RemoteValidationError, stream() << "should only have 1 document " << "in config.version collection"); } conn.done(); } catch (const DBException& e) { return e.toStatus(); } return Status::OK(); }
StatusWith<repl::OpTimeWith<DatabaseType>> ShardingCatalogClientImpl::getDatabase( OperationContext* opCtx, const std::string& dbName, repl::ReadConcernLevel readConcernLevel) { if (!NamespaceString::validDBName(dbName, NamespaceString::DollarInDbNameBehavior::Allow)) { return {ErrorCodes::InvalidNamespace, stream() << dbName << " is not a valid db name"}; } // The admin database is always hosted on the config server. if (dbName == NamespaceString::kAdminDb) { DatabaseType dbt( dbName, ShardRegistry::kConfigServerShardId, false, databaseVersion::makeFixed()); return repl::OpTimeWith<DatabaseType>(dbt); } // The config database's primary shard is always config, and it is always sharded. if (dbName == NamespaceString::kConfigDb) { DatabaseType dbt( dbName, ShardRegistry::kConfigServerShardId, true, databaseVersion::makeFixed()); return repl::OpTimeWith<DatabaseType>(dbt); } auto result = _fetchDatabaseMetadata(opCtx, dbName, kConfigReadSelector, readConcernLevel); if (result == ErrorCodes::NamespaceNotFound) { // If we failed to find the database metadata on the 'nearest' config server, try again // against the primary, in case the database was recently created. result = _fetchDatabaseMetadata( opCtx, dbName, ReadPreferenceSetting{ReadPreference::PrimaryOnly}, readConcernLevel); if (!result.isOK() && (result != ErrorCodes::NamespaceNotFound)) { return result.getStatus().withContext( str::stream() << "Could not confirm non-existence of database " << dbName); } } return result; }
StatusWith<string> CatalogManagerReplicaSet::getTagForChunk(const std::string& collectionNs,
                                                            const ChunkType& chunk) {
    auto configShard = grid.shardRegistry()->getShard("config");

    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    BSONObj query = BSON(TagsType::ns(collectionNs)
                         << TagsType::min() << BSON("$lte" << chunk.getMin()) << TagsType::max()
                         << BSON("$gte" << chunk.getMax()));
    auto findStatus = grid.shardRegistry()->exhaustiveFind(
        readHostStatus.getValue(), NamespaceString(TagsType::ConfigNS), query, BSONObj(), 1);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docs = findStatus.getValue();
    if (docs.empty()) {
        return string{};
    }

    invariant(docs.size() == 1);
    BSONObj tagsDoc = docs.front();

    const auto tagsResult = TagsType::fromBSON(tagsDoc);
    if (!tagsResult.isOK()) {
        return {ErrorCodes::FailedToParse,
                stream() << "error while parsing " << TagsType::ConfigNS << " document: "
                         << tagsDoc << " : " << tagsResult.getStatus().toString()};
    }

    return tagsResult.getValue().getTag();
}
StatusWith<CollectionType> CatalogManagerReplicaSet::getCollection(const std::string& collNs) {
    auto configShard = grid.shardRegistry()->getShard("config");

    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    auto statusFind =
        grid.shardRegistry()->exhaustiveFind(readHostStatus.getValue(),
                                             NamespaceString(CollectionType::ConfigNS),
                                             BSON(CollectionType::fullNs(collNs)),
                                             1);
    if (!statusFind.isOK()) {
        return statusFind.getStatus();
    }

    const auto& retVal = statusFind.getValue();
    if (retVal.empty()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      stream() << "collection " << collNs << " not found");
    }

    invariant(retVal.size() == 1);

    return CollectionType::fromBSON(retVal.front());
}
Status CatalogManagerReplicaSet::getChunks(const Query& query,
                                           int nToReturn,
                                           vector<ChunkType>* chunks) {
    chunks->clear();

    auto configShard = grid.shardRegistry()->getShard("config");
    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHostStatus.getValue(),
                                                           NamespaceString(ChunkType::ConfigNS),
                                                           query.obj,
                                                           boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    for (const BSONObj& obj : findStatus.getValue()) {
        auto chunkRes = ChunkType::fromBSON(obj);
        if (!chunkRes.isOK()) {
            chunks->clear();
            return {ErrorCodes::FailedToParse,
                    stream() << "Failed to parse chunk with id ("
                             << obj[ChunkType::name()].toString()
                             << "): " << chunkRes.getStatus().toString()};
        }

        chunks->push_back(chunkRes.getValue());
    }

    return Status::OK();
}
Status CatalogManagerReplicaSet::getAllShards(vector<ShardType>* shards) {
    const auto configShard = grid.shardRegistry()->getShard("config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHost.getValue(),
                                                           NamespaceString(ShardType::ConfigNS),
                                                           BSONObj(),     // no query filter
                                                           boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    for (const BSONObj& doc : findStatus.getValue()) {
        auto shardRes = ShardType::fromBSON(doc);
        if (!shardRes.isOK()) {
            shards->clear();
            return {ErrorCodes::FailedToParse,
                    stream() << "Failed to parse shard with id ("
                             << doc[ShardType::name()].toString()
                             << "): " << shardRes.getStatus().toString()};
        }

        shards->push_back(shardRes.getValue());
    }

    return Status::OK();
}
StatusWith<repl::OpTimeWith<DatabaseType>> ShardingCatalogClientImpl::_fetchDatabaseMetadata(
    OperationContext* opCtx,
    const std::string& dbName,
    const ReadPreferenceSetting& readPref,
    repl::ReadConcernLevel readConcernLevel) {
    invariant(dbName != NamespaceString::kAdminDb && dbName != NamespaceString::kConfigDb);

    auto findStatus = _exhaustiveFindOnConfig(opCtx,
                                              readPref,
                                              readConcernLevel,
                                              DatabaseType::ConfigNS,
                                              BSON(DatabaseType::name(dbName)),
                                              BSONObj(),
                                              boost::none);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docsWithOpTime = findStatus.getValue();
    if (docsWithOpTime.value.empty()) {
        return {ErrorCodes::NamespaceNotFound, stream() << "database " << dbName << " not found"};
    }

    invariant(docsWithOpTime.value.size() == 1);

    auto parseStatus = DatabaseType::fromBSON(docsWithOpTime.value.front());
    if (!parseStatus.isOK()) {
        return parseStatus.getStatus();
    }

    return repl::OpTimeWith<DatabaseType>(parseStatus.getValue(), docsWithOpTime.opTime);
}
CollectionMetadata* CollectionMetadata::clonePlusChunk(const ChunkType& chunk, const ChunkVersion& newShardVersion, string* errMsg) const { // The error message string is optional. string dummy; if (errMsg == NULL) { errMsg = &dummy; } // It is acceptable to move version backwards (e.g., undoing a migration that went bad // during commit) but only cloning away the last chunk may reset the version to 0. if (!newShardVersion.isSet()) { *errMsg = stream() << "cannot add chunk " << rangeToString(chunk.getMin(), chunk.getMax()) << " with zero shard version"; warning() << *errMsg; return NULL; } invariant(chunk.getMin().woCompare(chunk.getMax()) < 0); // Check that there isn't any chunk on the interval to be added. if (rangeMapOverlaps(_chunksMap, chunk.getMin(), chunk.getMax())) { RangeVector overlap; getRangeMapOverlap(_chunksMap, chunk.getMin(), chunk.getMax(), &overlap); *errMsg = stream() << "cannot add chunk " << rangeToString(chunk.getMin(), chunk.getMax()) << " because the chunk overlaps " << overlapToString(overlap); warning() << *errMsg; return NULL; } unique_ptr<CollectionMetadata> metadata(new CollectionMetadata); metadata->_keyPattern = this->_keyPattern; metadata->_keyPattern.getOwned(); metadata->fillKeyPatternFields(); metadata->_pendingMap = this->_pendingMap; metadata->_chunksMap = this->_chunksMap; metadata->_chunksMap.insert(make_pair(chunk.getMin().getOwned(), chunk.getMax().getOwned())); metadata->_shardVersion = newShardVersion; metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion; metadata->fillRanges(); invariant(metadata->isValid()); return metadata.release(); }
Status MigrationManager::_processRemoteCommandResponse( const RemoteCommandResponse& remoteCommandResponse, ScopedMigrationRequest* scopedMigrationRequest) { stdx::lock_guard<stdx::mutex> lock(_mutex); Status commandStatus(ErrorCodes::InternalError, "Uninitialized value."); // Check for local errors sending the remote command caused by stepdown. if (isErrorDueToConfigStepdown(remoteCommandResponse.status, _state != State::kEnabled && _state != State::kRecovering)) { scopedMigrationRequest->keepDocumentOnDestruct(); return {ErrorCodes::BalancerInterrupted, stream() << "Migration interrupted because the balancer is stopping." << " Command status: " << remoteCommandResponse.status.toString()}; } if (!remoteCommandResponse.isOK()) { commandStatus = remoteCommandResponse.status; } else { // TODO: delete in 3.8 commandStatus = extractMigrationStatusFromCommandResponse(remoteCommandResponse.data); } if (!Shard::shouldErrorBePropagated(commandStatus.code())) { commandStatus = {ErrorCodes::OperationFailed, stream() << "moveChunk command failed on source shard." << causedBy(commandStatus)}; } // Any failure to remove the migration document should be because the config server is // stepping/shutting down. In this case we must fail the moveChunk command with a retryable // error so that the caller does not move on to other distlock requiring operations that could // fail when the balancer recovers and takes distlocks for migration recovery. Status status = scopedMigrationRequest->tryToRemoveMigration(); if (!status.isOK()) { commandStatus = { ErrorCodes::BalancerInterrupted, stream() << "Migration interrupted because the balancer is stopping" << " and failed to remove the config.migrations document." << " Command status: " << (commandStatus.isOK() ? status.toString() : commandStatus.toString())}; } return commandStatus; }
CollectionMetadata* CollectionMetadata::clonePlusPending(const ChunkType& pending, string* errMsg) const { // The error message string is optional. string dummy; if (errMsg == NULL) { errMsg = &dummy; } if (rangeMapOverlaps(_chunksMap, pending.getMin(), pending.getMax())) { RangeVector overlap; getRangeMapOverlap(_chunksMap, pending.getMin(), pending.getMax(), &overlap); *errMsg = stream() << "cannot add pending chunk " << rangeToString(pending.getMin(), pending.getMax()) << " because the chunk overlaps " << overlapToString(overlap); warning() << *errMsg; return NULL; } unique_ptr<CollectionMetadata> metadata(new CollectionMetadata); metadata->_keyPattern = this->_keyPattern; metadata->_keyPattern.getOwned(); metadata->fillKeyPatternFields(); metadata->_pendingMap = this->_pendingMap; metadata->_chunksMap = this->_chunksMap; metadata->_rangesMap = this->_rangesMap; metadata->_shardVersion = _shardVersion; metadata->_collVersion = _collVersion; // If there are any pending chunks on the interval to be added this is ok, since pending // chunks aren't officially tracked yet and something may have changed on servers we do not // see yet. // We remove any chunks we overlap, the remote request starting a chunk migration must have // been authoritative. if (rangeMapOverlaps(_pendingMap, pending.getMin(), pending.getMax())) { RangeVector pendingOverlap; getRangeMapOverlap(_pendingMap, pending.getMin(), pending.getMax(), &pendingOverlap); warning() << "new pending chunk " << rangeToString(pending.getMin(), pending.getMax()) << " overlaps existing pending chunks " << overlapToString(pendingOverlap) << ", a migration may not have completed"; for (RangeVector::iterator it = pendingOverlap.begin(); it != pendingOverlap.end(); ++it) { metadata->_pendingMap.erase(it->first); } } metadata->_pendingMap.insert(make_pair(pending.getMin(), pending.getMax())); invariant(metadata->isValid()); return metadata.release(); }
CollectionMetadata* CollectionMetadata::cloneMinusPending(const ChunkType& pending, string* errMsg) const { // The error message string is optional. string dummy; if (errMsg == NULL) { errMsg = &dummy; } // Check that we have the exact chunk that will be subtracted. if (!rangeMapContains(_pendingMap, pending.getMin(), pending.getMax())) { *errMsg = stream() << "cannot remove pending chunk " << rangeToString(pending.getMin(), pending.getMax()) << ", this shard does not contain the chunk"; if (rangeMapOverlaps(_pendingMap, pending.getMin(), pending.getMax())) { RangeVector overlap; getRangeMapOverlap(_pendingMap, pending.getMin(), pending.getMax(), &overlap); *errMsg += stream() << " and it overlaps " << overlapToString(overlap); } warning() << *errMsg; return NULL; } unique_ptr<CollectionMetadata> metadata(new CollectionMetadata); metadata->_keyPattern = this->_keyPattern; metadata->_keyPattern.getOwned(); metadata->fillKeyPatternFields(); metadata->_pendingMap = this->_pendingMap; metadata->_pendingMap.erase(pending.getMin()); metadata->_chunksMap = this->_chunksMap; metadata->_rangesMap = this->_rangesMap; metadata->_shardVersion = _shardVersion; metadata->_collVersion = _collVersion; invariant(metadata->isValid()); return metadata.release(); }
StatusWith<std::vector<ChunkType>> ShardingCatalogClientImpl::getChunks( OperationContext* opCtx, const BSONObj& query, const BSONObj& sort, boost::optional<int> limit, OpTime* opTime, repl::ReadConcernLevel readConcern) { invariant(serverGlobalParams.clusterRole == ClusterRole::ConfigServer || readConcern == repl::ReadConcernLevel::kMajorityReadConcern); // Convert boost::optional<int> to boost::optional<long long>. auto longLimit = limit ? boost::optional<long long>(*limit) : boost::none; auto findStatus = _exhaustiveFindOnConfig( opCtx, kConfigReadSelector, readConcern, ChunkType::ConfigNS, query, sort, longLimit); if (!findStatus.isOK()) { return findStatus.getStatus().withContext("Failed to load chunks"); } const auto& chunkDocsOpTimePair = findStatus.getValue(); std::vector<ChunkType> chunks; for (const BSONObj& obj : chunkDocsOpTimePair.value) { auto chunkRes = ChunkType::fromConfigBSON(obj); if (!chunkRes.isOK()) { return chunkRes.getStatus().withContext(stream() << "Failed to parse chunk with id " << obj[ChunkType::name()]); } chunks.push_back(chunkRes.getValue()); } if (opTime) { *opTime = chunkDocsOpTimePair.opTime; } return chunks; }
StatusWith<DatabaseType> CatalogManagerReplicaSet::getDatabase(const std::string& dbName) {
    invariant(nsIsDbOnly(dbName));

    // The two databases that are hosted on the config server are config and admin
    if (dbName == "config" || dbName == "admin") {
        DatabaseType dbt;
        dbt.setName(dbName);
        dbt.setSharded(false);
        dbt.setPrimary("config");

        return dbt;
    }

    const auto configShard = grid.shardRegistry()->getShard("config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHost.getValue(),
                                                           NamespaceString(DatabaseType::ConfigNS),
                                                           BSON(DatabaseType::name(dbName)),
                                                           1);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docs = findStatus.getValue();
    if (docs.empty()) {
        return {ErrorCodes::NamespaceNotFound, stream() << "database " << dbName << " not found"};
    }

    invariant(docs.size() == 1);

    return DatabaseType::fromBSON(docs.front());
}
shared_ptr<Notification<Status>> MigrationManager::_schedule( OperationContext* txn, const MigrateInfo& migrateInfo, bool shardTakesCollectionDistLock, uint64_t maxChunkSizeBytes, const MigrationSecondaryThrottleOptions& secondaryThrottle, bool waitForDelete) { const NamespaceString nss(migrateInfo.ns); // Sanity checks that the chunk being migrated is actually valid. These will be repeated at the // shard as well, but doing them here saves an extra network call, which might otherwise fail. auto statusWithScopedChunkManager = ScopedChunkManager::getExisting(txn, nss); if (!statusWithScopedChunkManager.isOK()) { return std::make_shared<Notification<Status>>( std::move(statusWithScopedChunkManager.getStatus())); } ChunkManager* const chunkManager = statusWithScopedChunkManager.getValue().cm(); auto chunk = chunkManager->findIntersectingChunkWithSimpleCollation(txn, migrateInfo.minKey); invariant(chunk); // If the chunk is not found exactly as requested, the caller must have stale data if (chunk->getMin() != migrateInfo.minKey || chunk->getMax() != migrateInfo.maxKey) { return std::make_shared<Notification<Status>>(Status( ErrorCodes::IncompatibleShardingMetadata, stream() << "Chunk " << ChunkRange(migrateInfo.minKey, migrateInfo.maxKey).toString() << " does not exist.")); } // If chunk is already on the correct shard, just treat the operation as success if (chunk->getShardId() == migrateInfo.to) { return std::make_shared<Notification<Status>>(Status::OK()); } const auto recipientShard = Grid::get(txn)->shardRegistry()->getShard(txn, migrateInfo.from); auto hostStatus = recipientShard->getTargeter()->findHost( ReadPreferenceSetting{ReadPreference::PrimaryOnly}, RemoteCommandTargeter::selectFindHostMaxWaitTime(txn)); if (!hostStatus.isOK()) { return std::make_shared<Notification<Status>>(std::move(hostStatus.getStatus())); } BSONObjBuilder builder; MoveChunkRequest::appendAsCommand( &builder, nss, chunkManager->getVersion(), Grid::get(txn)->shardRegistry()->getConfigServerConnectionString(), migrateInfo.from, migrateInfo.to, ChunkRange(migrateInfo.minKey, migrateInfo.maxKey), maxChunkSizeBytes, secondaryThrottle, waitForDelete, shardTakesCollectionDistLock); Migration migration(nss, builder.obj()); auto retVal = migration.completionNotification; if (shardTakesCollectionDistLock) { _scheduleWithoutDistLock(txn, hostStatus.getValue(), std::move(migration)); } else { _scheduleWithDistLock(txn, hostStatus.getValue(), std::move(migration)); } return retVal; }
StatusWith<std::unique_ptr<CollectionMetadata>> CollectionMetadata::cloneMerge( const BSONObj& minKey, const BSONObj& maxKey, const ChunkVersion& newShardVersion) const { invariant(newShardVersion.epoch() == _shardVersion.epoch()); invariant(newShardVersion > _shardVersion); RangeVector overlap; getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap); if (overlap.empty() || overlap.size() == 1) { return {ErrorCodes::IllegalOperation, stream() << "cannot merge range " << rangeToString(minKey, maxKey) << (overlap.empty() ? ", no chunks found in this range" : ", only one chunk found in this range")}; } bool validStartEnd = true; bool validNoHoles = true; if (overlap.begin()->first.woCompare(minKey) != 0) { // First chunk doesn't start with minKey validStartEnd = false; } else if (overlap.rbegin()->second.woCompare(maxKey) != 0) { // Last chunk doesn't end with maxKey validStartEnd = false; } else { // Check that there are no holes BSONObj prevMaxKey = minKey; for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) { if (it->first.woCompare(prevMaxKey) != 0) { validNoHoles = false; break; } prevMaxKey = it->second; } } if (!validStartEnd || !validNoHoles) { return {ErrorCodes::IllegalOperation, stream() << "cannot merge range " << rangeToString(minKey, maxKey) << ", overlapping chunks " << overlapToString(overlap) << (!validStartEnd ? " do not have the same min and max key" : " are not all adjacent")}; } unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>()); metadata->_keyPattern = _keyPattern.getOwned(); metadata->fillKeyPatternFields(); metadata->_pendingMap = _pendingMap; metadata->_chunksMap = _chunksMap; metadata->_rangesMap = _rangesMap; metadata->_shardVersion = newShardVersion; metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion; for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) { metadata->_chunksMap.erase(it->first); } metadata->_chunksMap.insert(make_pair(minKey, maxKey)); invariant(metadata->isValid()); return std::move(metadata); }
StatusWith<std::unique_ptr<CollectionMetadata>> CollectionMetadata::cloneSplit( const BSONObj& minKey, const BSONObj& maxKey, const std::vector<BSONObj>& splitKeys, const ChunkVersion& newShardVersion) const { invariant(newShardVersion.epoch() == _shardVersion.epoch()); invariant(newShardVersion > _shardVersion); // The version required in both resulting chunks could be simply an increment in the // minor portion of the current version. However, we are enforcing uniqueness over the // attributes <ns, version> of the configdb collection 'chunks'. So in practice, a // migrate somewhere may force this split to pick up a version that has the major // portion higher than the one that this shard has been using. // // TODO drop the uniqueness constraint and tighten the check below so that only the // minor portion of version changes // Check that we have the exact chunk that will be subtracted. if (!rangeMapContains(_chunksMap, minKey, maxKey)) { stream errMsg; errMsg << "cannot split chunk " << rangeToString(minKey, maxKey) << ", this shard does not contain the chunk"; if (rangeMapOverlaps(_chunksMap, minKey, maxKey)) { RangeVector overlap; getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap); errMsg << " and it overlaps " << overlapToString(overlap); } return {ErrorCodes::IllegalOperation, errMsg}; } unique_ptr<CollectionMetadata> metadata(stdx::make_unique<CollectionMetadata>()); metadata->_keyPattern = _keyPattern.getOwned(); metadata->fillKeyPatternFields(); metadata->_pendingMap = _pendingMap; metadata->_chunksMap = _chunksMap; metadata->_shardVersion = newShardVersion; // will increment 2nd, 3rd,... chunks below BSONObj startKey = minKey; for (const auto& split : splitKeys) { // Check that the split key is valid if (!rangeContains(minKey, maxKey, split)) { return {ErrorCodes::IllegalOperation, stream() << "cannot split chunk " << rangeToString(minKey, maxKey) << " at key " << split}; } // Check that the split keys are in order if (split.woCompare(startKey) <= 0) { // The split keys came in out of order, this probably indicates a bug, so fail the // operation. Re-iterate splitKeys to build a useful error message including the array // of splitKeys in the order received. str::stream errMsg; errMsg << "Invalid input to splitChunk, split keys must be in order, got: ["; for (auto it2 = splitKeys.cbegin(); it2 != splitKeys.cend(); ++it2) { if (it2 != splitKeys.begin()) { errMsg << ", "; } errMsg << it2->toString(); } errMsg << "]"; return {ErrorCodes::IllegalOperation, errMsg}; } metadata->_chunksMap[startKey] = split.getOwned(); metadata->_chunksMap.insert(make_pair(split.getOwned(), maxKey.getOwned())); metadata->_shardVersion.incMinor(); startKey = split; } metadata->_collVersion = metadata->_shardVersion > _collVersion ? metadata->_shardVersion : _collVersion; metadata->fillRanges(); invariant(metadata->isValid()); return std::move(metadata); }
string CollectionMetadata::toStringBasic() const {
    return stream() << "collection version: " << _collVersion.toString()
                    << ", shard version: " << _shardVersion.toString();
}
StatusWith<ForwardingCatalogManager::ScopedDistLock*> ChunkMoveOperationState::acquireMoveMetadata( OperationContext* txn) { // Get the distributed lock const string whyMessage(stream() << "migrating chunk [" << minKey << ", " << maxKey << ") in " << _nss.ns()); _distLockStatus = grid.forwardingCatalogManager()->distLock(txn, _nss.ns(), whyMessage); if (!_distLockStatus->isOK()) { const string msg = stream() << "could not acquire collection lock for " << _nss.ns() << " to migrate chunk [" << minKey << "," << maxKey << ")" << causedBy(_distLockStatus->getStatus()); warning() << msg; return Status(_distLockStatus->getStatus().code(), msg); } ShardingState* const shardingState = ShardingState::get(txn); // Snapshot the metadata Status refreshStatus = shardingState->refreshMetadataNow(txn, _nss.ns(), &_shardVersion); if (!refreshStatus.isOK()) { const string msg = stream() << "moveChunk cannot start migrate of chunk " << "[" << minKey << "," << maxKey << ")" << causedBy(refreshStatus.reason()); warning() << msg; return Status(refreshStatus.code(), msg); } if (_shardVersion.majorVersion() == 0) { // It makes no sense to migrate if our version is zero and we have no chunks const string msg = stream() << "moveChunk cannot start migrate of chunk " << "[" << minKey << "," << maxKey << ")" << " with zero shard version"; warning() << msg; return Status(ErrorCodes::IncompatibleShardingMetadata, msg); } if (_collectionEpoch != _shardVersion.epoch()) { const string msg = stream() << "moveChunk cannot move chunk " << "[" << minKey << "," << maxKey << "), " << "collection may have been dropped. " << "current epoch: " << _shardVersion.epoch() << ", cmd epoch: " << _collectionEpoch; warning() << msg; return Status(ErrorCodes::IncompatibleShardingMetadata, msg); } _collMetadata = shardingState->getCollectionMetadata(_nss.ns()); // With nonzero shard version, we must have a coll version >= our shard version invariant(_collMetadata->getCollVersion() >= _shardVersion); // With nonzero shard version, we must have a shard key invariant(!_collMetadata->getKeyPattern().isEmpty()); ChunkType origChunk; if (!_collMetadata->getNextChunk(getMinKey(), &origChunk) || origChunk.getMin().woCompare(getMinKey()) || origChunk.getMax().woCompare(getMaxKey())) { // Our boundaries are different from those passed in const string msg = stream() << "moveChunk cannot find chunk " << "[" << minKey << "," << maxKey << ")" << " to migrate, the chunk boundaries may be stale"; warning() << msg; return Status(ErrorCodes::IncompatibleShardingMetadata, msg); } return &_distLockStatus->getValue(); }
Status ShardingCatalogClientImpl::insertConfigDocument(OperationContext* opCtx, const NamespaceString& nss, const BSONObj& doc, const WriteConcernOptions& writeConcern) { invariant(nss.db() == NamespaceString::kAdminDb || nss.db() == NamespaceString::kConfigDb); const BSONElement idField = doc.getField("_id"); invariant(!idField.eoo()); BatchedCommandRequest request([&] { write_ops::Insert insertOp(nss); insertOp.setDocuments({doc}); return insertOp; }()); request.setWriteConcern(writeConcern.toBSON()); auto configShard = Grid::get(opCtx)->shardRegistry()->getConfigShard(); for (int retry = 1; retry <= kMaxWriteRetry; retry++) { auto response = configShard->runBatchWriteCommand( opCtx, Shard::kDefaultConfigCommandTimeout, request, Shard::RetryPolicy::kNoRetry); Status status = response.toStatus(); if (retry < kMaxWriteRetry && configShard->isRetriableError(status.code(), Shard::RetryPolicy::kIdempotent)) { // Pretend like the operation is idempotent because we're handling DuplicateKey errors // specially continue; } // If we get DuplicateKey error on the first attempt to insert, this definitively means that // we are trying to insert the same entry a second time, so error out. If it happens on a // retry attempt though, it is not clear whether we are actually inserting a duplicate key // or it is because we failed to wait for write concern on the first attempt. In order to // differentiate, fetch the entry and check. if (retry > 1 && status == ErrorCodes::DuplicateKey) { LOG(1) << "Insert retry failed because of duplicate key error, rechecking."; auto fetchDuplicate = _exhaustiveFindOnConfig(opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly}, repl::ReadConcernLevel::kMajorityReadConcern, nss, idField.wrap(), BSONObj(), boost::none); if (!fetchDuplicate.isOK()) { return fetchDuplicate.getStatus(); } auto existingDocs = fetchDuplicate.getValue().value; if (existingDocs.empty()) { return {status.withContext( stream() << "DuplicateKey error was returned after a retry attempt, but no " "documents were found. This means a concurrent change occurred " "together with the retries.")}; } invariant(existingDocs.size() == 1); BSONObj existing = std::move(existingDocs.front()); if (existing.woCompare(doc) == 0) { // Documents match, so treat the operation as success return Status::OK(); } } return status; } MONGO_UNREACHABLE; }
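// --- Illustrative sketch (not part of the original source): how a caller might use
// insertConfigDocument() above. The namespace and document below are made-up placeholders,
// and kMajorityWriteConcern is assumed to be the usual majority write concern constant on
// ShardingCatalogClient.
Status recordExampleEntry(ShardingCatalogClientImpl* catalogClient, OperationContext* opCtx) {
    const NamespaceString nss("config.exampleEntries");  // hypothetical config collection
    const BSONObj doc = BSON("_id"
                             << "example"
                             << "createdBy"
                             << "illustration");

    // A DuplicateKey on a retry is re-checked against the stored document above, so calling
    // this again with the same _id and identical contents still returns OK.
    return catalogClient->insertConfigDocument(
        opCtx, nss, doc, ShardingCatalogClient::kMajorityWriteConcern);
}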
CollectionMetadata* CollectionMetadata::cloneMigrate(const ChunkType& chunk, const ChunkVersion& newShardVersion, string* errMsg) const { // The error message string is optional. string dummy; if (errMsg == NULL) { errMsg = &dummy; } // Check that we have the exact chunk that will be subtracted. if (!rangeMapContains(_chunksMap, chunk.getMin(), chunk.getMax())) { *errMsg = stream() << "cannot remove chunk " << rangeToString(chunk.getMin(), chunk.getMax()) << ", this shard does not contain the chunk"; if (rangeMapOverlaps(_chunksMap, chunk.getMin(), chunk.getMax())) { RangeVector overlap; getRangeMapOverlap(_chunksMap, chunk.getMin(), chunk.getMax(), &overlap); *errMsg += stream() << " and it overlaps " << overlapToString(overlap); } warning() << *errMsg; return NULL; } // If left with no chunks, check that the version is zero. if (_chunksMap.size() == 1) { if (newShardVersion.isSet()) { *errMsg = stream() << "cannot set shard version to non-zero value " << newShardVersion.toString() << " when removing last chunk " << rangeToString(chunk.getMin(), chunk.getMax()); warning() << *errMsg; return NULL; } } // Can't move version backwards when subtracting chunks. This is what guarantees that // no read or write would be taken once we subtract data from the current shard. else if (newShardVersion <= _shardVersion) { *errMsg = stream() << "cannot remove chunk " << rangeToString(chunk.getMin(), chunk.getMax()) << " because the new shard version " << newShardVersion.toString() << " is not greater than the current shard version " << _shardVersion.toString(); warning() << *errMsg; return NULL; } unique_ptr<CollectionMetadata> metadata(new CollectionMetadata); metadata->_keyPattern = this->_keyPattern; metadata->_keyPattern.getOwned(); metadata->fillKeyPatternFields(); metadata->_pendingMap = this->_pendingMap; metadata->_chunksMap = this->_chunksMap; metadata->_chunksMap.erase(chunk.getMin()); metadata->_shardVersion = newShardVersion; metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion; metadata->fillRanges(); invariant(metadata->isValid()); return metadata.release(); }
bool checkAndUpgradeConfigVersion(CatalogManager* catalogManager,
                                  bool upgrade,
                                  VersionType* initialVersionInfo,
                                  VersionType* versionInfo,
                                  string* errMsg) {
    string dummy;
    if (!errMsg) {
        errMsg = &dummy;
    }

    Status getConfigStatus = getConfigVersion(catalogManager, versionInfo);
    if (!getConfigStatus.isOK()) {
        *errMsg = stream() << "could not load config version for upgrade"
                           << causedBy(getConfigStatus);
        return false;
    }

    versionInfo->cloneTo(initialVersionInfo);

    VersionStatus comp = isConfigVersionCompatible(*versionInfo, errMsg);

    if (comp == VersionStatus_Incompatible)
        return false;
    if (comp == VersionStatus_Compatible)
        return true;

    invariant(comp == VersionStatus_NeedUpgrade);

    //
    // Our current config version is now greater than the current version, so we should upgrade
    // if possible.
    //

    // The first empty version is technically an upgrade, but has special semantics
    bool isEmptyVersion = versionInfo->getCurrentVersion() == UpgradeHistory_EmptyVersion;

    // First check for the upgrade flag (but no flag is needed if we're upgrading from empty)
    if (!isEmptyVersion && !upgrade) {
        *errMsg = stream() << "newer version " << CURRENT_CONFIG_VERSION
                           << " of mongo config metadata is required, "
                           << "current version is " << versionInfo->getCurrentVersion() << ", "
                           << "need to run mongos with --upgrade";
        return false;
    }

    // Contact the config servers to make sure all are online - otherwise we wait a long time
    // for locks.
    if (!_checkConfigServersAlive(catalogManager->connectionString(), errMsg)) {
        if (isEmptyVersion) {
            *errMsg = stream() << "all config servers must be reachable for initial"
                               << " config database creation" << causedBy(errMsg);
        } else {
            *errMsg = stream() << "all config servers must be reachable for config upgrade"
                               << causedBy(errMsg);
        }

        return false;
    }

    // Check whether or not the balancer is online, if it is online we will not upgrade
    // (but we will initialize the config server)
    if (!isEmptyVersion) {
        auto balSettingsResult = catalogManager->getGlobalSettings(SettingsType::BalancerDocKey);
        if (balSettingsResult.isOK()) {
            SettingsType balSettings = balSettingsResult.getValue();

            if (!balSettings.getBalancerStopped()) {
                *errMsg = stream() << "balancer must be stopped for config upgrade"
                                   << causedBy(errMsg);
            }
        }
    }

    //
    // Acquire a lock for the upgrade process.
    //
    // We want to ensure that only a single mongo process is upgrading the config server at a
    // time.
    //

    string whyMessage(stream() << "upgrading config database to new format v"
                               << CURRENT_CONFIG_VERSION);
    auto lockTimeout = stdx::chrono::milliseconds(20 * 60 * 1000);
    auto scopedDistLock =
        catalogManager->getDistLockManager()->lock("configUpgrade", whyMessage, lockTimeout);
    if (!scopedDistLock.isOK()) {
        *errMsg = scopedDistLock.getStatus().toString();
        return false;
    }

    //
    // Double-check compatibility inside the upgrade lock
    // Another process may have won the lock earlier and done the upgrade for us, check
    // if this is the case.
    //

    getConfigStatus = getConfigVersion(catalogManager, versionInfo);
    if (!getConfigStatus.isOK()) {
        *errMsg = stream() << "could not reload config version for upgrade"
                           << causedBy(getConfigStatus);
        return false;
    }

    versionInfo->cloneTo(initialVersionInfo);

    comp = isConfigVersionCompatible(*versionInfo, errMsg);

    if (comp == VersionStatus_Incompatible)
        return false;
    if (comp == VersionStatus_Compatible)
        return true;

    invariant(comp == VersionStatus_NeedUpgrade);

    //
    // Run through the upgrade steps necessary to bring our config version to the current
    // version
    //

    log() << "starting upgrade of config server from v" << versionInfo->getCurrentVersion()
          << " to v" << CURRENT_CONFIG_VERSION;

    ConfigUpgradeRegistry registry(createRegistry());

    while (versionInfo->getCurrentVersion() < CURRENT_CONFIG_VERSION) {
        int fromVersion = versionInfo->getCurrentVersion();

        //
        // Run the next upgrade process and replace versionInfo with the result of the
        // upgrade.
        //
        if (!_nextUpgrade(catalogManager, registry, *versionInfo, versionInfo, errMsg)) {
            return false;
        }

        // Ensure we're making progress here
        if (versionInfo->getCurrentVersion() <= fromVersion) {
            *errMsg = stream() << "bad v" << fromVersion << " config version upgrade, "
                               << "version did not increment and is now "
                               << versionInfo->getCurrentVersion();
            return false;
        }
    }

    invariant(versionInfo->getCurrentVersion() == CURRENT_CONFIG_VERSION);

    log() << "upgrade of config server to v" << versionInfo->getCurrentVersion() << " successful";

    return true;
}
StatusWith<ForwardingCatalogManager::ScopedDistLock*> ChunkMoveOperationState::acquireMoveMetadata() { // Get the distributed lock const string whyMessage(stream() << "migrating chunk [" << _minKey << ", " << _maxKey << ") in " << _nss.ns()); _distLockStatus = grid.forwardingCatalogManager()->distLock(_txn, _nss.ns(), whyMessage); if (!_distLockStatus->isOK()) { const string msg = stream() << "could not acquire collection lock for " << _nss.ns() << " to migrate chunk [" << _minKey << "," << _maxKey << ")" << causedBy(_distLockStatus->getStatus()); warning() << msg; return Status(_distLockStatus->getStatus().code(), msg); } ShardingState* const shardingState = ShardingState::get(_txn); // Snapshot the metadata Status refreshStatus = shardingState->refreshMetadataNow(_txn, _nss.ns(), &_shardVersion); if (!refreshStatus.isOK()) { const string msg = stream() << "moveChunk cannot start migrate of chunk " << "[" << _minKey << "," << _maxKey << ")" << causedBy(refreshStatus.reason()); warning() << msg; return Status(refreshStatus.code(), msg); } if (_shardVersion.majorVersion() == 0) { // It makes no sense to migrate if our version is zero and we have no chunks const string msg = stream() << "moveChunk cannot start migrate of chunk " << "[" << _minKey << "," << _maxKey << ")" << " with zero shard version"; warning() << msg; return Status(ErrorCodes::IncompatibleShardingMetadata, msg); } { // Mongos >= v3.2 sends the full version, v3.0 only sends the epoch. // TODO(SERVER-20742): Stop parsing epoch separately after 3.2. auto& operationVersion = OperationShardVersion::get(_txn); if (operationVersion.hasShardVersion()) { _collectionVersion = operationVersion.getShardVersion(_nss); _collectionEpoch = _collectionVersion.epoch(); } // else the epoch will already be set from the parsing of the ChunkMoveOperationState if (_collectionEpoch != _shardVersion.epoch()) { const string msg = stream() << "moveChunk cannot move chunk " << "[" << _minKey << "," << _maxKey << "), " << "collection may have been dropped. " << "current epoch: " << _shardVersion.epoch() << ", cmd epoch: " << _collectionEpoch; warning() << msg; throw SendStaleConfigException(_nss.toString(), msg, _collectionVersion, _shardVersion); } } _collMetadata = shardingState->getCollectionMetadata(_nss.ns()); // With nonzero shard version, we must have a coll version >= our shard version invariant(_collMetadata->getCollVersion() >= _shardVersion); // With nonzero shard version, we must have a shard key invariant(!_collMetadata->getKeyPattern().isEmpty()); ChunkType origChunk; if (!_collMetadata->getNextChunk(_minKey, &origChunk) || origChunk.getMin().woCompare(_minKey) || origChunk.getMax().woCompare(_maxKey)) { // Our boundaries are different from those passed in const string msg = stream() << "moveChunk cannot find chunk " << "[" << _minKey << "," << _maxKey << ")" << " to migrate, the chunk boundaries may be stale"; warning() << msg; throw SendStaleConfigException(_nss.toString(), msg, _collectionVersion, _shardVersion); } return &_distLockStatus->getValue(); }
Status ChunkMoveOperationState::commitMigration() {
    invariant(_distLockStatus.is_initialized());
    invariant(_distLockStatus->isOK());

    log() << "About to enter migrate critical section";

    // We're under the collection distributed lock here, so no other migrate can change maxVersion
    // or CollectionMetadata state.
    ShardingState* const shardingState = ShardingState::get(_txn);

    Status startStatus = ShardingStateRecovery::startMetadataOp(_txn);
    if (!startStatus.isOK())
        return startStatus;

    shardingState->migrationSourceManager()->setInCriticalSection(true);

    const ChunkVersion originalCollVersion = getCollMetadata()->getCollVersion();

    ChunkVersion myVersion = originalCollVersion;
    myVersion.incMajor();

    {
        ScopedTransaction transaction(_txn, MODE_IX);
        Lock::DBLock lk(_txn->lockState(), _nss.db(), MODE_IX);
        Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

        invariant(myVersion > shardingState->getVersion(_nss.ns()));

        // Bump the metadata's version up and "forget" about the chunk being moved. This is
        // not the commit point, but in practice the state in this shard won't change until
        // the commit is done.
        shardingState->donateChunk(_txn, _nss.ns(), _minKey, _maxKey, myVersion);
    }

    log() << "moveChunk setting version to: " << myVersion << migrateLog;

    // We're under the collection lock here, too, so we can undo the chunk donation because
    // no other state change could be ongoing
    BSONObj res;
    Status recvChunkCommitStatus{ErrorCodes::InternalError, "status not set"};

    try {
        ScopedDbConnection connTo(_toShardCS, 35.0);
        connTo->runCommand("admin", BSON("_recvChunkCommit" << 1), res);
        connTo.done();
        recvChunkCommitStatus = getStatusFromCommandResult(res);
    } catch (const DBException& e) {
        const string msg = stream() << "moveChunk could not contact to shard " << _toShard
                                    << " to commit transfer" << causedBy(e);
        warning() << msg;
        recvChunkCommitStatus = Status(e.toStatus().code(), msg);
    }

    if (MONGO_FAIL_POINT(failMigrationCommit) && recvChunkCommitStatus.isOK()) {
        recvChunkCommitStatus =
            Status(ErrorCodes::InternalError, "Failing _recvChunkCommit due to failpoint.");
    }

    if (!recvChunkCommitStatus.isOK()) {
        log() << "moveChunk migrate commit not accepted by TO-shard: " << res
              << " resetting shard version to: " << getShardVersion() << migrateLog;

        {
            ScopedTransaction transaction(_txn, MODE_IX);
            Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX);
            Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

            log() << "moveChunk collection lock acquired to reset shard version from "
                     "failed migration";

            // Revert the chunk manager back to the state before "forgetting" about the chunk
            shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata());
        }

        log() << "Shard version successfully reset to clean up failed migration";

        return Status(recvChunkCommitStatus.code(),
                      stream() << "_recvChunkCommit failed: " << causedBy(recvChunkCommitStatus));
    }

    log() << "moveChunk migrate commit accepted by TO-shard: " << res << migrateLog;

    BSONArrayBuilder updates;

    {
        // Update for the chunk being moved
        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);  // No upserting
        op.append("ns", ChunkType::ConfigNS);

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey));
        myVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _nss.ns());
        n.append(ChunkType::min(), _minKey);
        n.append(ChunkType::max(), _maxKey);
        n.append(ChunkType::shard(), _toShard);
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), Chunk::genID(_nss.ns(), _minKey));
        q.done();

        updates.append(op.obj());
    }

    // Version at which the next highest lastmod will be set. If the chunk being moved is the last
    // in the shard, nextVersion is that chunk's lastmod otherwise the highest version is from the
    // chunk being bumped on the FROM-shard.
    ChunkVersion nextVersion = myVersion;

    // If we have chunks left on the FROM shard, update the version of one of them as well. We can
    // figure that out by grabbing the metadata as it has been changed.
    const std::shared_ptr<CollectionMetadata> bumpedCollMetadata(
        shardingState->getCollectionMetadata(_nss.ns()));
    if (bumpedCollMetadata->getNumChunks() > 0) {
        // get another chunk on that shard
        ChunkType bumpChunk;
        invariant(bumpedCollMetadata->getNextChunk(bumpedCollMetadata->getMinKey(), &bumpChunk));

        BSONObj bumpMin = bumpChunk.getMin();
        BSONObj bumpMax = bumpChunk.getMax();

        dassert(bumpMin.woCompare(_minKey) != 0);

        BSONObjBuilder op;
        op.append("op", "u");
        op.appendBool("b", false);
        op.append("ns", ChunkType::ConfigNS);

        nextVersion.incMinor();  // same as used on donateChunk

        BSONObjBuilder n(op.subobjStart("o"));
        n.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin));
        nextVersion.addToBSON(n, ChunkType::DEPRECATED_lastmod());
        n.append(ChunkType::ns(), _nss.ns());
        n.append(ChunkType::min(), bumpMin);
        n.append(ChunkType::max(), bumpMax);
        n.append(ChunkType::shard(), _fromShard);
        n.done();

        BSONObjBuilder q(op.subobjStart("o2"));
        q.append(ChunkType::name(), Chunk::genID(_nss.ns(), bumpMin));
        q.done();

        updates.append(op.obj());

        log() << "moveChunk updating self version to: " << nextVersion << " through " << bumpMin
              << " -> " << bumpMax << " for collection '" << _nss.ns() << "'" << migrateLog;
    } else {
        log() << "moveChunk moved last chunk out for collection '" << _nss.ns() << "'"
              << migrateLog;
    }

    BSONArrayBuilder preCond;

    {
        BSONObjBuilder b;
        b.append("ns", ChunkType::ConfigNS);
        b.append("q",
                 BSON("query" << BSON(ChunkType::ns(_nss.ns())) << "orderby"
                              << BSON(ChunkType::DEPRECATED_lastmod() << -1)));
        {
            BSONObjBuilder bb(b.subobjStart("res"));

            // TODO: For backwards compatibility, we can't yet require an epoch here
            bb.appendTimestamp(ChunkType::DEPRECATED_lastmod(), originalCollVersion.toLong());
            bb.done();
        }

        preCond.append(b.obj());
    }

    Status applyOpsStatus{Status::OK()};

    try {
        // For testing migration failures
        if (MONGO_FAIL_POINT(failMigrationConfigWritePrepare)) {
            throw DBException("mock migration failure before config write",
                              ErrorCodes::PrepareConfigsFailed);
        }

        applyOpsStatus =
            grid.catalogManager(_txn)->applyChunkOpsDeprecated(_txn, updates.arr(), preCond.arr());

        if (MONGO_FAIL_POINT(failMigrationApplyOps)) {
            throw SocketException(SocketException::RECV_ERROR,
                                  shardingState->getConfigServer(_txn).toString());
        }
    } catch (const DBException& ex) {
        warning() << ex << migrateLog;
        applyOpsStatus = ex.toStatus();
    }

    if (applyOpsStatus == ErrorCodes::PrepareConfigsFailed) {
        // In the process of issuing the migrate commit, the SyncClusterConnection checks that
        // the config servers are reachable. If they are not, we are sure that the applyOps
        // command was not sent to any of the configs, so we can safely back out of the
        // migration here, by resetting the shard version that we bumped up to in the
        // donateChunk() call above.
        log() << "About to acquire moveChunk coll lock to reset shard version from "
              << "failed migration";

        {
            ScopedTransaction transaction(_txn, MODE_IX);
            Lock::DBLock dbLock(_txn->lockState(), _nss.db(), MODE_IX);
            Lock::CollectionLock collLock(_txn->lockState(), _nss.ns(), MODE_X);

            // Revert the metadata back to the state before "forgetting" about the chunk
            shardingState->undoDonateChunk(_txn, _nss.ns(), getCollMetadata());
        }

        log() << "Shard version successfully reset to clean up failed migration";

        const string msg = stream() << "Failed to send migrate commit to configs "
                                    << causedBy(applyOpsStatus);
        return Status(applyOpsStatus.code(), msg);
    } else if (!applyOpsStatus.isOK()) {
        // This could be a blip in the connectivity. Wait out a few seconds and check if the
        // commit request made it.
        //
        // If the commit made it to the config, we'll see the chunk in the new shard and
        // there's no further action to be done.
        //
        // If the commit did not make it, currently the only way to fix this state is to
        // bounce the mongod so that the old state (before migrating) is brought in.
        warning() << "moveChunk commit outcome ongoing" << migrateLog;
        sleepsecs(10);

        // Look for the chunk in this shard whose version got bumped. We assume that if that
        // mod made it to the config server, then applyOps was successful.
        try {
            std::vector<ChunkType> newestChunk;
            Status status =
                grid.catalogManager(_txn)->getChunks(_txn,
                                                     BSON(ChunkType::ns(_nss.ns())),
                                                     BSON(ChunkType::DEPRECATED_lastmod() << -1),
                                                     1,
                                                     &newestChunk,
                                                     nullptr);
            uassertStatusOK(status);

            ChunkVersion checkVersion;
            if (!newestChunk.empty()) {
                invariant(newestChunk.size() == 1);
                checkVersion = newestChunk[0].getVersion();
            }

            if (checkVersion.equals(nextVersion)) {
                log() << "moveChunk commit confirmed" << migrateLog;
            } else {
                error() << "moveChunk commit failed: version is at " << checkVersion
                        << " instead of " << nextVersion << migrateLog;
                error() << "TERMINATING" << migrateLog;
                dbexit(EXIT_SHARDING_ERROR);
            }
        } catch (...) {
            error() << "moveChunk failed to get confirmation of commit" << migrateLog;
            error() << "TERMINATING" << migrateLog;
            dbexit(EXIT_SHARDING_ERROR);
        }
    }

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeLeavingCriticalSection);

    shardingState->migrationSourceManager()->setInCriticalSection(false);
    ShardingStateRecovery::endMetadataOp(_txn);

    // Migration is done, just log some diagnostics information
    BSONObj chunkInfo =
        BSON("min" << _minKey << "max" << _maxKey << "from" << _fromShard << "to" << _toShard);

    BSONObjBuilder commitInfo;
    commitInfo.appendElements(chunkInfo);
    if (res["counts"].type() == Object) {
        commitInfo.appendElements(res["counts"].Obj());
    }

    grid.catalogManager(_txn)->logChange(_txn, "moveChunk.commit", _nss.ns(), commitInfo.obj());

    shardingState->migrationSourceManager()->done(_txn);
    _isRunning = false;

    return Status::OK();
}
CollectionMetadata* CollectionMetadata::cloneSplit(const ChunkType& chunk, const vector<BSONObj>& splitKeys, const ChunkVersion& newShardVersion, string* errMsg) const { // The error message string is optional. string dummy; if (errMsg == NULL) { errMsg = &dummy; } // The version required in both resulting chunks could be simply an increment in the // minor portion of the current version. However, we are enforcing uniqueness over the // attributes <ns, version> of the configdb collection 'chunks'. So in practice, a // migrate somewhere may force this split to pick up a version that has the major // portion higher than the one that this shard has been using. // // TODO drop the uniqueness constraint and tighten the check below so that only the // minor portion of version changes if (newShardVersion <= _shardVersion) { *errMsg = stream() << "cannot split chunk " << rangeToString(chunk.getMin(), chunk.getMax()) << ", new shard version " << newShardVersion.toString() << " is not greater than current version " << _shardVersion.toString(); warning() << *errMsg; return NULL; } // Check that we have the exact chunk that will be subtracted. if (!rangeMapContains(_chunksMap, chunk.getMin(), chunk.getMax())) { *errMsg = stream() << "cannot split chunk " << rangeToString(chunk.getMin(), chunk.getMax()) << ", this shard does not contain the chunk"; if (rangeMapOverlaps(_chunksMap, chunk.getMin(), chunk.getMax())) { RangeVector overlap; getRangeMapOverlap(_chunksMap, chunk.getMin(), chunk.getMax(), &overlap); *errMsg += stream() << " and it overlaps " << overlapToString(overlap); } warning() << *errMsg; return NULL; } // Check that the split key is valid for (vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it) { if (!rangeContains(chunk.getMin(), chunk.getMax(), *it)) { *errMsg = stream() << "cannot split chunk " << rangeToString(chunk.getMin(), chunk.getMax()) << " at key " << *it; warning() << *errMsg; return NULL; } } unique_ptr<CollectionMetadata> metadata(new CollectionMetadata); metadata->_keyPattern = this->_keyPattern; metadata->_keyPattern.getOwned(); metadata->fillKeyPatternFields(); metadata->_pendingMap = this->_pendingMap; metadata->_chunksMap = this->_chunksMap; metadata->_shardVersion = newShardVersion; // will increment 2nd, 3rd,... chunks below BSONObj startKey = chunk.getMin(); for (vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it) { BSONObj split = *it; invariant(split.woCompare(startKey) > 0); metadata->_chunksMap[startKey] = split.getOwned(); metadata->_chunksMap.insert(make_pair(split.getOwned(), chunk.getMax().getOwned())); metadata->_shardVersion.incMinor(); startKey = split; } metadata->_collVersion = metadata->_shardVersion > _collVersion ? metadata->_shardVersion : _collVersion; metadata->fillRanges(); invariant(metadata->isValid()); return metadata.release(); }
Status checkAndInitConfigVersion(OperationContext* txn, CatalogManager* catalogManager, DistLockManager* distLockManager) { VersionType versionInfo; Status status = getConfigVersion(catalogManager, &versionInfo); if (!status.isOK()) { return status; } string errMsg; VersionStatus comp = isConfigVersionCompatible(versionInfo, &errMsg); if (comp == VersionStatus_Incompatible) return {ErrorCodes::IncompatibleShardingMetadata, errMsg}; if (comp == VersionStatus_Compatible) return Status::OK(); invariant(comp == VersionStatus_NeedUpgrade); if (versionInfo.getCurrentVersion() != UpgradeHistory_EmptyVersion) { return {ErrorCodes::IncompatibleShardingMetadata, stream() << "newer version " << CURRENT_CONFIG_VERSION << " of mongo config metadata is required, " << "current version is " << versionInfo.getCurrentVersion()}; } // Contact the config servers to make sure all are online - otherwise we wait a long time // for locks. status = _checkConfigServersAlive(grid.shardRegistry()->getConfigServerConnectionString()); if (!status.isOK()) { return status; } // // Acquire a lock for the upgrade process. // // We want to ensure that only a single mongo process is upgrading the config server at a // time. // string whyMessage(stream() << "initializing config database to new format v" << CURRENT_CONFIG_VERSION); auto lockTimeout = stdx::chrono::minutes(20); auto scopedDistLock = distLockManager->lock(txn, "configUpgrade", whyMessage, lockTimeout); if (!scopedDistLock.isOK()) { return scopedDistLock.getStatus(); } // // Double-check compatibility inside the upgrade lock // Another process may have won the lock earlier and done the upgrade for us, check // if this is the case. // status = getConfigVersion(catalogManager, &versionInfo); if (!status.isOK()) { return status; } comp = isConfigVersionCompatible(versionInfo, &errMsg); if (comp == VersionStatus_Incompatible) { return {ErrorCodes::IncompatibleShardingMetadata, errMsg}; } if (comp == VersionStatus_Compatible) return Status::OK(); invariant(comp == VersionStatus_NeedUpgrade); // // Run through the upgrade steps necessary to bring our config version to the current // version // log() << "initializing config server version to " << CURRENT_CONFIG_VERSION; status = makeConfigVersionDocument(txn, catalogManager); if (!status.isOK()) return status; log() << "initialization of config server to v" << CURRENT_CONFIG_VERSION << " successful"; return Status::OK(); }
CollectionMetadata* CollectionMetadata::cloneMerge(const BSONObj& minKey, const BSONObj& maxKey, const ChunkVersion& newShardVersion, string* errMsg) const { if (newShardVersion <= _shardVersion) { *errMsg = stream() << "cannot merge range " << rangeToString(minKey, maxKey) << ", new shard version " << newShardVersion.toString() << " is not greater than current version " << _shardVersion.toString(); warning() << *errMsg; return NULL; } RangeVector overlap; getRangeMapOverlap(_chunksMap, minKey, maxKey, &overlap); if (overlap.empty() || overlap.size() == 1) { *errMsg = stream() << "cannot merge range " << rangeToString(minKey, maxKey) << (overlap.empty() ? ", no chunks found in this range" : ", only one chunk found in this range"); warning() << *errMsg; return NULL; } bool validStartEnd = true; bool validNoHoles = true; if (overlap.begin()->first.woCompare(minKey) != 0) { // First chunk doesn't start with minKey validStartEnd = false; } else if (overlap.rbegin()->second.woCompare(maxKey) != 0) { // Last chunk doesn't end with maxKey validStartEnd = false; } else { // Check that there are no holes BSONObj prevMaxKey = minKey; for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) { if (it->first.woCompare(prevMaxKey) != 0) { validNoHoles = false; break; } prevMaxKey = it->second; } } if (!validStartEnd || !validNoHoles) { *errMsg = stream() << "cannot merge range " << rangeToString(minKey, maxKey) << ", overlapping chunks " << overlapToString(overlap) << (!validStartEnd ? " do not have the same min and max key" : " are not all adjacent"); warning() << *errMsg; return NULL; } unique_ptr<CollectionMetadata> metadata(new CollectionMetadata); metadata->_keyPattern = this->_keyPattern; metadata->_keyPattern.getOwned(); metadata->fillKeyPatternFields(); metadata->_pendingMap = this->_pendingMap; metadata->_chunksMap = this->_chunksMap; metadata->_rangesMap = this->_rangesMap; metadata->_shardVersion = newShardVersion; metadata->_collVersion = newShardVersion > _collVersion ? newShardVersion : this->_collVersion; for (RangeVector::iterator it = overlap.begin(); it != overlap.end(); ++it) { metadata->_chunksMap.erase(it->first); } metadata->_chunksMap.insert(make_pair(minKey, maxKey)); invariant(metadata->isValid()); return metadata.release(); }