Status CatalogManagerReplicaSet::getChunks(const BSONObj& query,
                                           const BSONObj& sort,
                                           boost::optional<int> limit,
                                           vector<ChunkType>* chunks) {
    chunks->clear();

    auto configShard = grid.shardRegistry()->getShard("config");
    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    // Convert boost::optional<int> to boost::optional<long long>.
    auto longLimit = limit ? boost::optional<long long>(*limit) : boost::none;

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHostStatus.getValue(),
                                                           NamespaceString(ChunkType::ConfigNS),
                                                           query,
                                                           sort,
                                                           longLimit);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    for (const BSONObj& obj : findStatus.getValue()) {
        auto chunkRes = ChunkType::fromBSON(obj);
        if (!chunkRes.isOK()) {
            chunks->clear();
            return {ErrorCodes::FailedToParse,
                    stream() << "Failed to parse chunk with id ("
                             << obj[ChunkType::name()].toString()
                             << "): " << chunkRes.getStatus().toString()};
        }

        chunks->push_back(chunkRes.getValue());
    }

    return Status::OK();
}
Status CatalogManagerReplicaSet::getDatabasesForShard(const string& shardName,
                                                      vector<string>* dbs) {
    auto configShard = grid.shardRegistry()->getShard("config");
    auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHost.getValue(),
                                                           NamespaceString(DatabaseType::ConfigNS),
                                                           BSON(DatabaseType::primary(shardName)),
                                                           BSONObj(),
                                                           boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    for (const BSONObj& obj : findStatus.getValue()) {
        string dbName;
        Status status = bsonExtractStringField(obj, DatabaseType::name(), &dbName);
        if (!status.isOK()) {
            dbs->clear();
            return status;
        }

        dbs->push_back(dbName);
    }

    return Status::OK();
}
Status CatalogManagerReplicaSet::getTagsForCollection(const std::string& collectionNs,
                                                      std::vector<TagsType>* tags) {
    tags->clear();

    auto configShard = grid.shardRegistry()->getShard("config");
    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHostStatus.getValue(),
                                                           NamespaceString(TagsType::ConfigNS),
                                                           BSON(TagsType::ns(collectionNs)),
                                                           BSON(TagsType::min() << 1),
                                                           boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    for (const BSONObj& obj : findStatus.getValue()) {
        auto tagRes = TagsType::fromBSON(obj);
        if (!tagRes.isOK()) {
            tags->clear();
            return Status(ErrorCodes::FailedToParse,
                          str::stream()
                              << "Failed to parse tag: " << tagRes.getStatus().toString());
        }

        tags->push_back(tagRes.getValue());
    }

    return Status::OK();
}
StatusWith<OpTimePair<CollectionType>> CatalogManagerReplicaSet::getCollection(
    OperationContext* txn, const std::string& collNs) {
    auto configShard = grid.shardRegistry()->getShard(txn, "config");

    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    auto statusFind = _exhaustiveFindOnConfig(readHostStatus.getValue(),
                                              NamespaceString(CollectionType::ConfigNS),
                                              BSON(CollectionType::fullNs(collNs)),
                                              BSONObj(),
                                              1);
    if (!statusFind.isOK()) {
        return statusFind.getStatus();
    }

    const auto& retOpTimePair = statusFind.getValue();
    const auto& retVal = retOpTimePair.value;
    if (retVal.empty()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      stream() << "collection " << collNs << " not found");
    }

    invariant(retVal.size() == 1);

    auto parseStatus = CollectionType::fromBSON(retVal.front());
    if (!parseStatus.isOK()) {
        return parseStatus.getStatus();
    }

    return OpTimePair<CollectionType>(parseStatus.getValue(), retOpTimePair.opTime);
}
StatusWith<string> CatalogManagerReplicaSet::getTagForChunk(const std::string& collectionNs,
                                                            const ChunkType& chunk) {
    auto configShard = grid.shardRegistry()->getShard("config");
    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    BSONObj query = BSON(TagsType::ns(collectionNs)
                         << TagsType::min() << BSON("$lte" << chunk.getMin())
                         << TagsType::max() << BSON("$gte" << chunk.getMax()));
    auto findStatus = grid.shardRegistry()->exhaustiveFind(
        readHostStatus.getValue(), NamespaceString(TagsType::ConfigNS), query, BSONObj(), 1);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docs = findStatus.getValue();
    if (docs.empty()) {
        return string{};
    }

    invariant(docs.size() == 1);
    BSONObj tagsDoc = docs.front();

    const auto tagsResult = TagsType::fromBSON(tagsDoc);
    if (!tagsResult.isOK()) {
        return {ErrorCodes::FailedToParse,
                stream() << "error while parsing " << TagsType::ConfigNS << " document: "
                         << tagsDoc << " : " << tagsResult.getStatus().toString()};
    }

    return tagsResult.getValue().getTag();
}
StatusWith<CollectionType> CatalogManagerReplicaSet::getCollection(const std::string& collNs) {
    auto configShard = grid.shardRegistry()->getShard("config");

    auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    auto statusFind =
        grid.shardRegistry()->exhaustiveFind(readHostStatus.getValue(),
                                             NamespaceString(CollectionType::ConfigNS),
                                             BSON(CollectionType::fullNs(collNs)),
                                             BSONObj(),
                                             1);
    if (!statusFind.isOK()) {
        return statusFind.getStatus();
    }

    const auto& retVal = statusFind.getValue();
    if (retVal.empty()) {
        return Status(ErrorCodes::NamespaceNotFound,
                      stream() << "collection " << collNs << " not found");
    }

    invariant(retVal.size() == 1);

    return CollectionType::fromBSON(retVal.front());
}
Status CatalogManagerReplicaSet::getAllShards(vector<ShardType>* shards) {
    const auto configShard = grid.shardRegistry()->getShard("config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHost.getValue(),
                                                           NamespaceString(ShardType::ConfigNS),
                                                           BSONObj(),     // no query filter
                                                           BSONObj(),     // no sort
                                                           boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    for (const BSONObj& doc : findStatus.getValue()) {
        auto shardRes = ShardType::fromBSON(doc);
        if (!shardRes.isOK()) {
            shards->clear();
            return {ErrorCodes::FailedToParse,
                    stream() << "Failed to parse shard with id ("
                             << doc[ShardType::name()].toString()
                             << "): " << shardRes.getStatus().toString()};
        }

        shards->push_back(shardRes.getValue());
    }

    return Status::OK();
}
StatusWith<VersionType> CatalogManagerReplicaSet::_getConfigVersion(OperationContext* txn) {
    const auto configShard = grid.shardRegistry()->getShard(txn, "config");
    const auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    auto readHost = readHostStatus.getValue();
    auto findStatus = _exhaustiveFindOnConfig(readHost,
                                              NamespaceString(VersionType::ConfigNS),
                                              BSONObj(),
                                              BSONObj(),
                                              boost::none /* no limit */);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    auto queryResults = findStatus.getValue().value;

    if (queryResults.size() > 1) {
        return {ErrorCodes::RemoteValidationError,
                str::stream() << "should only have 1 document in " << VersionType::ConfigNS};
    }

    if (queryResults.empty()) {
        auto countStatus =
            _runCountCommandOnConfig(readHost, NamespaceString(ShardType::ConfigNS), BSONObj());
        if (!countStatus.isOK()) {
            return countStatus.getStatus();
        }

        const auto& shardCount = countStatus.getValue();
        if (shardCount > 0) {
            // Version document doesn't exist, but config.shards is not empty. Assuming that
            // the current config metadata is pre v2.4.
            VersionType versionInfo;
            versionInfo.setMinCompatibleVersion(UpgradeHistory_UnreportedVersion);
            versionInfo.setCurrentVersion(UpgradeHistory_UnreportedVersion);
            return versionInfo;
        }

        VersionType versionInfo;
        versionInfo.setMinCompatibleVersion(UpgradeHistory_EmptyVersion);
        versionInfo.setCurrentVersion(UpgradeHistory_EmptyVersion);
        return versionInfo;
    }

    BSONObj versionDoc = queryResults.front();
    auto versionTypeResult = VersionType::fromBSON(versionDoc);
    if (!versionTypeResult.isOK()) {
        return Status(ErrorCodes::UnsupportedFormat,
                      str::stream() << "invalid config version document: " << versionDoc
                                    << versionTypeResult.getStatus().toString());
    }

    return versionTypeResult.getValue();
}
Status MigrationChunkClonerSourceLegacy::startClone(OperationContext* txn) {
    invariant(!txn->lockState()->isLocked());
    auto scopedGuard = MakeGuard([&] { cancelClone(txn); });

    // Resolve the donor and recipient shards and their connection string
    {
        auto donorShardStatus = grid.shardRegistry()->getShard(txn, _args.getFromShardId());
        if (!donorShardStatus.isOK()) {
            return donorShardStatus.getStatus();
        }
        _donorCS = donorShardStatus.getValue()->getConnString();
    }

    {
        auto recipientShardStatus = grid.shardRegistry()->getShard(txn, _args.getToShardId());
        if (!recipientShardStatus.isOK()) {
            return recipientShardStatus.getStatus();
        }

        auto recipientShard = recipientShardStatus.getValue();
        auto shardHostStatus = recipientShard->getTargeter()->findHost(
            ReadPreferenceSetting{ReadPreference::PrimaryOnly});
        if (!shardHostStatus.isOK()) {
            return shardHostStatus.getStatus();
        }

        _recipientHost = std::move(shardHostStatus.getValue());
    }

    // Prepare the currently available documents
    Status status = _storeCurrentLocs(txn);
    if (!status.isOK()) {
        return status;
    }

    // Tell the recipient shard to start cloning
    BSONObjBuilder cmdBuilder;
    StartChunkCloneRequest::appendAsCommand(&cmdBuilder,
                                            _args.getNss(),
                                            _sessionId,
                                            _args.getConfigServerCS(),
                                            _donorCS,
                                            _args.getFromShardId(),
                                            _args.getToShardId(),
                                            _args.getMinKey(),
                                            _args.getMaxKey(),
                                            _shardKeyPattern.toBSON(),
                                            _args.getSecondaryThrottle());

    auto responseStatus = _callRecipient(cmdBuilder.obj());
    if (!responseStatus.isOK()) {
        return responseStatus.getStatus();
    }

    scopedGuard.Dismiss();
    return Status::OK();
}
void ChunkManager::calcInitSplitsAndShards(OperationContext* txn,
                                           const ShardId& primaryShardId,
                                           const vector<BSONObj>* initPoints,
                                           const set<ShardId>* initShardIds,
                                           vector<BSONObj>* splitPoints,
                                           vector<ShardId>* shardIds) const {
    verify(_chunkMap.size() == 0);

    Chunk c(this,
            _keyPattern.getKeyPattern().globalMin(),
            _keyPattern.getKeyPattern().globalMax(),
            primaryShardId);

    if (!initPoints || !initPoints->size()) {
        // discover split points
        const auto primaryShard = grid.shardRegistry()->getShard(txn, primaryShardId);
        auto targetStatus =
            primaryShard->getTargeter()->findHost({ReadPreference::PrimaryPreferred, TagSet{}});
        uassertStatusOK(targetStatus);

        NamespaceString nss(getns());
        auto result = grid.shardRegistry()->runCommand(
            txn, targetStatus.getValue(), nss.db().toString(), BSON("count" << nss.coll()));

        long long numObjects = 0;
        uassertStatusOK(result.getStatus());
        uassertStatusOK(Command::getStatusFromCommandResult(result.getValue()));
        uassertStatusOK(bsonExtractIntegerField(result.getValue(), "n", &numObjects));

        if (numObjects > 0)
            c.pickSplitVector(txn, *splitPoints, Chunk::MaxChunkSize);

        // since docs already exist, must use the primary shard
        shardIds->push_back(primaryShardId);
    } else {
        // make sure points are unique and ordered
        set<BSONObj> orderedPts;
        for (unsigned i = 0; i < initPoints->size(); ++i) {
            BSONObj pt = (*initPoints)[i];
            orderedPts.insert(pt);
        }
        for (set<BSONObj>::iterator it = orderedPts.begin(); it != orderedPts.end(); ++it) {
            splitPoints->push_back(*it);
        }

        if (!initShardIds || !initShardIds->size()) {
            // If not specified, only use the primary shard (note that it's not safe for mongos
            // to put initial chunks on other shards without the primary mongod knowing).
            shardIds->push_back(primaryShardId);
        } else {
            std::copy(initShardIds->begin(), initShardIds->end(), std::back_inserter(*shardIds));
        }
    }
}
Status AsyncRequestsSender::RemoteData::resolveShardIdToHostAndPort(
    AsyncRequestsSender* ars, const ReadPreferenceSetting& readPref) {
    const auto shard = getShard();
    if (!shard) {
        return Status(ErrorCodes::ShardNotFound,
                      str::stream() << "Could not find shard " << shardId);
    }

    auto findHostStatus = shard->getTargeter()->findHost(ars->_opCtx, readPref);
    if (findHostStatus.isOK())
        shardHostAndPort = std::move(findHostStatus.getValue());

    return findHostStatus.getStatus();
}
Status CatalogManagerReplicaSet::getCollections(OperationContext* txn,
                                                const std::string* dbName,
                                                std::vector<CollectionType>* collections,
                                                OpTime* opTime) {
    BSONObjBuilder b;
    if (dbName) {
        invariant(!dbName->empty());
        b.appendRegex(CollectionType::fullNs(),
                      string(str::stream() << "^" << pcrecpp::RE::QuoteMeta(*dbName) << "\\."));
    }

    auto configShard = grid.shardRegistry()->getShard(txn, "config");
    auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    auto findStatus = _exhaustiveFindOnConfig(readHost.getValue(),
                                              NamespaceString(CollectionType::ConfigNS),
                                              b.obj(),
                                              BSONObj(),
                                              boost::none);  // no limit
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docsOpTimePair = findStatus.getValue();

    for (const BSONObj& obj : docsOpTimePair.value) {
        const auto collectionResult = CollectionType::fromBSON(obj);
        if (!collectionResult.isOK()) {
            collections->clear();
            return {ErrorCodes::FailedToParse,
                    str::stream() << "error while parsing " << CollectionType::ConfigNS
                                  << " document: " << obj << " : "
                                  << collectionResult.getStatus().toString()};
        }

        collections->push_back(collectionResult.getValue());
    }

    if (opTime) {
        *opTime = docsOpTimePair.opTime;
    }

    return Status::OK();
}
Status CatalogManagerReplicaSet::_checkDbDoesNotExist(OperationContext* txn,
                                                      const string& dbName,
                                                      DatabaseType* db) {
    BSONObjBuilder queryBuilder;
    queryBuilder.appendRegex(
        DatabaseType::name(), (string) "^" + pcrecpp::RE::QuoteMeta(dbName) + "$", "i");

    const auto configShard = grid.shardRegistry()->getShard(txn, "config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    auto findStatus = _exhaustiveFindOnConfig(readHost.getValue(),
                                              NamespaceString(DatabaseType::ConfigNS),
                                              queryBuilder.obj(),
                                              BSONObj(),
                                              1);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docs = findStatus.getValue().value;
    if (docs.empty()) {
        return Status::OK();
    }

    BSONObj dbObj = docs.front();
    std::string actualDbName = dbObj[DatabaseType::name()].String();
    if (actualDbName == dbName) {
        if (db) {
            auto parseDBStatus = DatabaseType::fromBSON(dbObj);
            if (!parseDBStatus.isOK()) {
                return parseDBStatus.getStatus();
            }

            *db = parseDBStatus.getValue();
        }

        return Status(ErrorCodes::NamespaceExists,
                      str::stream() << "database " << dbName << " already exists");
    }

    return Status(ErrorCodes::DatabaseDifferCase,
                  str::stream() << "can't have 2 databases that just differ on case "
                                << " have: " << actualDbName << " want to add: " << dbName);
}
StatusWith<OpTimePair<DatabaseType>> CatalogManagerReplicaSet::getDatabase(
    OperationContext* txn, const std::string& dbName) {
    invariant(nsIsDbOnly(dbName));

    // The two databases that are hosted on the config server are config and admin
    if (dbName == "config" || dbName == "admin") {
        DatabaseType dbt;
        dbt.setName(dbName);
        dbt.setSharded(false);
        dbt.setPrimary("config");

        return OpTimePair<DatabaseType>(dbt);
    }

    const auto configShard = grid.shardRegistry()->getShard(txn, "config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    auto findStatus = _exhaustiveFindOnConfig(readHost.getValue(),
                                              NamespaceString(DatabaseType::ConfigNS),
                                              BSON(DatabaseType::name(dbName)),
                                              BSONObj(),
                                              1);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docsWithOpTime = findStatus.getValue();
    if (docsWithOpTime.value.empty()) {
        return {ErrorCodes::DatabaseNotFound, stream() << "database " << dbName << " not found"};
    }

    invariant(docsWithOpTime.value.size() == 1);

    auto parseStatus = DatabaseType::fromBSON(docsWithOpTime.value.front());
    if (!parseStatus.isOK()) {
        return parseStatus.getStatus();
    }

    return OpTimePair<DatabaseType>(parseStatus.getValue(), docsWithOpTime.opTime);
}
StatusWith<std::string> CatalogManagerReplicaSet::_generateNewShardName() const {
    const auto configShard = grid.shardRegistry()->getShard("config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    BSONObjBuilder shardNameRegex;
    shardNameRegex.appendRegex(ShardType::name(), "^shard");

    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHost.getValue(),
                                                           NamespaceString(ShardType::ConfigNS),
                                                           shardNameRegex.obj(),
                                                           BSON(ShardType::name() << -1),
                                                           1);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docs = findStatus.getValue();

    int count = 0;
    if (!docs.empty()) {
        const auto shardStatus = ShardType::fromBSON(docs.front());
        if (!shardStatus.isOK()) {
            return shardStatus.getStatus();
        }

        std::istringstream is(shardStatus.getValue().getName().substr(5));
        is >> count;
        count++;
    }

    // TODO fix so that we can have more than 10000 automatically generated shard names
    if (count < 9999) {
        std::stringstream ss;
        ss << "shard" << std::setfill('0') << std::setw(4) << count;
        return ss.str();
    }

    return Status(ErrorCodes::OperationFailed, "unable to generate new shard name");
}
Status AsyncResultsMerger::RemoteCursorData::resolveShardIdToHostAndPort(
    const ReadPreferenceSetting& readPref) {
    invariant(shardId);
    invariant(!cursorId);

    const auto shard = getShard();
    if (!shard) {
        return Status(ErrorCodes::ShardNotFound,
                      str::stream() << "Could not find shard " << *shardId);
    }

    // TODO: Pass down an OperationContext* to use here.
    auto findHostStatus = shard->getTargeter()->findHostWithMaxWait(readPref, Seconds{20});
    if (!findHostStatus.isOK()) {
        return findHostStatus.getStatus();
    }

    _shardHostAndPort = std::move(findHostStatus.getValue());

    return Status::OK();
}
Status AsyncResultsMerger::RemoteCursorData::resolveShardIdToHostAndPort(
    OperationContext* txn, const ReadPreferenceSetting& readPref) {
    invariant(shardId);
    invariant(!cursorId);

    const auto shard = grid.shardRegistry()->getShard(txn, *shardId);
    if (!shard) {
        return Status(ErrorCodes::ShardNotFound,
                      str::stream() << "Could not find shard " << *shardId);
    }

    auto findHostStatus = shard->getTargeter()->findHost(
        readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(txn));
    if (!findHostStatus.isOK()) {
        return findHostStatus.getStatus();
    }

    _shardHostAndPort = std::move(findHostStatus.getValue());

    return Status::OK();
}
StatusWith<SettingsType> CatalogManagerReplicaSet::getGlobalSettings(OperationContext* txn,
                                                                     const string& key) {
    const auto configShard = grid.shardRegistry()->getShard(txn, "config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    auto findStatus = _exhaustiveFindOnConfig(readHost.getValue(),
                                              NamespaceString(SettingsType::ConfigNS),
                                              BSON(SettingsType::key(key)),
                                              BSONObj(),
                                              1);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    const auto& docs = findStatus.getValue().value;
    if (docs.empty()) {
        return {ErrorCodes::NoMatchingDocument,
                str::stream() << "can't find settings document with key: " << key};
    }

    BSONObj settingsDoc = docs.front();
    StatusWith<SettingsType> settingsResult = SettingsType::fromBSON(settingsDoc);
    if (!settingsResult.isOK()) {
        return {ErrorCodes::FailedToParse,
                str::stream() << "error while parsing settings document: " << settingsDoc << " : "
                              << settingsResult.getStatus().toString()};
    }

    const SettingsType& settings = settingsResult.getValue();

    Status validationStatus = settings.validate();
    if (!validationStatus.isOK()) {
        return validationStatus;
    }

    return settingsResult;
}
void MigrationManager::_executeMigrations(OperationContext* txn,
                                          MigrationStatuses* migrationStatuses) {
    for (auto& migration : _activeMigrations) {
        const NamespaceString nss(migration.chunkInfo.migrateInfo.ns);

        auto scopedCMStatus = ScopedChunkManager::getExisting(txn, nss);
        if (!scopedCMStatus.isOK()) {
            // Unable to find the ChunkManager for "nss" for whatever reason; abandon this
            // migration and proceed to the next.
            stdx::lock_guard<stdx::mutex> lk(_mutex);
            migrationStatuses->insert(MigrationStatuses::value_type(
                migration.chunkInfo.migrateInfo.getName(), std::move(scopedCMStatus.getStatus())));
            continue;
        }

        ChunkManager* const chunkManager = scopedCMStatus.getValue().cm();

        auto chunk =
            chunkManager->findIntersectingChunk(txn, migration.chunkInfo.migrateInfo.minKey);

        {
            // No need to lock the mutex. Only this function and _takeDistLockForAMigration
            // manipulate "_distributedLocks". No need to protect serial actions.
            if (!_takeDistLockForAMigration(txn, migration, migrationStatuses)) {
                // If there is a lock conflict between the balancer and the shard, or a shard and a
                // shard, the migration has been rescheduled. Otherwise an attempt to take the lock
                // failed for whatever reason and this migration is being abandoned.
                continue;
            }
        }

        const MigrationRequest& migrationRequest = migration.chunkInfo;

        BSONObjBuilder builder;
        MoveChunkRequest::appendAsCommand(
            &builder,
            nss,
            chunkManager->getVersion(),
            Grid::get(txn)->shardRegistry()->getConfigServerConnectionString(),
            migrationRequest.migrateInfo.from,
            migrationRequest.migrateInfo.to,
            ChunkRange(chunk->getMin(), chunk->getMax()),
            migrationRequest.maxChunkSizeBytes,
            migrationRequest.secondaryThrottle,
            migrationRequest.waitForDelete,
            migration.oldShard ? true : false);  // takeDistLock flag.

        BSONObj moveChunkRequestObj = builder.obj();

        const auto recipientShard =
            grid.shardRegistry()->getShard(txn, migration.chunkInfo.migrateInfo.from);
        const auto host = recipientShard->getTargeter()->findHost(
            ReadPreferenceSetting{ReadPreference::PrimaryOnly},
            RemoteCommandTargeter::selectFindHostMaxWaitTime(txn));
        if (!host.isOK()) {
            // Unable to find a target shard for whatever reason; abandon this migration and
            // proceed to the next.
            stdx::lock_guard<stdx::mutex> lk(_mutex);
            migrationStatuses->insert(MigrationStatuses::value_type(
                migration.chunkInfo.migrateInfo.getName(), std::move(host.getStatus())));
            continue;
        }

        RemoteCommandRequest remoteRequest(host.getValue(), "admin", moveChunkRequestObj);

        StatusWith<RemoteCommandResponse> remoteCommandResponse(
            Status{ErrorCodes::InternalError, "Uninitialized value"});

        executor::TaskExecutor* executor = Grid::get(txn)->getExecutorPool()->getFixedExecutor();

        StatusWith<executor::TaskExecutor::CallbackHandle> callbackHandleWithStatus =
            executor->scheduleRemoteCommand(remoteRequest,
                                            stdx::bind(&MigrationManager::_checkMigrationCallback,
                                                       this,
                                                       stdx::placeholders::_1,
                                                       txn,
                                                       &migration,
                                                       migrationStatuses));

        if (!callbackHandleWithStatus.isOK()) {
            // Scheduling the migration moveChunk failed.
            stdx::lock_guard<stdx::mutex> lk(_mutex);
            migrationStatuses->insert(
                MigrationStatuses::value_type(migration.chunkInfo.migrateInfo.getName(),
                                              std::move(callbackHandleWithStatus.getStatus())));
            continue;
        }

        // The moveChunk command was successfully scheduled. Store the callback handle so that the
        // command's return can be waited for later.
        stdx::lock_guard<stdx::mutex> lk(_mutex);
        migration.setCallbackHandle(std::move(callbackHandleWithStatus.getValue()));
    }

    _waitForMigrations(txn);

    // At this point, there are no parallel running threads so it is safe not to lock the mutex.
    // All the migrations have returned, release all of the distributed locks that are no longer
    // being used.
    _distributedLocks.clear();

    // If there are rescheduled migrations, move them to active and run the function again.
    if (!_rescheduledMigrations.empty()) {
        // Clear all the callback handles of the rescheduled migrations.
        for (auto& migration : _rescheduledMigrations) {
            migration.clearCallbackHandle();
        }

        _activeMigrations = std::move(_rescheduledMigrations);
        _rescheduledMigrations.clear();
        _executeMigrations(txn, migrationStatuses);
    } else {
        _activeMigrations.clear();
    }
}
StatusWith<ShardDrainingStatus> CatalogManagerReplicaSet::removeShard(OperationContext* txn,
                                                                      const std::string& name) {
    const auto configShard = grid.shardRegistry()->getShard("config");
    const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHost.isOK()) {
        return readHost.getStatus();
    }

    // Check preconditions for removing the shard
    auto countStatus =
        _runCountCommand(readHost.getValue(),
                         NamespaceString(ShardType::ConfigNS),
                         BSON(ShardType::name() << NE << name << ShardType::draining(true)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    if (countStatus.getValue() > 0) {
        return Status(ErrorCodes::ConflictingOperationInProgress,
                      "Can't have more than one draining shard at a time");
    }

    countStatus = _runCountCommand(readHost.getValue(),
                                   NamespaceString(ShardType::ConfigNS),
                                   BSON(ShardType::name() << NE << name));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    if (countStatus.getValue() == 0) {
        return Status(ErrorCodes::IllegalOperation, "Can't remove last shard");
    }

    // Figure out if shard is already draining
    countStatus = _runCountCommand(readHost.getValue(),
                                   NamespaceString(ShardType::ConfigNS),
                                   BSON(ShardType::name() << name << ShardType::draining(true)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    if (countStatus.getValue() == 0) {
        log() << "going to start draining shard: " << name;

        Status status = update(ShardType::ConfigNS,
                               BSON(ShardType::name() << name),
                               BSON("$set" << BSON(ShardType::draining(true))),
                               false,  // upsert
                               false,  // multi
                               NULL);
        if (!status.isOK()) {
            log() << "error starting removeShard: " << name << "; err: " << status.reason();
            return status;
        }

        grid.shardRegistry()->reload();

        // Record start in changelog
        logChange(
            txn->getClient()->clientAddress(true), "removeShard.start", "", BSON("shard" << name));
        return ShardDrainingStatus::STARTED;
    }

    // Draining has already started, now figure out how many chunks and databases are still on the
    // shard.
    countStatus = _runCountCommand(
        readHost.getValue(), NamespaceString(ChunkType::ConfigNS), BSON(ChunkType::shard(name)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    const long long chunkCount = countStatus.getValue();

    countStatus = _runCountCommand(readHost.getValue(),
                                   NamespaceString(DatabaseType::ConfigNS),
                                   BSON(DatabaseType::primary(name)));
    if (!countStatus.isOK()) {
        return countStatus.getStatus();
    }
    const long long databaseCount = countStatus.getValue();

    if (chunkCount > 0 || databaseCount > 0) {
        // Still more draining to do
        return ShardDrainingStatus::ONGOING;
    }

    // Draining is done, now finish removing the shard.
    log() << "going to remove shard: " << name;
    audit::logRemoveShard(txn->getClient(), name);

    Status status = remove(ShardType::ConfigNS, BSON(ShardType::name() << name), 0, NULL);
    if (!status.isOK()) {
        log() << "Error concluding removeShard operation on: " << name
              << "; err: " << status.reason();
        return status;
    }

    grid.shardRegistry()->remove(name);
    grid.shardRegistry()->reload();

    // Record finish in changelog
    logChange(txn->getClient()->clientAddress(true), "removeShard", "", BSON("shard" << name));

    return ShardDrainingStatus::COMPLETED;
}
Status CatalogManagerReplicaSet::shardCollection(OperationContext* txn,
                                                 const string& ns,
                                                 const ShardKeyPattern& fieldsAndOrder,
                                                 bool unique,
                                                 const vector<BSONObj>& initPoints,
                                                 const set<ShardId>& initShardIds) {
    // Lock the collection globally so that no other mongos can try to shard or drop the collection
    // at the same time.
    auto scopedDistLock = getDistLockManager()->lock(ns, "shardCollection");
    if (!scopedDistLock.isOK()) {
        return scopedDistLock.getStatus();
    }

    StatusWith<DatabaseType> status = getDatabase(nsToDatabase(ns));
    if (!status.isOK()) {
        return status.getStatus();
    }

    DatabaseType dbt = status.getValue();
    ShardId dbPrimaryShardId = dbt.getPrimary();
    const auto primaryShard = grid.shardRegistry()->getShard(dbPrimaryShardId);

    {
        // In 3.0 and prior we include this extra safety check that the collection is not getting
        // sharded concurrently by two different mongos instances. It is not 100%-proof, but it
        // reduces the chance that two invocations of shard collection will step on each other's
        // toes. Now we take the distributed lock so going forward this check won't be necessary
        // but we leave it around for compatibility with other mongoses from 3.0.
        // TODO(spencer): Remove this after 3.2 ships.
        const auto configShard = grid.shardRegistry()->getShard("config");
        const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
        if (!readHost.isOK()) {
            return readHost.getStatus();
        }

        auto countStatus = _runCountCommand(
            readHost.getValue(), NamespaceString(ChunkType::ConfigNS), BSON(ChunkType::ns(ns)));
        if (!countStatus.isOK()) {
            return countStatus.getStatus();
        }
        if (countStatus.getValue() > 0) {
            return Status(ErrorCodes::AlreadyInitialized,
                          str::stream() << "collection " << ns << " already sharded with "
                                        << countStatus.getValue() << " chunks.");
        }
    }

    // Record start in changelog
    {
        BSONObjBuilder collectionDetail;
        collectionDetail.append("shardKey", fieldsAndOrder.toBSON());
        collectionDetail.append("collection", ns);
        collectionDetail.append("primary", primaryShard->toString());

        {
            BSONArrayBuilder initialShards(collectionDetail.subarrayStart("initShards"));
            for (const ShardId& shardId : initShardIds) {
                initialShards.append(shardId);
            }
        }

        collectionDetail.append("numChunks", static_cast<int>(initPoints.size() + 1));

        logChange(txn->getClient()->clientAddress(true),
                  "shardCollection.start",
                  ns,
                  collectionDetail.obj());
    }

    ChunkManagerPtr manager(new ChunkManager(ns, fieldsAndOrder, unique));
    manager->createFirstChunks(dbPrimaryShardId, &initPoints, &initShardIds);
    manager->loadExistingRanges(nullptr);

    CollectionInfo collInfo;
    collInfo.useChunkManager(manager);
    collInfo.save(ns);
    manager->reload(true);

    // TODO(spencer) SERVER-19319: Send setShardVersion to primary shard so it knows to start
    // rejecting unversioned writes.

    BSONObj finishDetail =
        BSON("version" << "");  // TODO(spencer) SERVER-19319 Report actual version used
    logChange(txn->getClient()->clientAddress(true), "shardCollection", ns, finishDetail);

    return Status::OK();
}
Status CatalogManagerReplicaSet::shardCollection(OperationContext* txn,
                                                 const string& ns,
                                                 const ShardKeyPattern& fieldsAndOrder,
                                                 bool unique,
                                                 const vector<BSONObj>& initPoints,
                                                 const set<ShardId>& initShardIds) {
    // Lock the collection globally so that no other mongos can try to shard or drop the collection
    // at the same time.
    auto scopedDistLock = getDistLockManager()->lock(ns, "shardCollection");
    if (!scopedDistLock.isOK()) {
        return scopedDistLock.getStatus();
    }

    auto status = getDatabase(txn, nsToDatabase(ns));
    if (!status.isOK()) {
        return status.getStatus();
    }

    ShardId dbPrimaryShardId = status.getValue().value.getPrimary();
    const auto primaryShard = grid.shardRegistry()->getShard(txn, dbPrimaryShardId);

    {
        // In 3.0 and prior we include this extra safety check that the collection is not getting
        // sharded concurrently by two different mongos instances. It is not 100%-proof, but it
        // reduces the chance that two invocations of shard collection will step on each other's
        // toes. Now we take the distributed lock so going forward this check won't be necessary
        // but we leave it around for compatibility with other mongoses from 3.0.
        // TODO(spencer): Remove this after 3.2 ships.
        const auto configShard = grid.shardRegistry()->getShard(txn, "config");
        const auto readHost = configShard->getTargeter()->findHost(kConfigReadSelector);
        if (!readHost.isOK()) {
            return readHost.getStatus();
        }

        auto countStatus = _runCountCommandOnConfig(
            readHost.getValue(), NamespaceString(ChunkType::ConfigNS), BSON(ChunkType::ns(ns)));
        if (!countStatus.isOK()) {
            return countStatus.getStatus();
        }
        if (countStatus.getValue() > 0) {
            return Status(ErrorCodes::AlreadyInitialized,
                          str::stream() << "collection " << ns << " already sharded with "
                                        << countStatus.getValue() << " chunks.");
        }
    }

    // Record start in changelog
    {
        BSONObjBuilder collectionDetail;
        collectionDetail.append("shardKey", fieldsAndOrder.toBSON());
        collectionDetail.append("collection", ns);
        collectionDetail.append("primary", primaryShard->toString());

        {
            BSONArrayBuilder initialShards(collectionDetail.subarrayStart("initShards"));
            for (const ShardId& shardId : initShardIds) {
                initialShards.append(shardId);
            }
        }

        collectionDetail.append("numChunks", static_cast<int>(initPoints.size() + 1));

        logChange(txn,
                  txn->getClient()->clientAddress(true),
                  "shardCollection.start",
                  ns,
                  collectionDetail.obj());
    }

    shared_ptr<ChunkManager> manager(new ChunkManager(ns, fieldsAndOrder, unique));
    manager->createFirstChunks(txn, dbPrimaryShardId, &initPoints, &initShardIds);
    manager->loadExistingRanges(txn, nullptr);

    CollectionInfo collInfo;
    collInfo.useChunkManager(manager);
    collInfo.save(txn, ns);
    manager->reload(txn, true);

    // Tell the primary mongod to refresh its data
    // TODO: Think the real fix here is for mongos to just
    //       assume that all collections are sharded, when we get there
    SetShardVersionRequest ssv = SetShardVersionRequest::makeForVersioningNoPersist(
        grid.shardRegistry()->getConfigServerConnectionString(),
        dbPrimaryShardId,
        primaryShard->getConnString(),
        NamespaceString(ns),
        ChunkVersionAndOpTime(manager->getVersion(), manager->getConfigOpTime()),
        true);

    auto ssvStatus = grid.shardRegistry()->runCommandWithNotMasterRetries(
        txn, dbPrimaryShardId, "admin", ssv.toBSON());
    if (!ssvStatus.isOK()) {
        warning() << "could not update initial version of " << ns << " on shard primary "
                  << dbPrimaryShardId << ssvStatus.getStatus();
    }

    logChange(txn,
              txn->getClient()->clientAddress(true),
              "shardCollection",
              ns,
              BSON("version" << manager->getVersion().toString()));

    return Status::OK();
}
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // If we're killed and we're not waiting on any more batches to come back, then we are
        // ready to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }

        return;
    }

    // Early returns from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    if (!cbData.response.isOK()) {
        remote.status = cbData.response.getStatus();

        // If we failed to retrieve the batch because we couldn't contact the remote, we notify
        // the targeter that the host is unreachable. The caller can then retry on a new host.
        if (remote.status == ErrorCodes::HostUnreachable && remote.shardId) {
            auto shard = _params.shardRegistry->getShard(_params.txn, *remote.shardId);
            if (!shard) {
                remote.status =
                    Status(ErrorCodes::HostUnreachable,
                           str::stream() << "Could not find shard " << *remote.shardId
                                         << " containing host " << remote.hostAndPort.toString());
            } else {
                shard->getTargeter()->markHostUnreachable(remote.hostAndPort);
            }
        }

        return;
    }

    auto getMoreParseStatus = CursorResponse::parseFromBSON(cbData.response.getValue().data);
    if (!getMoreParseStatus.isOK()) {
        remote.status = getMoreParseStatus.getStatus();
        return;
    }

    auto cursorResponse = getMoreParseStatus.getValue();

    // If we have a cursor established, and we get a non-zero cursorid that is not equal to the
    // established cursorid, we will fail the operation.
    if (remote.cursorId && cursorResponse.cursorId != 0 &&
        *remote.cursorId != cursorResponse.cursorId) {
        remote.status =
            Status(ErrorCodes::BadValue,
                   str::stream() << "Expected cursorid " << *remote.cursorId << " but received "
                                 << cursorResponse.cursorId);
        return;
    }

    remote.cursorId = cursorResponse.cursorId;
    remote.cmdObj = boost::none;

    for (const auto& obj : cursorResponse.batch) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status =
                Status(ErrorCodes::InternalError,
                       str::stream() << "Missing field '"
                                     << ClusterClientCursorParams::kSortKeyField
                                     << "' in document: " << obj);
            return;
        }

        remote.docBuffer.push(obj);
        ++remote.fetchedCount;
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.batch.empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then we can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        auto nextBatchStatus = askForNextBatch_inlock(remoteIndex);
        if (!nextBatchStatus.isOK()) {
            remote.status = nextBatchStatus;
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want the waiter to be signalled on success
    // as well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}
StatusWith<VersionType> CatalogManagerReplicaSet::_getConfigVersion() {
    const auto configShard = grid.shardRegistry()->getShard("config");
    const auto readHostStatus = configShard->getTargeter()->findHost(kConfigReadSelector);
    if (!readHostStatus.isOK()) {
        return readHostStatus.getStatus();
    }

    auto readHost = readHostStatus.getValue();
    auto findStatus = grid.shardRegistry()->exhaustiveFind(readHost,
                                                           NamespaceString(VersionType::ConfigNS),
                                                           BSONObj(),
                                                           BSONObj(),
                                                           boost::none /* no limit */);
    if (!findStatus.isOK()) {
        return findStatus.getStatus();
    }

    auto queryResults = findStatus.getValue();

    if (queryResults.size() > 1) {
        return {ErrorCodes::RemoteValidationError,
                str::stream() << "should only have 1 document in " << VersionType::ConfigNS};
    }

    if (queryResults.empty()) {
        auto cmdStatus =
            grid.shardRegistry()->runCommand(readHost, "admin", BSON("listDatabases" << 1));
        if (!cmdStatus.isOK()) {
            return cmdStatus.getStatus();
        }

        const BSONObj& cmdResult = cmdStatus.getValue();

        Status cmdResultStatus = getStatusFromCommandResult(cmdResult);
        if (!cmdResultStatus.isOK()) {
            return cmdResultStatus;
        }

        for (const auto& dbEntry : cmdResult["databases"].Obj()) {
            const string& dbName = dbEntry["name"].String();

            if (dbName != "local" && dbName != "admin") {
                VersionType versionInfo;
                versionInfo.setMinCompatibleVersion(UpgradeHistory_UnreportedVersion);
                versionInfo.setCurrentVersion(UpgradeHistory_UnreportedVersion);
                return versionInfo;
            }
        }

        VersionType versionInfo;
        versionInfo.setMinCompatibleVersion(UpgradeHistory_EmptyVersion);
        versionInfo.setCurrentVersion(UpgradeHistory_EmptyVersion);
        return versionInfo;
    }

    BSONObj versionDoc = queryResults.front();
    auto versionTypeResult = VersionType::fromBSON(versionDoc);
    if (!versionTypeResult.isOK()) {
        return Status(ErrorCodes::UnsupportedFormat,
                      str::stream() << "invalid config version document: " << versionDoc
                                    << versionTypeResult.getStatus().toString());
    }

    return versionTypeResult.getValue();
}
void AsyncResultsMerger::handleBatchResponse(
    const executor::TaskExecutor::RemoteCommandCallbackArgs& cbData, size_t remoteIndex) {
    stdx::lock_guard<stdx::mutex> lk(_mutex);

    auto& remote = _remotes[remoteIndex];

    // Clear the callback handle. This indicates that we are no longer waiting on a response from
    // 'remote'.
    remote.cbHandle = executor::TaskExecutor::CallbackHandle();

    // If we're in the process of shutting down then there's no need to process the batch.
    if (_lifecycleState != kAlive) {
        invariant(_lifecycleState == kKillStarted);

        // Make sure to wake up anyone waiting on '_currentEvent' if we're shutting down.
        signalCurrentEventIfReady_inlock();

        // Make a best effort to parse the response and retrieve the cursor id. We need the cursor
        // id in order to issue a killCursors command against it.
        if (cbData.response.isOK()) {
            auto cursorResponse = parseCursorResponse(cbData.response.getValue().data, remote);
            if (cursorResponse.isOK()) {
                remote.cursorId = cursorResponse.getValue().getCursorId();
            }
        }

        // If we're killed and we're not waiting on any more batches to come back, then we are
        // ready to kill the cursors on the remote hosts and clean up this cursor. Schedule the
        // killCursors command and signal that this cursor is now safe to destroy. We have to
        // promise not to touch any members of this class because 'this' could become invalid as
        // soon as we signal the event.
        if (!haveOutstandingBatchRequests_inlock()) {
            // If the event handle is invalid, then the executor is in the middle of shutting down,
            // and we can't schedule any more work for it to complete.
            if (_killCursorsScheduledEvent.isValid()) {
                scheduleKillCursors_inlock();
                _executor->signalEvent(_killCursorsScheduledEvent);
            }

            _lifecycleState = kKillComplete;
        }

        return;
    }

    // Early returns from this point on signal anyone waiting on an event, if ready() is true.
    ScopeGuard signaller = MakeGuard(&AsyncResultsMerger::signalCurrentEventIfReady_inlock, this);

    StatusWith<CursorResponse> cursorResponseStatus(
        cbData.response.isOK() ? parseCursorResponse(cbData.response.getValue().data, remote)
                               : cbData.response.getStatus());

    if (!cursorResponseStatus.isOK()) {
        // Notify the shard registry of the failure.
        if (remote.shardId) {
            auto shard = grid.shardRegistry()->getShard(_params.txn, *remote.shardId);
            if (!shard) {
                remote.status =
                    Status(cursorResponseStatus.getStatus().code(),
                           str::stream() << "Could not find shard " << *remote.shardId
                                         << " containing host "
                                         << remote.getTargetHost().toString());
            } else {
                ShardRegistry::updateReplSetMonitor(
                    shard->getTargeter(), remote.getTargetHost(), cursorResponseStatus.getStatus());
            }
        }

        // If the error is retriable, schedule another request.
        if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
            isPerShardRetriableError(cursorResponseStatus.getStatus().code())) {
            ++remote.retryCount;

            // Since we potentially updated the targeter that the last host it chose might be
            // faulty, the call below may end up getting a different host.
            remote.status = askForNextBatch_inlock(remoteIndex);
            if (remote.status.isOK()) {
                return;
            }

            // If we end up here, it means we failed to schedule the retry request, which is a more
            // severe error that should not be retried. Just pass through to the error handling
            // logic below.
        } else {
            remote.status = cursorResponseStatus.getStatus();
        }

        // Unreachable host errors are swallowed if the 'allowPartialResults' option is set. We
        // remove the unreachable host entirely from consideration by marking it as exhausted.
        if (_params.isAllowPartialResults) {
            remote.status = Status::OK();

            // Clear the results buffer and cursor id.
            std::queue<BSONObj> emptyBuffer;
            std::swap(remote.docBuffer, emptyBuffer);
            remote.cursorId = 0;
        }

        return;
    }

    // Cursor id successfully established.
    auto cursorResponse = std::move(cursorResponseStatus.getValue());
    remote.cursorId = cursorResponse.getCursorId();
    remote.initialCmdObj = boost::none;

    for (const auto& obj : cursorResponse.getBatch()) {
        // If there's a sort, we're expecting the remote node to give us back a sort key.
        if (!_params.sort.isEmpty() &&
            obj[ClusterClientCursorParams::kSortKeyField].type() != BSONType::Object) {
            remote.status =
                Status(ErrorCodes::InternalError,
                       str::stream() << "Missing field '"
                                     << ClusterClientCursorParams::kSortKeyField
                                     << "' in document: " << obj);
            return;
        }

        remote.docBuffer.push(obj);
        ++remote.fetchedCount;
    }

    // If we're doing a sorted merge, then we have to make sure to put this remote onto the
    // merge queue.
    if (!_params.sort.isEmpty() && !cursorResponse.getBatch().empty()) {
        _mergeQueue.push(remoteIndex);
    }

    // If the cursor is tailable and we just received an empty batch, the next return value should
    // be boost::none in order to indicate the end of the batch.
    if (_params.isTailable && !remote.hasNext()) {
        _eofNext = true;
    }

    // If even after receiving this batch we still don't have anything buffered (i.e. the batchSize
    // was zero), then we can schedule work to retrieve the next batch right away.
    //
    // We do not ask for the next batch if the cursor is tailable, as batches received from remote
    // tailable cursors should be passed through to the client without asking for more batches.
    if (!_params.isTailable && !remote.hasNext() && !remote.exhausted()) {
        remote.status = askForNextBatch_inlock(remoteIndex);
        if (!remote.status.isOK()) {
            return;
        }
    }

    // ScopeGuard requires dismiss on success, but we want the waiter to be signalled on success
    // as well as failure.
    signaller.Dismiss();
    signalCurrentEventIfReady_inlock();
}