boost::optional<Date_t> CollectionRangeDeleter::cleanUpNextRange( OperationContext* opCtx, NamespaceString const& nss, OID const& epoch, int maxToDelete, CollectionRangeDeleter* forTestOnly) { StatusWith<int> wrote = 0; auto range = boost::optional<ChunkRange>(boost::none); auto notification = DeleteNotification(); { UninterruptibleLockGuard noInterrupt(opCtx->lockState()); AutoGetCollection autoColl(opCtx, nss, MODE_IX); auto* const collection = autoColl.getCollection(); auto* const css = CollectionShardingRuntime::get(opCtx, nss); auto& metadataManager = css->_metadataManager; auto* const self = forTestOnly ? forTestOnly : &metadataManager->_rangesToClean; const auto scopedCollectionMetadata = metadataManager->getActiveMetadata(metadataManager, boost::none); if (!scopedCollectionMetadata) { LOG(0) << "Abandoning any range deletions because the metadata for " << nss.ns() << " was reset"; stdx::lock_guard<stdx::mutex> lk(css->_metadataManager->_managerLock); css->_metadataManager->_clearAllCleanups(lk); return boost::none; } const auto& metadata = *scopedCollectionMetadata; if (!forTestOnly && (!collection || !metadata->isSharded())) { if (!collection) { LOG(0) << "Abandoning any range deletions left over from dropped " << nss.ns(); } else { LOG(0) << "Abandoning any range deletions left over from previously sharded" << nss.ns(); } stdx::lock_guard<stdx::mutex> lk(css->_metadataManager->_managerLock); css->_metadataManager->_clearAllCleanups(lk); return boost::none; } if (!forTestOnly && metadata->getCollVersion().epoch() != epoch) { LOG(1) << "Range deletion task for " << nss.ns() << " epoch " << epoch << " woke;" << " (current is " << metadata->getCollVersion() << ")"; return boost::none; } bool writeOpLog = false; { stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock); if (self->isEmpty()) { LOG(1) << "No further range deletions scheduled on " << nss.ns(); return boost::none; } auto& orphans = self->_orphans; if (orphans.empty()) { // We have delayed deletions; see if any are ready. 
auto& df = self->_delayedOrphans.front(); if (df.whenToDelete > Date_t::now()) { LOG(0) << "Deferring deletion of " << nss.ns() << " range " << redact(df.range.toString()) << " until " << df.whenToDelete; return df.whenToDelete; } // Move a single range from _delayedOrphans to _orphans orphans.splice(orphans.end(), self->_delayedOrphans, self->_delayedOrphans.begin()); LOG(1) << "Proceeding with deferred deletion of " << nss.ns() << " range " << redact(orphans.front().range.toString()); writeOpLog = true; } invariant(!orphans.empty()); const auto& frontRange = orphans.front().range; range.emplace(frontRange.getMin().getOwned(), frontRange.getMax().getOwned()); notification = orphans.front().notification; } invariant(range); if (writeOpLog) { // Secondaries will watch for this update, and kill any queries that may depend on // documents in the range -- excepting any queries with a read-concern option // 'ignoreChunkMigration' try { AutoGetCollection autoAdmin( opCtx, NamespaceString::kServerConfigurationNamespace, MODE_IX); Helpers::upsert(opCtx, NamespaceString::kServerConfigurationNamespace.ns(), BSON("_id" << "startRangeDeletion" << "ns" << nss.ns() << "epoch" << epoch << "min" << range->getMin() << "max" << range->getMax())); } catch (const DBException& e) { stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock); css->_metadataManager->_clearAllCleanups( scopedLock, e.toStatus("cannot push startRangeDeletion record to Op Log," " abandoning scheduled range deletions")); return boost::none; } } try { wrote = self->_doDeletion( opCtx, collection, metadata->getKeyPattern(), *range, maxToDelete); } catch (const DBException& e) { wrote = e.toStatus(); warning() << e.what(); } } // drop autoColl if (!wrote.isOK() || wrote.getValue() == 0) { if (wrote.isOK()) { LOG(0) << "No documents remain to delete in " << nss << " range " << redact(range->toString()); } // Wait for majority replication even when wrote isn't OK or == 0, because it might have // been OK and/or > 0 previously, and the deletions must be persistent before notifying // clients in _pop(). LOG(0) << "Waiting for majority replication of local deletions in " << nss.ns() << " range " << redact(range->toString()); repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx); const auto clientOpTime = repl::ReplClientInfo::forClient(opCtx->getClient()).getLastOp(); // Wait for replication outside the lock const auto status = [&] { try { WriteConcernResult unusedWCResult; return waitForWriteConcern( opCtx, clientOpTime, kMajorityWriteConcern, &unusedWCResult); } catch (const DBException& e) { return e.toStatus(); } }(); // Get the lock again to finish off this range (including notifying, if necessary). // Don't allow lock interrupts while cleaning up. UninterruptibleLockGuard noInterrupt(opCtx->lockState()); AutoGetCollection autoColl(opCtx, nss, MODE_IX); auto* const css = CollectionShardingRuntime::get(opCtx, nss); auto& metadataManager = css->_metadataManager; auto* const self = forTestOnly ? forTestOnly : &metadataManager->_rangesToClean; stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock); if (!status.isOK()) { LOG(0) << "Error when waiting for write concern after removing " << nss << " range " << redact(range->toString()) << " : " << redact(status.reason()); // If range were already popped (e.g. 
by dropping nss during the waitForWriteConcern // above) its notification would have been triggered, so this check suffices to ensure // that it is safe to pop the range here if (!notification.ready()) { invariant(!self->isEmpty() && self->_orphans.front().notification == notification); LOG(0) << "Abandoning deletion of latest range in " << nss.ns() << " after local " << "deletions because of replication failure"; self->_pop(status); } } else { LOG(0) << "Finished deleting documents in " << nss.ns() << " range " << redact(range->toString()); self->_pop(wrote.getStatus()); } if (!self->_orphans.empty()) { LOG(1) << "Deleting " << nss.ns() << " range " << redact(self->_orphans.front().range.toString()) << " next."; } return Date_t::now() + stdx::chrono::milliseconds{rangeDeleterBatchDelayMS.load()}; } invariant(range); invariant(wrote.getStatus()); invariant(wrote.getValue() > 0); notification.abandon(); return Date_t::now() + stdx::chrono::milliseconds{rangeDeleterBatchDelayMS.load()}; }
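// Advances the reader to the next document in the FTDC file. Returns true when a document is
// available (either a metadata document or the next entry of a decompressed metric chunk), false
// once the end of the file is reached, or an error Status if the file cannot be decoded.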
StatusWith<bool> FTDCFileReader::hasNext() { while (true) { if (_state == State::kNeedsDoc) { if (_stream.eof()) { return {false}; } auto swDoc = readDocument(); if (!swDoc.isOK()) { return swDoc.getStatus(); } if (swDoc.getValue().isEmpty()) { return {false}; } _parent = swDoc.getValue(); // metadata or metrics? auto swType = FTDCBSONUtil::getBSONDocumentType(_parent); if (!swType.isOK()) { return swType.getStatus(); } auto swId = FTDCBSONUtil::getBSONDocumentId(_parent); if (!swId.isOK()) { return swId.getStatus(); } _dateId = swId.getValue(); FTDCBSONUtil::FTDCType type = swType.getValue(); if (type == FTDCBSONUtil::FTDCType::kMetadata) { _state = State::kMetadataDoc; auto swMetadata = FTDCBSONUtil::getBSONDocumentFromMetadataDoc(_parent); if (!swMetadata.isOK()) { return swMetadata.getStatus(); } _metadata = swMetadata.getValue(); } else if (type == FTDCBSONUtil::FTDCType::kMetricChunk) { _state = State::kMetricChunk; auto swDocs = FTDCBSONUtil::getMetricsFromMetricDoc(_parent, &_decompressor); if (!swDocs.isOK()) { return swDocs.getStatus(); } _docs = swDocs.getValue(); // There is always at least the reference document _pos = 0; } return {true}; } // We previously returned a metadata document, now we need another document from disk if (_state == State::kMetadataDoc) { _state = State::kNeedsDoc; continue; } // If we have a metric chunk, return the next document in the chunk until the chunk is // exhausted if (_state == State::kMetricChunk) { if (_pos + 1 == _docs.size()) { _state = State::kNeedsDoc; continue; } _pos++; return {true}; } } }
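// Handles a legacy OP_QUERY read on mongos: checks authorization, extracts the read preference,
// and either runs the find through ClusterFind (the new ClusterClientCursor path) or through the
// legacy ParallelSortClusteredCursor, writing the first batch of results back to the client.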
void Strategy::queryOp(OperationContext* txn, Request& request) { verify(!NamespaceString(request.getns()).isCommand()); Timer queryTimer; globalOpCounters.gotQuery(); QueryMessage q(request.d()); NamespaceString ns(q.ns); ClientBasic* client = txn->getClient(); AuthorizationSession* authSession = AuthorizationSession::get(client); Status status = authSession->checkAuthForFind(ns, false); audit::logQueryAuthzCheck(client, ns, q.query, status.code()); uassertStatusOK(status); LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options: " << q.queryOptions; if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd")) throw UserException(8010, "something is wrong, shouldn't see a command here"); if (q.queryOptions & QueryOption_Exhaust) { uasserted(18526, string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns + " " + q.query.toString()); } // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor // code path. // TODO: Delete the spigot and always use the new code. if (useClusterClientCursor) { // Determine the default read preference mode based on the value of the slaveOk flag. ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk) ? ReadPreference::SecondaryPreferred : ReadPreference::PrimaryOnly; ReadPreferenceSetting readPreference(readPreferenceOption, TagSet()); BSONElement rpElem; auto readPrefExtractStatus = bsonExtractTypedField( q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem); if (readPrefExtractStatus.isOK()) { auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj()); uassertStatusOK(parsedRps.getStatus()); readPreference = parsedRps.getValue(); } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) { uassertStatusOK(readPrefExtractStatus); } auto canonicalQuery = CanonicalQuery::canonicalize(q, ExtensionsCallbackNoop()); uassertStatusOK(canonicalQuery.getStatus()); // If the $explain flag was set, we must run the operation on the shards as an explain // command rather than a find command. if (canonicalQuery.getValue()->getParsed().isExplain()) { const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed(); BSONObj findCommand = lpq.asFindCommand(); // We default to allPlansExecution verbosity. auto verbosity = ExplainCommon::EXEC_ALL_PLANS; const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly); rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference); BSONObjBuilder explainBuilder; uassertStatusOK( Strategy::explainFind(txn, findCommand, lpq, verbosity, metadata, &explainBuilder)); BSONObj explainObj = explainBuilder.done(); replyToQuery(0, // query result flags request.p(), request.m(), static_cast<const void*>(explainObj.objdata()), explainObj.objsize(), 1, // numResults 0, // startingFrom CursorId(0)); return; } // Do the work to generate the first batch of results. This blocks waiting to get responses // from the shard(s). std::vector<BSONObj> batch; // 0 means the cursor is exhausted and // otherwise we assume that a cursor with the returned id can be retrieved via the // ClusterCursorManager auto cursorId = ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch); uassertStatusOK(cursorId.getStatus()); // TODO: this constant should be shared between mongos and mongod, and should // not be inside ShardedClientCursor. BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); // Fill out the response buffer. 
int numResults = 0; for (const auto& obj : batch) { buffer.appendBuf((void*)obj.objdata(), obj.objsize()); numResults++; } replyToQuery(0, // query result flags request.p(), request.m(), buffer.buf(), buffer.len(), numResults, 0, // startingFrom cursorId.getValue()); return; } QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions); // Parse "$maxTimeMS". StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMS(q.query[LiteParsedQuery::queryOptionMaxTimeMS]); uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK()); if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) { return; } ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo()); verify(cursor); // TODO: Move out to Request itself, not strategy based try { cursor->init(txn); if (qSpec.isExplain()) { BSONObjBuilder explain_builder; cursor->explain(explain_builder); explain_builder.appendNumber("executionTimeMillis", static_cast<long long>(queryTimer.millis())); BSONObj b = explain_builder.obj(); replyToQuery(0, request.p(), request.m(), b); delete (cursor); return; } } catch (...) { delete cursor; throw; } // TODO: Revisit all of this when we revisit the sharded cursor cache if (cursor->getNumQueryShards() != 1) { // More than one shard (or zero), manage with a ShardedClientCursor // NOTE: We may also have *zero* shards here when the returnPartial flag is set. // Currently the code in ShardedClientCursor handles this. ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor)); BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int docCount = 0; const int startFrom = cc->getTotalSent(); bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount); if (hasMore) { LOG(5) << "storing cursor : " << cc->getId(); int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis(); if (maxTimeMS.getValue() == 0) { // 0 represents "no limit". cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit; } else if (cursorLeftoverMillis <= 0) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.store(cc, cursorLeftoverMillis); } replyToQuery(0, request.p(), request.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cc->getId() : 0); } else { // Only one shard is used // Remote cursors are stored remotely, we shouldn't need this around. unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor); ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId()); verify(shard.get()); DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId()); // Implicitly stores the cursor in the cache request.reply(*(shardCursor->getMessage()), shardCursor->originalHost()); // We don't want to kill the cursor remotely if there's still data left shardCursor->decouple(); } }
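// Opens every database at startup, repairing them first when --repair was requested, and checks
// data-file compatibility, the stored featureCompatibilityVersion, legacy index versions, and
// (for replica sets) the _id indexes and capped oplog before normal startup continues.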
void repairDatabasesAndCheckVersion(OperationContext* txn) { LOG(1) << "enter repairDatabases (to check pdfile version #)"; ScopedTransaction transaction(txn, MODE_X); Lock::GlobalWrite lk(txn->lockState()); vector<string> dbNames; StorageEngine* storageEngine = txn->getServiceContext()->getGlobalStorageEngine(); storageEngine->listDatabases(&dbNames); // Repair all databases first, so that we do not try to open them if they are in bad shape if (storageGlobalParams.repair) { invariant(!storageGlobalParams.readOnly); for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) { const string dbName = *i; LOG(1) << " Repairing database: " << dbName; fassert(18506, repairDatabase(txn, storageEngine, dbName)); } } const repl::ReplSettings& replSettings = repl::getGlobalReplicationCoordinator()->getSettings(); // On replica set members we only clear temp collections on DBs other than "local" during // promotion to primary. On pure slaves, they are only cleared when the oplog tells them // to. The local DB is special because it is not replicated. See SERVER-10927 for more // details. const bool shouldClearNonLocalTmpCollections = !(checkIfReplMissingFromCommandLine(txn) || replSettings.usingReplSets() || replSettings.isSlave()); const bool shouldDoCleanupForSERVER23299 = isSubjectToSERVER23299(txn); for (vector<string>::const_iterator i = dbNames.begin(); i != dbNames.end(); ++i) { const string dbName = *i; LOG(1) << " Recovering database: " << dbName; Database* db = dbHolder().openDb(txn, dbName); invariant(db); // First thing after opening the database is to check for file compatibility, // otherwise we might crash if this is a deprecated format. auto status = db->getDatabaseCatalogEntry()->currentFilesCompatible(txn); if (!status.isOK()) { if (status.code() == ErrorCodes::CanRepairToDowngrade) { // Convert CanRepairToDowngrade statuses to MustUpgrade statuses to avoid logging a // potentially confusing and inaccurate message. // // TODO SERVER-24097: Log a message informing the user that they can start the // current version of mongod with --repair and then proceed with normal startup. status = {ErrorCodes::MustUpgrade, status.reason()}; } severe() << "Unable to start mongod due to an incompatibility with the data files and" " this version of mongod: " << redact(status); severe() << "Please consult our documentation when trying to downgrade to a previous" " major release"; quickExit(EXIT_NEED_UPGRADE); return; } // Check if admin.system.version contains an invalid featureCompatibilityVersion. // If a valid featureCompatibilityVersion is present, cache it as a server parameter. 
if (dbName == "admin") { if (Collection* versionColl = db->getCollection(FeatureCompatibilityVersion::kCollection)) { BSONObj featureCompatibilityVersion; if (Helpers::findOne(txn, versionColl, BSON("_id" << FeatureCompatibilityVersion::kParameterName), featureCompatibilityVersion)) { auto version = FeatureCompatibilityVersion::parse(featureCompatibilityVersion); if (!version.isOK()) { severe() << version.getStatus(); fassertFailedNoTrace(40283); } serverGlobalParams.featureCompatibility.version.store(version.getValue()); } } } // Major versions match, check indexes const string systemIndexes = db->name() + ".system.indexes"; Collection* coll = db->getCollection(systemIndexes); unique_ptr<PlanExecutor> exec( InternalPlanner::collectionScan(txn, systemIndexes, coll, PlanExecutor::YIELD_MANUAL)); BSONObj index; PlanExecutor::ExecState state; while (PlanExecutor::ADVANCED == (state = exec->getNext(&index, NULL))) { const BSONObj key = index.getObjectField("key"); const string plugin = IndexNames::findPluginName(key); if (db->getDatabaseCatalogEntry()->isOlderThan24(txn)) { if (IndexNames::existedBefore24(plugin)) { continue; } log() << "Index " << index << " claims to be of type '" << plugin << "', " << "which is either invalid or did not exist before v2.4. " << "See the upgrade section: " << "http://dochub.mongodb.org/core/upgrade-2.4" << startupWarningsLog; } if (index["v"].isNumber() && index["v"].numberInt() == 0) { log() << "WARNING: The index: " << index << " was created with the deprecated" << " v:0 format. This format will not be supported in a future release." << startupWarningsLog; log() << "\t To fix this, you need to rebuild this index." << " For instructions, see http://dochub.mongodb.org/core/rebuild-v0-indexes" << startupWarningsLog; } } // Non-yielding collection scans from InternalPlanner will never error. invariant(PlanExecutor::IS_EOF == state); if (replSettings.usingReplSets()) { // We only care about the _id index if we are in a replset checkForIdIndexes(txn, db); // Ensure oplog is capped (mmap does not guarantee order of inserts on noncapped // collections) if (db->name() == "local") { checkForCappedOplog(txn, db); } } if (shouldDoCleanupForSERVER23299) { handleSERVER23299ForDb(txn, db); } if (!storageGlobalParams.readOnly && (shouldClearNonLocalTmpCollections || dbName == "local")) { db->clearTmpCollections(txn); } } LOG(1) << "done repairDatabases"; }
StatusWith<BSONObj> FTDCFileReader::readDocument() { if (!_stream.is_open()) { return {ErrorCodes::FileNotOpen, "open() needs to be called first."}; } char buf[sizeof(std::int32_t)]; _stream.read(buf, sizeof(buf)); if (sizeof(buf) != _stream.gcount()) { // Did we read exactly zero bytes and hit the eof? // Then return an empty document to indicate we are done reading the file. if (0 == _stream.gcount() && _stream.eof()) { return BSONObj(); } return {ErrorCodes::FileStreamFailed, str::stream() << "Failed to read 4 bytes from file \'" << _file.generic_string() << "\'"}; } std::uint32_t bsonLength = ConstDataView(buf).read<LittleEndian<std::int32_t>>(); // Reads past the end of the file will be caught below // The interim file sentinel is 8 bytes of zero. if (bsonLength == 0) { return BSONObj(); } // Reads past the end of the file will be caught below if (bsonLength > _fileSize || bsonLength < BSONObj::kMinBSONLength) { return {ErrorCodes::InvalidLength, str::stream() << "Invalid BSON length found in file \'" << _file.generic_string() << "\'"}; } // Read the BSON document _buffer.resize(bsonLength); // Stuff the length into the front memcpy(_buffer.data(), buf, sizeof(std::int32_t)); // Read the length - 4 bytes from the file std::int32_t readSize = bsonLength - sizeof(std::int32_t); _stream.read(_buffer.data() + sizeof(std::int32_t), readSize); if (readSize != _stream.gcount()) { return {ErrorCodes::FileStreamFailed, str::stream() << "Failed to read " << readSize << " bytes from file \'" << _file.generic_string() << "\'"}; } ConstDataRange cdr(_buffer.data(), _buffer.data() + bsonLength); // TODO: Validated only validates objects based on a flag which is the default at the moment auto swl = cdr.read<Validated<BSONObj>>(); if (!swl.isOK()) { return swl.getStatus(); } return {swl.getValue().val}; }
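// Runs isMaster against the host being added and validates that it can join the cluster as a
// shard: a compatible wire version for the current featureCompatibilityVersion, not a mongos or
// config server, a reachable primary, and a replica set name and member list consistent with the
// provided connection string. On success, returns the ShardType describing the new shard.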
StatusWith<ShardType> ShardingCatalogManager::_validateHostAsShard( OperationContext* opCtx, std::shared_ptr<RemoteCommandTargeter> targeter, const std::string* shardProposedName, const ConnectionString& connectionString) { auto swCommandResponse = _runCommandForAddShard(opCtx, targeter.get(), "admin", BSON("isMaster" << 1)); if (swCommandResponse.getStatus() == ErrorCodes::IncompatibleServerVersion) { return swCommandResponse.getStatus().withReason( str::stream() << "Cannot add " << connectionString.toString() << " as a shard because its binary version is not compatible with " "the cluster's featureCompatibilityVersion."); } else if (!swCommandResponse.isOK()) { return swCommandResponse.getStatus(); } // Check for a command response error auto resIsMasterStatus = std::move(swCommandResponse.getValue().commandStatus); if (!resIsMasterStatus.isOK()) { return resIsMasterStatus.withContext(str::stream() << "Error running isMaster against " << targeter->connectionString().toString()); } auto resIsMaster = std::move(swCommandResponse.getValue().response); // Fail if the node being added is a mongos. const std::string msg = resIsMaster.getStringField("msg"); if (msg == "isdbgrid") { return {ErrorCodes::IllegalOperation, "cannot add a mongos as a shard"}; } // Extract the maxWireVersion so we can verify that the node being added has a binary version // greater than or equal to the cluster's featureCompatibilityVersion. We expect an incompatible // binary node to be unable to communicate, returning an IncompatibleServerVersion error, // because of our internal wire version protocol. So we can safely invariant here that the node // is compatible. long long maxWireVersion; Status status = bsonExtractIntegerField(resIsMaster, "maxWireVersion", &maxWireVersion); if (!status.isOK()) { return status.withContext(str::stream() << "isMaster returned invalid 'maxWireVersion' " << "field when attempting to add " << connectionString.toString() << " as a shard"); } if (serverGlobalParams.featureCompatibility.getVersion() > ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo36) { // If FCV 4.0, or upgrading to / downgrading from, wire version must be LATEST. invariant(maxWireVersion == WireVersion::LATEST_WIRE_VERSION); } else if (serverGlobalParams.featureCompatibility.getVersion() > ServerGlobalParams::FeatureCompatibility::Version::kFullyDowngradedTo34 && serverGlobalParams.featureCompatibility.getVersion() <= ServerGlobalParams::FeatureCompatibility::Version::kFullyUpgradedTo36) { // If FCV 3.6, or upgrading to / downgrading from, wire version must be v3.6 // LATEST_WIRE_VERSION or greater. invariant(maxWireVersion >= WireVersion::LATEST_WIRE_VERSION - 1); } else { // If FCV 3.4, wire version cannot be less than v3.4 LATEST_WIRE_VERSION. invariant(serverGlobalParams.featureCompatibility.getVersion() == ServerGlobalParams::FeatureCompatibility::Version::kFullyDowngradedTo34); invariant(maxWireVersion >= WireVersion::LATEST_WIRE_VERSION - 2); } // Check whether there is a master. If there isn't, the replica set may not have been // initiated. If the connection is a standalone, it will return true for isMaster. 
bool isMaster; status = bsonExtractBooleanField(resIsMaster, "ismaster", &isMaster); if (!status.isOK()) { return status.withContext(str::stream() << "isMaster returned invalid 'ismaster' " << "field when attempting to add " << connectionString.toString() << " as a shard"); } if (!isMaster) { return {ErrorCodes::NotMaster, str::stream() << connectionString.toString() << " does not have a master. If this is a replica set, ensure that it has a" << " healthy primary and that the set has been properly initiated."}; } const std::string providedSetName = connectionString.getSetName(); const std::string foundSetName = resIsMaster["setName"].str(); // Make sure the specified replica set name (if any) matches the actual shard's replica set if (providedSetName.empty() && !foundSetName.empty()) { return {ErrorCodes::OperationFailed, str::stream() << "host is part of set " << foundSetName << "; " << "use replica set url format " << "<setname>/<server1>,<server2>, ..."}; } if (!providedSetName.empty() && foundSetName.empty()) { return {ErrorCodes::OperationFailed, str::stream() << "host did not return a set name; " << "is the replica set still initializing? " << resIsMaster}; } // Make sure the set name specified in the connection string matches the one where its hosts // belong into if (!providedSetName.empty() && (providedSetName != foundSetName)) { return {ErrorCodes::OperationFailed, str::stream() << "the provided connection string (" << connectionString.toString() << ") does not match the actual set name " << foundSetName}; } // Is it a config server? if (resIsMaster.hasField("configsvr")) { return {ErrorCodes::OperationFailed, str::stream() << "Cannot add " << connectionString.toString() << " as a shard since it is a config server"}; } // If the shard is part of a replica set, make sure all the hosts mentioned in the connection // string are part of the set. It is fine if not all members of the set are mentioned in the // connection string, though. if (!providedSetName.empty()) { std::set<std::string> hostSet; BSONObjIterator iter(resIsMaster["hosts"].Obj()); while (iter.more()) { hostSet.insert(iter.next().String()); // host:port } if (resIsMaster["passives"].isABSONObj()) { BSONObjIterator piter(resIsMaster["passives"].Obj()); while (piter.more()) { hostSet.insert(piter.next().String()); // host:port } } if (resIsMaster["arbiters"].isABSONObj()) { BSONObjIterator piter(resIsMaster["arbiters"].Obj()); while (piter.more()) { hostSet.insert(piter.next().String()); // host:port } } for (const auto& hostEntry : connectionString.getServers()) { const auto& host = hostEntry.toString(); // host:port if (hostSet.find(host) == hostSet.end()) { return {ErrorCodes::OperationFailed, str::stream() << "in seed list " << connectionString.toString() << ", host " << host << " does not belong to replica set " << foundSetName << "; found " << resIsMaster.toString()}; } } } std::string actualShardName; if (shardProposedName) { actualShardName = *shardProposedName; } else if (!foundSetName.empty()) { // Default it to the name of the replica set actualShardName = foundSetName; } // Disallow adding shard replica set with name 'config' if (actualShardName == NamespaceString::kConfigDb) { return {ErrorCodes::BadValue, "use of shard replica set with name 'config' is not allowed"}; } // Retrieve the most up to date connection string that we know from the replica set monitor (if // this is a replica set shard, otherwise it will be the same value as connectionString). 
ConnectionString actualShardConnStr = targeter->connectionString(); ShardType shard; shard.setName(actualShardName); shard.setHost(actualShardConnStr.toString()); shard.setState(ShardType::ShardState::kShardAware); return shard; }
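// Drives the multi-step shard removal process: marks the shard as draining on the first call,
// reports ONGOING while chunks or databases still live on it, and removes the shard document and
// tears down its connections once draining has completed.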
StatusWith<ShardDrainingStatus> ShardingCatalogManager::removeShard(OperationContext* opCtx, const ShardId& shardId) { // Check preconditions for removing the shard std::string name = shardId.toString(); auto countStatus = _runCountCommandOnConfig( opCtx, ShardType::ConfigNS, BSON(ShardType::name() << NE << name << ShardType::draining(true))); if (!countStatus.isOK()) { return countStatus.getStatus(); } if (countStatus.getValue() > 0) { return Status(ErrorCodes::ConflictingOperationInProgress, "Can't have more than one draining shard at a time"); } countStatus = _runCountCommandOnConfig(opCtx, ShardType::ConfigNS, BSON(ShardType::name() << NE << name)); if (!countStatus.isOK()) { return countStatus.getStatus(); } if (countStatus.getValue() == 0) { return Status(ErrorCodes::IllegalOperation, "Can't remove last shard"); } // Figure out if shard is already draining countStatus = _runCountCommandOnConfig( opCtx, ShardType::ConfigNS, BSON(ShardType::name() << name << ShardType::draining(true))); if (!countStatus.isOK()) { return countStatus.getStatus(); } auto* const shardRegistry = Grid::get(opCtx)->shardRegistry(); if (countStatus.getValue() == 0) { log() << "going to start draining shard: " << name; auto updateStatus = Grid::get(opCtx)->catalogClient()->updateConfigDocument( opCtx, ShardType::ConfigNS, BSON(ShardType::name() << name), BSON("$set" << BSON(ShardType::draining(true))), false, ShardingCatalogClient::kLocalWriteConcern); if (!updateStatus.isOK()) { log() << "error starting removeShard: " << name << causedBy(redact(updateStatus.getStatus())); return updateStatus.getStatus(); } shardRegistry->reload(opCtx); // Record start in changelog Grid::get(opCtx) ->catalogClient() ->logChange(opCtx, "removeShard.start", "", BSON("shard" << name), ShardingCatalogClient::kLocalWriteConcern) .transitional_ignore(); return ShardDrainingStatus::STARTED; } // Draining has already started, now figure out how many chunks and databases are still on the // shard. countStatus = _runCountCommandOnConfig(opCtx, ChunkType::ConfigNS, BSON(ChunkType::shard(name))); if (!countStatus.isOK()) { return countStatus.getStatus(); } const long long chunkCount = countStatus.getValue(); countStatus = _runCountCommandOnConfig(opCtx, DatabaseType::ConfigNS, BSON(DatabaseType::primary(name))); if (!countStatus.isOK()) { return countStatus.getStatus(); } const long long databaseCount = countStatus.getValue(); if (chunkCount > 0 || databaseCount > 0) { // Still more draining to do LOG(0) << "chunkCount: " << chunkCount; LOG(0) << "databaseCount: " << databaseCount; return ShardDrainingStatus::ONGOING; } // Draining is done, now finish removing the shard. log() << "going to remove shard: " << name; audit::logRemoveShard(opCtx->getClient(), name); Status status = Grid::get(opCtx)->catalogClient()->removeConfigDocuments( opCtx, ShardType::ConfigNS, BSON(ShardType::name() << name), ShardingCatalogClient::kLocalWriteConcern); if (!status.isOK()) { log() << "Error concluding removeShard operation on: " << name << "; err: " << status.reason(); return status; } shardConnectionPool.removeHost(name); ReplicaSetMonitor::remove(name); shardRegistry->reload(opCtx); // Record finish in changelog Grid::get(opCtx) ->catalogClient() ->logChange(opCtx, "removeShard", "", BSON("shard" << name), ShardingCatalogClient::kLocalWriteConcern) .transitional_ignore(); return ShardDrainingStatus::COMPLETED; }
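// Parses a balance-chunk request sent to the config server, extracting the chunk itself, the
// optional secondary throttle options, and the waitForDelete, maxChunkSizeBytes, and toShardId
// fields.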
StatusWith<BalanceChunkRequest> BalanceChunkRequest::parseFromConfigCommand(const BSONObj& obj) {
    auto chunkStatus = ChunkType::fromBSON(obj);
    if (!chunkStatus.isOK()) {
        return chunkStatus.getStatus();
    }

    // The secondary throttle options being sent to the config server are contained within a
    // sub-object on the request because they contain the writeConcern field, which when sent to
    // the config server gets checked for only being w:1 or w:majority.
    BSONObj secondaryThrottleObj;
    {
        BSONElement secondaryThrottleElement;
        auto secondaryThrottleElementStatus =
            bsonExtractTypedField(obj, kSecondaryThrottle, Object, &secondaryThrottleElement);
        if (secondaryThrottleElementStatus.isOK()) {
            secondaryThrottleObj = secondaryThrottleElement.Obj();
        } else if (secondaryThrottleElementStatus != ErrorCodes::NoSuchKey) {
            return secondaryThrottleElementStatus;
        }
    }

    auto secondaryThrottleStatus =
        MigrationSecondaryThrottleOptions::createFromCommand(secondaryThrottleObj);
    if (!secondaryThrottleStatus.isOK()) {
        return secondaryThrottleStatus.getStatus();
    }

    BalanceChunkRequest request(std::move(chunkStatus.getValue()),
                                std::move(secondaryThrottleStatus.getValue()));

    {
        Status status = bsonExtractBooleanFieldWithDefault(
            obj, kWaitForDelete, false, &request._waitForDelete);
        if (!status.isOK()) {
            return status;
        }
    }

    {
        long long maxChunkSizeBytes;
        Status status =
            bsonExtractIntegerFieldWithDefault(obj, kMaxChunkSizeBytes, 0, &maxChunkSizeBytes);
        if (!status.isOK()) {
            return status;
        }
        request._maxChunkSizeBytes = static_cast<int64_t>(maxChunkSizeBytes);
    }

    {
        std::string toShardId;
        Status status = bsonExtractStringField(obj, kToShardId, &toShardId);
        if (status.isOK()) {
            if (toShardId.empty()) {
                return {ErrorCodes::BadValue, "To shard cannot be empty"};
            }
            request._toShardId = std::move(toShardId);
        } else if (status != ErrorCodes::NoSuchKey) {
            return status;
        }
    }

    return request;
}
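// Parses the 'uuid' field of the given object, failing with error code 40566 if the field is
// missing or cannot be parsed as a UUID.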
UUID UUID::parse(const BSONObj& obj) {
    auto res = parse(obj.getField("uuid"));
    uassert(40566, res.getStatus().reason(), res.isOK());
    return res.getValue();
}
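// Handles a getMore against a cursor registered with the ClusterCursorManager: checks the cursor
// out, fills a batch subject to the batchSize and reply-size limits, and then returns the cursor
// to the manager (or marks it exhausted) before building the CursorResponse.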
StatusWith<CursorResponse> ClusterFind::runGetMore(OperationContext* txn, const GetMoreRequest& request) { auto cursorManager = Grid::get(txn)->getCursorManager(); auto pinnedCursor = cursorManager->checkOutCursor(request.nss, request.cursorid, txn); if (!pinnedCursor.isOK()) { return pinnedCursor.getStatus(); } invariant(request.cursorid == pinnedCursor.getValue().getCursorId()); // If the fail point is enabled, busy wait until it is disabled. while (MONGO_FAIL_POINT(keepCursorPinnedDuringGetMore)) { } if (request.awaitDataTimeout) { auto status = pinnedCursor.getValue().setAwaitDataTimeout(*request.awaitDataTimeout); if (!status.isOK()) { return status; } } std::vector<BSONObj> batch; int bytesBuffered = 0; long long batchSize = request.batchSize.value_or(0); long long startingFrom = pinnedCursor.getValue().getNumReturnedSoFar(); auto cursorState = ClusterCursorManager::CursorState::NotExhausted; while (!FindCommon::enoughForGetMore(batchSize, batch.size())) { auto next = pinnedCursor.getValue().next(); if (!next.isOK()) { return next.getStatus(); } if (next.getValue().isEOF()) { // We reached end-of-stream. if (!pinnedCursor.getValue().isTailable()) { cursorState = ClusterCursorManager::CursorState::Exhausted; } break; } if (!FindCommon::haveSpaceForNext( *next.getValue().getResult(), batch.size(), bytesBuffered)) { pinnedCursor.getValue().queueResult(*next.getValue().getResult()); break; } // Add doc to the batch. Account for the space overhead associated with returning this doc // inside a BSON array. bytesBuffered += (next.getValue().getResult()->objsize() + kPerDocumentOverheadBytesUpperBound); batch.push_back(std::move(*next.getValue().getResult())); } // Transfer ownership of the cursor back to the cursor manager. pinnedCursor.getValue().returnCursor(cursorState); CursorId idToReturn = (cursorState == ClusterCursorManager::CursorState::Exhausted) ? CursorId(0) : request.cursorid; return CursorResponse(request.nss, idToReturn, std::move(batch), startingFrom); }
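// Executes an aggregation from mongos. Unsharded namespaces are passed straight through to the
// primary shard; for sharded namespaces the pipeline is split, the shards part is dispatched to
// the targeted shards, and the merging part is run on the primary shard or on a randomly chosen
// shard that returned results.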
Status ClusterAggregate::runAggregate(OperationContext* txn, const Namespaces& namespaces, BSONObj cmdObj, int options, BSONObjBuilder* result) { auto dbname = namespaces.executionNss.db().toString(); auto status = grid.catalogCache()->getDatabase(txn, dbname); if (!status.isOK()) { appendEmptyResultSet(*result, status.getStatus(), namespaces.requestedNss.ns()); return Status::OK(); } std::shared_ptr<DBConfig> conf = status.getValue(); if (!conf->isShardingEnabled()) { return aggPassthrough(txn, namespaces, conf, cmdObj, result, options); } auto request = AggregationRequest::parseFromBSON(namespaces.executionNss, cmdObj); if (!request.isOK()) { return request.getStatus(); } boost::intrusive_ptr<ExpressionContext> mergeCtx = new ExpressionContext(txn, request.getValue()); mergeCtx->inRouter = true; // explicitly *not* setting mergeCtx->tempDir // Parse and optimize the pipeline specification. auto pipeline = Pipeline::parse(request.getValue().getPipeline(), mergeCtx); if (!pipeline.isOK()) { return pipeline.getStatus(); } for (auto&& ns : pipeline.getValue()->getInvolvedCollections()) { uassert(28769, str::stream() << ns.ns() << " cannot be sharded", !conf->isSharded(ns.ns())); // We won't try to execute anything on a mongos, but we still have to populate this map // so that any $lookups etc will be able to have a resolved view definition. It's okay // that this is incorrect, we will repopulate the real resolved namespace map on the // mongod. // TODO SERVER-25038 This should become unnecessary once we can get the involved // namespaces before parsing. mergeCtx->resolvedNamespaces[ns.coll()] = {ns, std::vector<BSONObj>{}}; } if (!conf->isSharded(namespaces.executionNss.ns())) { return aggPassthrough(txn, namespaces, conf, cmdObj, result, options); } ChunkManagerPtr chunkMgr = conf->getChunkManager(txn, namespaces.executionNss.ns()); // If there was no collation specified, but there is a default collation for the collection, // use that. if (request.getValue().getCollation().isEmpty() && chunkMgr->getDefaultCollator()) { mergeCtx->setCollator(chunkMgr->getDefaultCollator()->clone()); } // Now that we know the collation we'll be using, inject the ExpressionContext and optimize. // TODO SERVER-25038: this must happen before we parse the pipeline, since we can make // string comparisons during parse time. pipeline.getValue()->injectExpressionContext(mergeCtx); pipeline.getValue()->optimizePipeline(); // If the first $match stage is an exact match on the shard key (with a simple collation or // no string matching), we only have to send it to one shard, so send the command to that // shard. BSONObj firstMatchQuery = pipeline.getValue()->getInitialQuery(); BSONObj shardKeyMatches; shardKeyMatches = uassertStatusOK( chunkMgr->getShardKeyPattern().extractShardKeyFromQuery(txn, firstMatchQuery)); bool singleShard = false; if (!shardKeyMatches.isEmpty()) { auto chunk = chunkMgr->findIntersectingChunk( txn, shardKeyMatches, request.getValue().getCollation()); if (chunk.isOK()) { singleShard = true; } } // Don't need to split pipeline if the first $match is an exact match on shard key, unless // there is a stage that needs to be run on the primary shard. const bool needPrimaryShardMerger = pipeline.getValue()->needsPrimaryShardMerger(); const bool needSplit = !singleShard || needPrimaryShardMerger; // Split the pipeline into pieces for mongod(s) and this mongos. If needSplit is true, // 'pipeline' will become the merger side. boost::intrusive_ptr<Pipeline> shardPipeline(needSplit ? 
pipeline.getValue()->splitForSharded() : pipeline.getValue()); // Create the command for the shards. The 'fromRouter' field means produce output to be // merged. MutableDocument commandBuilder(request.getValue().serializeToCommandObj()); commandBuilder[AggregationRequest::kPipelineName] = Value(shardPipeline->serialize()); if (needSplit) { commandBuilder[AggregationRequest::kFromRouterName] = Value(true); commandBuilder[AggregationRequest::kCursorName] = Value(DOC(AggregationRequest::kBatchSizeName << 0)); } // These fields are not part of the AggregationRequest since they are not handled by the // aggregation subsystem, so we serialize them separately. const std::initializer_list<StringData> fieldsToPropagateToShards = { "$queryOptions", "readConcern", QueryRequest::cmdOptionMaxTimeMS, }; for (auto&& field : fieldsToPropagateToShards) { commandBuilder[field] = Value(cmdObj[field]); } BSONObj shardedCommand = commandBuilder.freeze().toBson(); BSONObj shardQuery = shardPipeline->getInitialQuery(); // Run the command on the shards // TODO need to make sure cursors are killed if a retry is needed std::vector<Strategy::CommandResult> shardResults; Strategy::commandOp(txn, dbname, shardedCommand, options, namespaces.executionNss.ns(), shardQuery, request.getValue().getCollation(), &shardResults); if (mergeCtx->isExplain) { // This must be checked before we start modifying result. uassertAllShardsSupportExplain(shardResults); if (needSplit) { *result << "needsPrimaryShardMerger" << needPrimaryShardMerger << "splitPipeline" << DOC("shardsPart" << shardPipeline->writeExplainOps() << "mergerPart" << pipeline.getValue()->writeExplainOps()); } else { *result << "splitPipeline" << BSONNULL; } BSONObjBuilder shardExplains(result->subobjStart("shards")); for (size_t i = 0; i < shardResults.size(); i++) { shardExplains.append(shardResults[i].shardTargetId, BSON("host" << shardResults[i].target.toString() << "stages" << shardResults[i].result["stages"])); } return Status::OK(); } if (!needSplit) { invariant(shardResults.size() == 1); invariant(shardResults[0].target.getServers().size() == 1); auto executorPool = grid.getExecutorPool(); const BSONObj reply = uassertStatusOK(storePossibleCursor(shardResults[0].target.getServers()[0], shardResults[0].result, namespaces.requestedNss, executorPool->getArbitraryExecutor(), grid.getCursorManager())); result->appendElements(reply); return getStatusFromCommandResult(reply); } pipeline.getValue()->addInitialSource( DocumentSourceMergeCursors::create(parseCursors(shardResults), mergeCtx)); MutableDocument mergeCmd(request.getValue().serializeToCommandObj()); mergeCmd["pipeline"] = Value(pipeline.getValue()->serialize()); mergeCmd["cursor"] = Value(cmdObj["cursor"]); if (cmdObj.hasField("$queryOptions")) { mergeCmd["$queryOptions"] = Value(cmdObj["$queryOptions"]); } if (cmdObj.hasField(QueryRequest::cmdOptionMaxTimeMS)) { mergeCmd[QueryRequest::cmdOptionMaxTimeMS] = Value(cmdObj[QueryRequest::cmdOptionMaxTimeMS]); } mergeCmd.setField("writeConcern", Value(cmdObj["writeConcern"])); // Not propagating readConcern to merger since it doesn't do local reads. // If the user didn't specify a collation already, make sure there's a collation attached to // the merge command, since the merging shard may not have the collection metadata. if (mergeCmd.peek()["collation"].missing()) { mergeCmd.setField("collation", mergeCtx->getCollator() ? 
Value(mergeCtx->getCollator()->getSpec().toBSON()) : Value(Document{CollationSpec::kSimpleSpec})); } std::string outputNsOrEmpty; if (DocumentSourceOut* out = dynamic_cast<DocumentSourceOut*>(pipeline.getValue()->getSources().back().get())) { outputNsOrEmpty = out->getOutputNs().ns(); } // Run merging command on random shard, unless a stage needs the primary shard. Need to use // ShardConnection so that the merging mongod is sent the config servers on connection init. auto& prng = txn->getClient()->getPrng(); const auto& mergingShardId = needPrimaryShardMerger ? conf->getPrimaryId() : shardResults[prng.nextInt32(shardResults.size())].shardTargetId; const auto mergingShard = uassertStatusOK(grid.shardRegistry()->getShard(txn, mergingShardId)); ShardConnection conn(mergingShard->getConnString(), outputNsOrEmpty); BSONObj mergedResults = aggRunCommand(conn.get(), namespaces, mergeCmd.freeze().toBson(), options); conn.done(); if (auto wcErrorElem = mergedResults["writeConcernError"]) { appendWriteConcernErrorToCmdResponse(mergingShardId, wcErrorElem, *result); } // Copy output from merging (primary) shard to the output object from our command. // Also, propagates errmsg and code if ok == false. result->appendElementsUnique(mergedResults); return getStatusFromCommandResult(result->asTempObj()); }
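// Establishes a cursor for a find command issued through mongos, retrying on stale-config errors
// up to kMaxStaleConfigRetries times and refreshing the routing information between attempts.
// Returns CursorId(0) when no cursor needs to remain open, e.g. when the database does not exist
// or the results fit in a single batch.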
StatusWith<CursorId> ClusterFind::runQuery(OperationContext* txn, const CanonicalQuery& query, const ReadPreferenceSetting& readPref, std::vector<BSONObj>* results, BSONObj* viewDefinition) { invariant(results); // Projection on the reserved sort key field is illegal in mongos. if (query.getQueryRequest().getProj().hasField(ClusterClientCursorParams::kSortKeyField)) { return {ErrorCodes::BadValue, str::stream() << "Projection contains illegal field '" << ClusterClientCursorParams::kSortKeyField << "': " << query.getQueryRequest().getProj()}; } auto dbConfig = Grid::get(txn)->catalogCache()->getDatabase(txn, query.nss().db().toString()); if (dbConfig.getStatus() == ErrorCodes::NamespaceNotFound) { // If the database doesn't exist, we successfully return an empty result set without // creating a cursor. return CursorId(0); } else if (!dbConfig.isOK()) { return dbConfig.getStatus(); } std::shared_ptr<ChunkManager> chunkManager; std::shared_ptr<Shard> primary; dbConfig.getValue()->getChunkManagerOrPrimary(txn, query.nss().ns(), chunkManager, primary); // Re-target and re-send the initial find command to the shards until we have established the // shard version. for (size_t retries = 1; retries <= kMaxStaleConfigRetries; ++retries) { auto cursorId = runQueryWithoutRetrying( txn, query, readPref, chunkManager.get(), std::move(primary), results, viewDefinition); if (cursorId.isOK()) { return cursorId; } auto status = std::move(cursorId.getStatus()); if (!ErrorCodes::isStaleShardingError(status.code()) && status != ErrorCodes::ShardNotFound) { // Errors other than trying to reach a non existent shard or receiving a stale // metadata message from MongoD are fatal to the operation. Network errors and // replication retries happen at the level of the AsyncResultsMerger. return status; } LOG(1) << "Received error status for query " << redact(query.toStringShort()) << " on attempt " << retries << " of " << kMaxStaleConfigRetries << ": " << redact(status); const bool staleEpoch = (status == ErrorCodes::StaleEpoch); if (staleEpoch) { if (!dbConfig.getValue()->reload(txn)) { // If the reload failed that means the database wasn't found, so successfully return // an empty result set without creating a cursor. return CursorId(0); } } chunkManager = dbConfig.getValue()->getChunkManagerIfExists(txn, query.nss().ns(), true, staleEpoch); if (!chunkManager) { dbConfig.getValue()->getChunkManagerOrPrimary( txn, query.nss().ns(), chunkManager, primary); } } return {ErrorCodes::StaleShardVersion, str::stream() << "Retried " << kMaxStaleConfigRetries << " times without successfully establishing shard version."}; }
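// Parses an aggregate command object into an AggregationRequest, validating the types of the
// optional fields (cursor, collation, hint, explain, writeConcern, and so on) and enforcing
// cross-field rules such as requiring 'cursor' unless an explain option is supplied.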
StatusWith<AggregationRequest> AggregationRequest::parseFromBSON( NamespaceString nss, const BSONObj& cmdObj, boost::optional<ExplainOptions::Verbosity> explainVerbosity) { // Parse required parameters. auto pipelineElem = cmdObj[kPipelineName]; auto pipeline = AggregationRequest::parsePipelineFromBSON(pipelineElem); if (!pipeline.isOK()) { return pipeline.getStatus(); } AggregationRequest request(std::move(nss), std::move(pipeline.getValue())); const std::initializer_list<StringData> optionsParsedElseWhere = {kPipelineName, kCommandName}; bool hasCursorElem = false; bool hasExplainElem = false; bool hasFromMongosElem = false; bool hasNeedsMergeElem = false; // Parse optional parameters. for (auto&& elem : cmdObj) { auto fieldName = elem.fieldNameStringData(); if (QueryRequest::kUnwrappedReadPrefField == fieldName) { // We expect this field to be validated elsewhere. request.setUnwrappedReadPref(elem.embeddedObject()); } else if (std::find(optionsParsedElseWhere.begin(), optionsParsedElseWhere.end(), fieldName) != optionsParsedElseWhere.end()) { // Ignore options that are parsed elsewhere. } else if (kCursorName == fieldName) { long long batchSize; auto status = CursorRequest::parseCommandCursorOptions(cmdObj, kDefaultBatchSize, &batchSize); if (!status.isOK()) { return status; } hasCursorElem = true; request.setBatchSize(batchSize); } else if (kCollationName == fieldName) { if (elem.type() != BSONType::Object) { return {ErrorCodes::TypeMismatch, str::stream() << kCollationName << " must be an object, not a " << typeName(elem.type())}; } request.setCollation(elem.embeddedObject().getOwned()); } else if (QueryRequest::cmdOptionMaxTimeMS == fieldName) { auto maxTimeMs = QueryRequest::parseMaxTimeMS(elem); if (!maxTimeMs.isOK()) { return maxTimeMs.getStatus(); } request.setMaxTimeMS(maxTimeMs.getValue()); } else if (repl::ReadConcernArgs::kReadConcernFieldName == fieldName) { if (elem.type() != BSONType::Object) { return {ErrorCodes::TypeMismatch, str::stream() << repl::ReadConcernArgs::kReadConcernFieldName << " must be an object, not a " << typeName(elem.type())}; } request.setReadConcern(elem.embeddedObject().getOwned()); } else if (kHintName == fieldName) { if (BSONType::Object == elem.type()) { request.setHint(elem.embeddedObject()); } else if (BSONType::String == elem.type()) { request.setHint(BSON("$hint" << elem.valueStringData())); } else { return Status(ErrorCodes::FailedToParse, str::stream() << kHintName << " must be specified as a string representing an index" << " name, or an object representing an index's key pattern"); } } else if (kCommentName == fieldName) { if (elem.type() != BSONType::String) { return {ErrorCodes::TypeMismatch, str::stream() << kCommentName << " must be a string, not a " << typeName(elem.type())}; } request.setComment(elem.str()); } else if (kExplainName == fieldName) { if (elem.type() != BSONType::Bool) { return {ErrorCodes::TypeMismatch, str::stream() << kExplainName << " must be a boolean, not a " << typeName(elem.type())}; } hasExplainElem = true; if (elem.Bool()) { request.setExplain(ExplainOptions::Verbosity::kQueryPlanner); } } else if (kFromMongosName == fieldName) { if (elem.type() != BSONType::Bool) { return {ErrorCodes::TypeMismatch, str::stream() << kFromMongosName << " must be a boolean, not a " << typeName(elem.type())}; } hasFromMongosElem = true; request.setFromMongos(elem.Bool()); } else if (kNeedsMergeName == fieldName) { if (elem.type() != BSONType::Bool) { return {ErrorCodes::TypeMismatch, str::stream() << kNeedsMergeName << " must be a 
boolean, not a " << typeName(elem.type())}; } hasNeedsMergeElem = true; request.setNeedsMerge(elem.Bool()); } else if (kMergeByPBRTName == fieldName) { if (elem.type() != BSONType::Bool) { return {ErrorCodes::TypeMismatch, str::stream() << kMergeByPBRTName << " must be a boolean, not a " << typeName(elem.type())}; } request.setMergeByPBRT(elem.Bool()); } else if (kAllowDiskUseName == fieldName) { if (storageGlobalParams.readOnly) { return {ErrorCodes::IllegalOperation, str::stream() << "The '" << kAllowDiskUseName << "' option is not permitted in read-only mode."}; } else if (elem.type() != BSONType::Bool) { return {ErrorCodes::TypeMismatch, str::stream() << kAllowDiskUseName << " must be a boolean, not a " << typeName(elem.type())}; } request.setAllowDiskUse(elem.Bool()); } else if (kExchangeName == fieldName) { try { IDLParserErrorContext ctx("internalExchange"); request.setExchangeSpec(ExchangeSpec::parse(ctx, elem.Obj())); } catch (const DBException& ex) { return ex.toStatus(); } } else if (bypassDocumentValidationCommandOption() == fieldName) { request.setBypassDocumentValidation(elem.trueValue()); } else if (WriteConcernOptions::kWriteConcernField == fieldName) { if (elem.type() != BSONType::Object) { return {ErrorCodes::TypeMismatch, str::stream() << fieldName << " must be an object, not a " << typeName(elem.type())}; } WriteConcernOptions writeConcern; uassertStatusOK(writeConcern.parse(elem.embeddedObject())); request.setWriteConcern(writeConcern); } else if (kRuntimeConstants == fieldName) { try { IDLParserErrorContext ctx("internalRuntimeConstants"); request.setRuntimeConstants(RuntimeConstants::parse(ctx, elem.Obj())); } catch (const DBException& ex) { return ex.toStatus(); } } else if (!isGenericArgument(fieldName)) { return {ErrorCodes::FailedToParse, str::stream() << "unrecognized field '" << elem.fieldName() << "'"}; } } if (explainVerbosity) { if (hasExplainElem) { return { ErrorCodes::FailedToParse, str::stream() << "The '" << kExplainName << "' option is illegal when a explain verbosity is also provided"}; } request.setExplain(explainVerbosity); } // 'hasExplainElem' implies an aggregate command-level explain option, which does not require // a cursor argument. if (!hasCursorElem && !hasExplainElem) { return {ErrorCodes::FailedToParse, str::stream() << "The '" << kCursorName << "' option is required, except for aggregate with the explain argument"}; } if (request.getExplain() && cmdObj[WriteConcernOptions::kWriteConcernField]) { return {ErrorCodes::FailedToParse, str::stream() << "Aggregation explain does not support the'" << WriteConcernOptions::kWriteConcernField << "' option"}; } if (hasNeedsMergeElem && !hasFromMongosElem) { return {ErrorCodes::FailedToParse, str::stream() << "Cannot specify '" << kNeedsMergeName << "' without '" << kFromMongosName << "'"}; } return request; }
ExitCode _initAndListen(int listenPort) { Client::initThread("initandlisten"); _initWireSpec(); auto globalServiceContext = getGlobalServiceContext(); globalServiceContext->setFastClockSource(FastClockSourceFactory::create(Milliseconds(10))); globalServiceContext->setOpObserver(stdx::make_unique<OpObserver>()); DBDirectClientFactory::get(globalServiceContext) .registerImplementation([](OperationContext* txn) { return std::unique_ptr<DBClientBase>(new DBDirectClient(txn)); }); const repl::ReplSettings& replSettings = repl::getGlobalReplicationCoordinator()->getSettings(); { ProcessId pid = ProcessId::getCurrent(); LogstreamBuilder l = log(LogComponent::kControl); l << "MongoDB starting : pid=" << pid << " port=" << serverGlobalParams.port << " dbpath=" << storageGlobalParams.dbpath; if (replSettings.isMaster()) l << " master=" << replSettings.isMaster(); if (replSettings.isSlave()) l << " slave=" << (int)replSettings.isSlave(); const bool is32bit = sizeof(int*) == 4; l << (is32bit ? " 32" : " 64") << "-bit host=" << getHostNameCached() << endl; } DEV log(LogComponent::kControl) << "DEBUG build (which is slower)" << endl; #if defined(_WIN32) VersionInfoInterface::instance().logTargetMinOS(); #endif logProcessDetails(); checked_cast<ServiceContextMongoD*>(getGlobalServiceContext())->createLockFile(); transport::TransportLayerLegacy::Options options; options.port = listenPort; options.ipList = serverGlobalParams.bind_ip; auto sep = stdx::make_unique<ServiceEntryPointMongod>(getGlobalServiceContext()->getTransportLayer()); auto sepPtr = sep.get(); getGlobalServiceContext()->setServiceEntryPoint(std::move(sep)); // Create, start, and attach the TL auto transportLayer = stdx::make_unique<transport::TransportLayerLegacy>(options, sepPtr); auto res = transportLayer->setup(); if (!res.isOK()) { error() << "Failed to set up listener: " << res; return EXIT_NET_ERROR; } std::shared_ptr<DbWebServer> dbWebServer; if (serverGlobalParams.isHttpInterfaceEnabled) { dbWebServer.reset(new DbWebServer(serverGlobalParams.bind_ip, serverGlobalParams.port + 1000, getGlobalServiceContext(), new RestAdminAccess())); if (!dbWebServer->setupSockets()) { error() << "Failed to set up sockets for HTTP interface during startup."; return EXIT_NET_ERROR; } } getGlobalServiceContext()->initializeGlobalStorageEngine(); #ifdef MONGO_CONFIG_WIREDTIGER_ENABLED if (WiredTigerCustomizationHooks::get(getGlobalServiceContext())->restartRequired()) { exitCleanly(EXIT_CLEAN); } #endif // Warn if we detect configurations for multiple registered storage engines in // the same configuration file/environment. if (serverGlobalParams.parsedOpts.hasField("storage")) { BSONElement storageElement = serverGlobalParams.parsedOpts.getField("storage"); invariant(storageElement.isABSONObj()); BSONObj storageParamsObj = storageElement.Obj(); BSONObjIterator i = storageParamsObj.begin(); while (i.more()) { BSONElement e = i.next(); // Ignore if field name under "storage" matches current storage engine. if (storageGlobalParams.engine == e.fieldName()) { continue; } // Warn if field name matches non-active registered storage engine. 
if (getGlobalServiceContext()->isRegisteredStorageEngine(e.fieldName())) { warning() << "Detected configuration for non-active storage engine " << e.fieldName() << " when current storage engine is " << storageGlobalParams.engine; } } } if (!getGlobalServiceContext()->getGlobalStorageEngine()->getSnapshotManager()) { if (moe::startupOptionsParsed.count("replication.enableMajorityReadConcern") && moe::startupOptionsParsed["replication.enableMajorityReadConcern"].as<bool>()) { // Note: we are intentionally only erroring if the user explicitly requested that we // enable majority read concern. We do not error if it is implicitly enabled for // CSRS because a required step in the upgrade procedure can involve an mmapv1 node in // the CSRS in the REMOVED state. This is handled by the TopologyCoordinator. invariant(replSettings.isMajorityReadConcernEnabled()); severe() << "Majority read concern requires a storage engine that supports" << " snapshots, such as wiredTiger. " << storageGlobalParams.engine << " does not support snapshots."; exitCleanly(EXIT_BADOPTIONS); } } logMongodStartupWarnings(storageGlobalParams, serverGlobalParams); { stringstream ss; ss << endl; ss << "*********************************************************************" << endl; ss << " ERROR: dbpath (" << storageGlobalParams.dbpath << ") does not exist." << endl; ss << " Create this directory or give existing directory in --dbpath." << endl; ss << " See http://dochub.mongodb.org/core/startingandstoppingmongo" << endl; ss << "*********************************************************************" << endl; uassert(10296, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.dbpath)); } { stringstream ss; ss << "repairpath (" << storageGlobalParams.repairpath << ") does not exist"; uassert(12590, ss.str().c_str(), boost::filesystem::exists(storageGlobalParams.repairpath)); } // TODO: This should go into a MONGO_INITIALIZER once we have figured out the correct // dependencies. if (snmpInit) { snmpInit(); } if (!storageGlobalParams.readOnly) { boost::filesystem::remove_all(storageGlobalParams.dbpath + "/_tmp/"); } if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalRecoverOnly) return EXIT_NET_ERROR; if (mongodGlobalParams.scriptingEnabled) { ScriptEngine::setup(); } auto startupOpCtx = getGlobalServiceContext()->makeOperationContext(&cc()); repairDatabasesAndCheckVersion(startupOpCtx.get()); if (storageGlobalParams.upgrade) { log() << "finished checking dbs"; exitCleanly(EXIT_CLEAN); } uassertStatusOK(getGlobalAuthorizationManager()->initialize(startupOpCtx.get())); /* this is for security on certain platforms (nonce generation) */ srand((unsigned)(curTimeMicros64() ^ startupSrandTimer.micros())); // The snapshot thread provides historical collection level and lock statistics for use // by the web interface. Only needed when HTTP is enabled. if (serverGlobalParams.isHttpInterfaceEnabled) { statsSnapshotThread.go(); invariant(dbWebServer); stdx::thread web(stdx::bind(&webServerListenThread, dbWebServer)); web.detach(); } AuthorizationManager* globalAuthzManager = getGlobalAuthorizationManager(); if (globalAuthzManager->shouldValidateAuthSchemaOnStartup()) { Status status = authindex::verifySystemIndexes(startupOpCtx.get()); if (!status.isOK()) { log() << redact(status); exitCleanly(EXIT_NEED_UPGRADE); } // SERVER-14090: Verify that auth schema version is schemaVersion26Final. 
int foundSchemaVersion; status = globalAuthzManager->getAuthorizationVersion(startupOpCtx.get(), &foundSchemaVersion); if (!status.isOK()) { log() << "Auth schema version is incompatible: " << "User and role management commands require auth data to have " << "at least schema version " << AuthorizationManager::schemaVersion26Final << " but startup could not verify schema version: " << status; exitCleanly(EXIT_NEED_UPGRADE); } if (foundSchemaVersion < AuthorizationManager::schemaVersion26Final) { log() << "Auth schema version is incompatible: " << "User and role management commands require auth data to have " << "at least schema version " << AuthorizationManager::schemaVersion26Final << " but found " << foundSchemaVersion << ". In order to upgrade " << "the auth schema, first downgrade MongoDB binaries to version " << "2.6 and then run the authSchemaUpgrade command."; exitCleanly(EXIT_NEED_UPGRADE); } } else if (globalAuthzManager->isAuthEnabled()) { error() << "Auth must be disabled when starting without auth schema validation"; exitCleanly(EXIT_BADOPTIONS); } else { // If authSchemaValidation is disabled and server is running without auth, // warn the user and continue startup without authSchema metadata checks. log() << startupWarningsLog; log() << "** WARNING: Startup auth schema validation checks are disabled for the " "database." << startupWarningsLog; log() << "** This mode should only be used to manually repair corrupted auth " "data." << startupWarningsLog; } auto shardingInitialized = uassertStatusOK(ShardingState::get(startupOpCtx.get()) ->initializeShardingAwarenessIfNeeded(startupOpCtx.get())); if (shardingInitialized) { reloadShardRegistryUntilSuccess(startupOpCtx.get()); } if (!storageGlobalParams.readOnly) { logStartup(startupOpCtx.get()); startFTDC(); getDeleter()->startWorkers(); restartInProgressIndexesFromLastShutdown(startupOpCtx.get()); if (serverGlobalParams.clusterRole == ClusterRole::ShardServer) { // Note: For replica sets, ShardingStateRecovery happens on transition to primary. if (!repl::getGlobalReplicationCoordinator()->isReplEnabled()) { uassertStatusOK(ShardingStateRecovery::recover(startupOpCtx.get())); } } else if (serverGlobalParams.clusterRole == ClusterRole::ConfigServer) { uassertStatusOK( initializeGlobalShardingStateForMongod(startupOpCtx.get(), ConnectionString::forLocal(), kDistLockProcessIdForConfigServer)); Balancer::create(startupOpCtx->getServiceContext()); } repl::getGlobalReplicationCoordinator()->startup(startupOpCtx.get()); const unsigned long long missingRepl = checkIfReplMissingFromCommandLine(startupOpCtx.get()); if (missingRepl) { log() << startupWarningsLog; log() << "** WARNING: mongod started without --replSet yet " << missingRepl << " documents are present in local.system.replset" << startupWarningsLog; log() << "** Restart with --replSet unless you are doing maintenance and " << " no other clients are connected." << startupWarningsLog; log() << "** The TTL collection monitor will not start because of this." 
                  << startupWarningsLog;
            log() << "** ";
            log() << " For more info see http://dochub.mongodb.org/core/ttlcollections";
            log() << startupWarningsLog;
        } else {
            startTTLBackgroundJob();
        }

        if (!replSettings.usingReplSets() && !replSettings.isSlave() &&
            storageGlobalParams.engine != "devnull") {
            ScopedTransaction transaction(startupOpCtx.get(), MODE_X);
            Lock::GlobalWrite lk(startupOpCtx.get()->lockState());
            FeatureCompatibilityVersion::setIfCleanStartup(
                startupOpCtx.get(), repl::StorageInterface::get(getGlobalServiceContext()));
        }

        if (replSettings.usingReplSets() || (!replSettings.isMaster() && replSettings.isSlave()) ||
            !internalValidateFeaturesAsMaster) {
            serverGlobalParams.featureCompatibility.validateFeaturesAsMaster.store(false);
        }
    }

    startClientCursorMonitor();

    PeriodicTask::startRunningPeriodicTasks();

    // MessageServer::run will return when exit code closes its socket and we don't need the
    // operation context anymore
    startupOpCtx.reset();

    auto start = getGlobalServiceContext()->addAndStartTransportLayer(std::move(transportLayer));
    if (!start.isOK()) {
        error() << "Failed to start the listener: " << start.toString();
        return EXIT_NET_ERROR;
    }

#ifndef _WIN32
    mongo::signalForkSuccess();
#else
    if (ntservice::shouldStartService()) {
        ntservice::reportStatus(SERVICE_RUNNING);
        log() << "Service running";
    }
#endif

    return waitForShutdown();
}
StatusWith<Shard::CommandResponse> ShardingCatalogManager::_runCommandForAddShard( OperationContext* opCtx, RemoteCommandTargeter* targeter, const std::string& dbName, const BSONObj& cmdObj) { auto swHost = targeter->findHost(opCtx, ReadPreferenceSetting{ReadPreference::PrimaryOnly}); if (!swHost.isOK()) { return swHost.getStatus(); } auto host = std::move(swHost.getValue()); executor::RemoteCommandRequest request( host, dbName, cmdObj, rpc::makeEmptyMetadata(), nullptr, Seconds(30)); executor::RemoteCommandResponse response = Status(ErrorCodes::InternalError, "Internal error running command"); auto swCallbackHandle = _executorForAddShard->scheduleRemoteCommand( request, [&response](const executor::TaskExecutor::RemoteCommandCallbackArgs& args) { response = args.response; }); if (!swCallbackHandle.isOK()) { return swCallbackHandle.getStatus(); } // Block until the command is carried out _executorForAddShard->wait(swCallbackHandle.getValue()); if (response.status == ErrorCodes::ExceededTimeLimit) { LOG(0) << "Operation timed out with status " << redact(response.status); } if (!response.isOK()) { if (!Shard::shouldErrorBePropagated(response.status.code())) { return {ErrorCodes::OperationFailed, str::stream() << "failed to run command " << cmdObj << " when attempting to add shard " << targeter->connectionString().toString() << causedBy(response.status)}; } return response.status; } BSONObj result = response.data.getOwned(); Status commandStatus = getStatusFromCommandResult(result); if (!Shard::shouldErrorBePropagated(commandStatus.code())) { commandStatus = {ErrorCodes::OperationFailed, str::stream() << "failed to run command " << cmdObj << " when attempting to add shard " << targeter->connectionString().toString() << causedBy(commandStatus)}; } Status writeConcernStatus = getWriteConcernStatusFromCommandResult(result); if (!Shard::shouldErrorBePropagated(writeConcernStatus.code())) { writeConcernStatus = {ErrorCodes::OperationFailed, str::stream() << "failed to satisfy writeConcern for command " << cmdObj << " when attempting to add shard " << targeter->connectionString().toString() << causedBy(writeConcernStatus)}; } return Shard::CommandResponse(std::move(host), std::move(result), response.metadata.getOwned(), std::move(commandStatus), std::move(writeConcernStatus)); }
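// Illustrative aside (not part of the server sources): _runCommandForAddShard above turns the
// asynchronous task-executor API into a synchronous call by capturing the response through a
// reference in the callback and then blocking on wait(). Below is a minimal standalone sketch of
// that "schedule, then block for the result" pattern using only the standard library; the names
// runRemoteCommand and runCommandBlocking are hypothetical stand-ins, not MongoDB APIs.
#include <future>
#include <iostream>
#include <string>

// Stand-in for the network round trip performed by the remote command.
std::string runRemoteCommand(const std::string& cmd) {
    return "ok:" + cmd;
}

// Synchronous wrapper: launch the work, then block until it completes, mirroring
// scheduleRemoteCommand(...) followed by _executorForAddShard->wait(handle).
std::string runCommandBlocking(const std::string& cmd) {
    std::future<std::string> pending = std::async(std::launch::async, runRemoteCommand, cmd);
    return pending.get();  // blocks until the "remote" result is available
}

int main() {
    std::cout << runCommandBlocking("ping") << std::endl;  // prints "ok:ping"
    return 0;
}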
Status KeyGenerator::generateNewKeysIfNeeded(OperationContext* opCtx) { if (MONGO_FAIL_POINT(disableKeyGeneration)) { return {ErrorCodes::FailPointEnabled, "key generation disabled"}; } auto currentTime = LogicalClock::get(opCtx)->getClusterTime(); auto keyStatus = _client->getNewKeys(opCtx, _purpose, currentTime); if (!keyStatus.isOK()) { return keyStatus.getStatus(); } const auto& newKeys = keyStatus.getValue(); auto keyIter = newKeys.cbegin(); LogicalTime currentKeyExpiresAt; long long keyId = currentTime.asTimestamp().asLL(); if (keyIter == newKeys.cend()) { currentKeyExpiresAt = addSeconds(currentTime, _keyValidForInterval); auto status = insertNewKey(opCtx, _client, keyId, _purpose, currentKeyExpiresAt); if (!status.isOK()) { return status; } keyId++; } else if (keyIter->getExpiresAt() < currentTime) { currentKeyExpiresAt = addSeconds(currentTime, _keyValidForInterval); auto status = insertNewKey(opCtx, _client, keyId, _purpose, currentKeyExpiresAt); if (!status.isOK()) { return status; } keyId++; ++keyIter; } else { currentKeyExpiresAt = keyIter->getExpiresAt(); ++keyIter; } // Create a new key in advance if we don't have a key on standby after the current one // expires. // Note: Convert this block into a loop if more reserved keys are desired. if (keyIter == newKeys.cend()) { auto reserveKeyExpiresAt = addSeconds(currentKeyExpiresAt, _keyValidForInterval); auto status = insertNewKey(opCtx, _client, keyId, _purpose, reserveKeyExpiresAt); if (!status.isOK()) { return status; } } else if (keyIter->getExpiresAt() < currentTime) { currentKeyExpiresAt = addSeconds(currentKeyExpiresAt, _keyValidForInterval); auto status = insertNewKey(opCtx, _client, keyId, _purpose, currentKeyExpiresAt); if (!status.isOK()) { return status; } } return Status::OK(); }
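// Illustrative aside (not part of the server sources): generateNewKeysIfNeeded above keeps one
// active signing key plus one key on standby, inserting new keys whenever the newest known key
// would not outlive the current one. Below is a simplified standalone sketch of that scheduling
// decision with plain integers standing in for LogicalTime; the names and the single-standby
// policy shown are assumptions for illustration (the real code also refreshes an expired standby).
#include <iostream>
#include <vector>

struct SketchKey {
    long long expiresAt;  // seconds since an arbitrary epoch
};

// Returns the keys that would need to be created so that an unexpired current key and one
// standby key exist.
std::vector<SketchKey> keysToCreate(const std::vector<SketchKey>& existing,
                                    long long now,
                                    long long validInterval) {
    std::vector<SketchKey> created;
    long long currentExpires;
    if (existing.empty() || existing.front().expiresAt < now) {
        currentExpires = now + validInterval;  // no usable current key: make one
        created.push_back({currentExpires});
    } else {
        currentExpires = existing.front().expiresAt;  // reuse the unexpired current key
    }
    if (existing.size() < 2) {
        created.push_back({currentExpires + validInterval});  // keep one key on standby
    }
    return created;
}

int main() {
    const long long now = 1000, interval = 90;
    for (const SketchKey& k : keysToCreate({}, now, interval)) {
        std::cout << "create key expiring at " << k.expiresAt << "\n";  // 1090, then 1180
    }
    return 0;
}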
StatusWith<boost::optional<ShardType>> ShardingCatalogManager::_checkIfShardExists( OperationContext* opCtx, const ConnectionString& proposedShardConnectionString, const std::string* proposedShardName, long long proposedShardMaxSize) { // Check whether any host in the connection is already part of the cluster. const auto existingShards = Grid::get(opCtx)->catalogClient()->getAllShards( opCtx, repl::ReadConcernLevel::kLocalReadConcern); if (!existingShards.isOK()) { return existingShards.getStatus().withContext( "Failed to load existing shards during addShard"); } // Now check if this shard already exists - if it already exists *with the same options* then // the addShard request can return success early without doing anything more. for (const auto& existingShard : existingShards.getValue().value) { auto swExistingShardConnStr = ConnectionString::parse(existingShard.getHost()); if (!swExistingShardConnStr.isOK()) { return swExistingShardConnStr.getStatus(); } auto existingShardConnStr = std::move(swExistingShardConnStr.getValue()); // Function for determining if the options for the shard that is being added match the // options of an existing shard that conflicts with it. auto shardsAreEquivalent = [&]() { if (proposedShardName && *proposedShardName != existingShard.getName()) { return false; } if (proposedShardConnectionString.type() != existingShardConnStr.type()) { return false; } if (proposedShardConnectionString.type() == ConnectionString::SET && proposedShardConnectionString.getSetName() != existingShardConnStr.getSetName()) { return false; } if (proposedShardMaxSize != existingShard.getMaxSizeMB()) { return false; } return true; }; if (existingShardConnStr.type() == ConnectionString::SET && proposedShardConnectionString.type() == ConnectionString::SET && existingShardConnStr.getSetName() == proposedShardConnectionString.getSetName()) { // An existing shard has the same replica set name as the shard being added. // If the options aren't the same, then this is an error, // but if the options match then the addShard operation should be immediately // considered a success and terminated. if (shardsAreEquivalent()) { return {existingShard}; } else { return {ErrorCodes::IllegalOperation, str::stream() << "A shard already exists containing the replica set '" << existingShardConnStr.getSetName() << "'"}; } } for (const auto& existingHost : existingShardConnStr.getServers()) { // Look if any of the hosts in the existing shard are present within the shard trying // to be added. for (const auto& addingHost : proposedShardConnectionString.getServers()) { if (existingHost == addingHost) { // At least one of the hosts in the shard being added already exists in an // existing shard. If the options aren't the same, then this is an error, // but if the options match then the addShard operation should be immediately // considered a success and terminated. if (shardsAreEquivalent()) { return {existingShard}; } else { return {ErrorCodes::IllegalOperation, str::stream() << "'" << addingHost.toString() << "' " << "is already a member of the existing shard '" << existingShard.getHost() << "' (" << existingShard.getName() << ")."}; } } } } if (proposedShardName && *proposedShardName == existingShard.getName()) { // If we get here then we're trying to add a shard with the same name as an existing // shard, but there was no overlap in the hosts between the existing shard and the // proposed connection string for the new shard. 
            return {ErrorCodes::IllegalOperation,
                    str::stream() << "A shard named " << *proposedShardName << " already exists"};
        }
    }

    return {boost::none};
}
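// Illustrative aside (not part of the server sources): _checkIfShardExists above yields one of
// three outcomes -- no overlap (proceed with addShard), an equivalent shard already present
// (treat the retried request as a success), or overlapping hosts with different options (error).
// Below is a standalone sketch of that decision with plain structs standing in for ShardType and
// the connection string; every name here is hypothetical.
#include <iostream>
#include <optional>
#include <string>

struct ShardDesc {
    std::string name;
    std::string setName;
    long long maxSizeMB;
};

enum class AddShardCheck { kProceed, kAlreadyExists, kConflict };

AddShardCheck checkProposedShard(const std::optional<ShardDesc>& overlapping,
                                 const ShardDesc& proposed) {
    if (!overlapping) {
        return AddShardCheck::kProceed;  // no existing shard shares hosts or a set name
    }
    const bool equivalent = overlapping->name == proposed.name &&
        overlapping->setName == proposed.setName && overlapping->maxSizeMB == proposed.maxSizeMB;
    return equivalent ? AddShardCheck::kAlreadyExists  // retry of an already-completed addShard
                      : AddShardCheck::kConflict;      // same hosts but different options
}

int main() {
    const ShardDesc existing{"shard0", "rs0", 0};
    std::cout << (checkProposedShard(existing, {"shard0", "rs0", 0}) ==
                  AddShardCheck::kAlreadyExists)
              << "\n";  // 1: identical options, treated as a no-op success
    std::cout << (checkProposedShard(existing, {"shard0", "rs0", 100}) == AddShardCheck::kConflict)
              << "\n";  // 1: options differ, so the request is rejected
    return 0;
}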
void SyncSourceFeedback::run(BackgroundSync* bgsync) {
    Client::initThread("SyncSourceFeedback");

    // Task executor used to run replSetUpdatePosition command on sync source.
    auto net = executor::makeNetworkInterface("NetworkInterfaceASIO-SyncSourceFeedback");
    auto pool = stdx::make_unique<executor::NetworkInterfaceThreadPool>(net.get());
    executor::ThreadPoolTaskExecutor executor(std::move(pool), std::move(net));
    executor.startup();
    ON_BLOCK_EXIT([&executor]() {
        executor.shutdown();
        executor.join();
    });

    HostAndPort syncTarget;

    // keepAliveInterval indicates how frequently to forward progress in the absence of updates.
    Milliseconds keepAliveInterval(0);

    while (true) {  // breaks once _shutdownSignaled is true
        auto txn = cc().makeOperationContext();

        if (keepAliveInterval == Milliseconds(0)) {
            keepAliveInterval = calculateKeepAliveInterval(txn.get(), _mtx);
        }

        {
            // Take the SyncSourceFeedback lock before calling into ReplicationCoordinator to
            // avoid deadlock, because ReplicationCoordinator could conceivably call back into
            // this class.
            stdx::unique_lock<stdx::mutex> lock(_mtx);
            while (!_positionChanged && !_shutdownSignaled) {
                if (_cond.wait_for(lock, keepAliveInterval.toSystemDuration()) ==
                    stdx::cv_status::timeout) {
                    MemberState state = ReplicationCoordinator::get(txn.get())->getMemberState();
                    if (!(state.primary() || state.startup())) {
                        break;
                    }
                }
            }

            if (_shutdownSignaled) {
                break;
            }

            _positionChanged = false;
        }

        {
            stdx::lock_guard<stdx::mutex> lock(_mtx);
            MemberState state = ReplicationCoordinator::get(txn.get())->getMemberState();
            if (state.primary() || state.startup()) {
                continue;
            }
        }

        const HostAndPort target = bgsync->getSyncTarget();

        // Log sync source changes.
        if (target.empty()) {
            if (syncTarget != target) {
                syncTarget = target;
            }
            // Loop back around again; the keepalive functionality will cause us to retry
            continue;
        }

        if (syncTarget != target) {
            LOG(1) << "setting syncSourceFeedback to " << target;
            syncTarget = target;

            // Update keepalive value from config.
            auto oldKeepAliveInterval = keepAliveInterval;
            keepAliveInterval = calculateKeepAliveInterval(txn.get(), _mtx);
            if (oldKeepAliveInterval != keepAliveInterval) {
                LOG(1) << "new syncSourceFeedback keep alive duration = " << keepAliveInterval
                       << " (previously " << oldKeepAliveInterval << ")";
            }
        }

        Reporter reporter(
            &executor,
            makePrepareReplSetUpdatePositionCommandFn(txn.get(), _mtx, syncTarget, bgsync),
            syncTarget,
            keepAliveInterval);
        {
            stdx::lock_guard<stdx::mutex> lock(_mtx);
            _reporter = &reporter;
        }
        ON_BLOCK_EXIT([this]() {
            stdx::lock_guard<stdx::mutex> lock(_mtx);
            _reporter = nullptr;
        });

        auto status = _updateUpstream(txn.get(), bgsync);
        if (!status.isOK()) {
            LOG(1) << "The replication progress command (replSetUpdatePosition) failed and will "
                      "be retried: "
                   << status;
        }
    }
}
StatusWith<std::string> ShardingCatalogManager::addShard(
    OperationContext* opCtx,
    const std::string* shardProposedName,
    const ConnectionString& shardConnectionString,
    const long long maxSize) {
    if (shardConnectionString.type() == ConnectionString::INVALID) {
        return {ErrorCodes::BadValue, "Invalid connection string"};
    }

    if (shardProposedName && shardProposedName->empty()) {
        return {ErrorCodes::BadValue, "shard name cannot be empty"};
    }

    // Only one addShard operation can be in progress at a time.
    Lock::ExclusiveLock lk(opCtx->lockState(), _kShardMembershipLock);

    // Check if this shard has already been added (can happen in the case of a retry after a
    // network error, for example) and thus this addShard request should be considered a no-op.
    auto existingShard =
        _checkIfShardExists(opCtx, shardConnectionString, shardProposedName, maxSize);
    if (!existingShard.isOK()) {
        return existingShard.getStatus();
    }
    if (existingShard.getValue()) {
        // These hosts already belong to an existing shard, so report success and terminate the
        // addShard request. Make sure to set the last optime for the client to the system last
        // optime so that we'll still wait for replication so that this state is visible in the
        // committed snapshot.
        repl::ReplClientInfo::forClient(opCtx->getClient()).setLastOpToSystemLastOpTime(opCtx);

        return existingShard.getValue()->getName();
    }

    // Force a reload of the ShardRegistry to ensure that, in case this addShard is to re-add a
    // replica set that has recently been removed, we have detached the ReplicaSetMonitor for the
    // set with that setName from the ReplicaSetMonitorManager and will create a new
    // ReplicaSetMonitor when targeting the set below.
    // Note: This is necessary because as of 3.4, removeShard is performed by mongos (unlike
    // addShard), so the ShardRegistry is not synchronously reloaded on the config server when a
    // shard is removed.
    if (!Grid::get(opCtx)->shardRegistry()->reload(opCtx)) {
        // If the first reload joined an existing one, call reload again to ensure the reload is
        // fresh.
        Grid::get(opCtx)->shardRegistry()->reload(opCtx);
    }

    // TODO: Don't create a detached Shard object, create a detached RemoteCommandTargeter
    // instead.
    const std::shared_ptr<Shard> shard{
        Grid::get(opCtx)->shardRegistry()->createConnection(shardConnectionString)};
    invariant(shard);
    auto targeter = shard->getTargeter();

    auto stopMonitoringGuard = MakeGuard([&] {
        if (shardConnectionString.type() == ConnectionString::SET) {
            // This is a workaround for the case where we could have some bad shard being
            // requested to be added and we put that bad connection string on the global replica
            // set monitor registry. It needs to be cleaned up so that when a correct replica set
            // is added, it will be recreated.
ReplicaSetMonitor::remove(shardConnectionString.getSetName()); } }); // Validate the specified connection string may serve as shard at all auto shardStatus = _validateHostAsShard(opCtx, targeter, shardProposedName, shardConnectionString); if (!shardStatus.isOK()) { return shardStatus.getStatus(); } ShardType& shardType = shardStatus.getValue(); // Check that none of the existing shard candidate's dbs exist already auto dbNamesStatus = _getDBNamesListFromShard(opCtx, targeter); if (!dbNamesStatus.isOK()) { return dbNamesStatus.getStatus(); } for (const auto& dbName : dbNamesStatus.getValue()) { auto dbt = Grid::get(opCtx)->catalogClient()->getDatabase( opCtx, dbName, repl::ReadConcernLevel::kLocalReadConcern); if (dbt.isOK()) { const auto& dbDoc = dbt.getValue().value; return Status(ErrorCodes::OperationFailed, str::stream() << "can't add shard " << "'" << shardConnectionString.toString() << "'" << " because a local database '" << dbName << "' exists in another " << dbDoc.getPrimary()); } else if (dbt != ErrorCodes::NamespaceNotFound) { return dbt.getStatus(); } } // Check that the shard candidate does not have a local config.system.sessions collection auto res = _dropSessionsCollection(opCtx, targeter); if (!res.isOK()) { return res.withContext( "can't add shard with a local copy of config.system.sessions, please drop this " "collection from the shard manually and try again."); } // If a name for a shard wasn't provided, generate one if (shardType.getName().empty()) { auto result = generateNewShardName(opCtx); if (!result.isOK()) { return result.getStatus(); } shardType.setName(result.getValue()); } if (maxSize > 0) { shardType.setMaxSizeMB(maxSize); } // Insert a shardIdentity document onto the shard. This also triggers sharding initialization on // the shard. LOG(2) << "going to insert shardIdentity document into shard: " << shardType; auto commandRequest = createShardIdentityUpsertForAddShard(opCtx, shardType.getName()); auto swCommandResponse = _runCommandForAddShard(opCtx, targeter.get(), "admin", commandRequest); if (!swCommandResponse.isOK()) { return swCommandResponse.getStatus(); } auto commandResponse = std::move(swCommandResponse.getValue()); BatchedCommandResponse batchResponse; auto batchResponseStatus = Shard::CommandResponse::processBatchWriteResponse(commandResponse, &batchResponse); if (!batchResponseStatus.isOK()) { return batchResponseStatus; } // The featureCompatibilityVersion should be the same throughout the cluster. We don't // explicitly send writeConcern majority to the added shard, because a 3.4 mongod will reject // it (setFCV did not support writeConcern until 3.6), and a 3.6 mongod will still default to // majority writeConcern. 
// // TODO SERVER-32045: propagate the user's writeConcern auto versionResponse = _runCommandForAddShard( opCtx, targeter.get(), "admin", BSON(FeatureCompatibilityVersion::kCommandName << FeatureCompatibilityVersion::toString( serverGlobalParams.featureCompatibility.getVersion()))); if (!versionResponse.isOK()) { return versionResponse.getStatus(); } if (!versionResponse.getValue().commandStatus.isOK()) { return versionResponse.getValue().commandStatus; } log() << "going to insert new entry for shard into config.shards: " << shardType.toString(); Status result = Grid::get(opCtx)->catalogClient()->insertConfigDocument( opCtx, ShardType::ConfigNS, shardType.toBSON(), ShardingCatalogClient::kMajorityWriteConcern); if (!result.isOK()) { log() << "error adding shard: " << shardType.toBSON() << " err: " << result.reason(); return result; } // Add all databases which were discovered on the new shard for (const auto& dbName : dbNamesStatus.getValue()) { DatabaseType dbt(dbName, shardType.getName(), false); Status status = Grid::get(opCtx)->catalogClient()->updateDatabase(opCtx, dbName, dbt); if (!status.isOK()) { log() << "adding shard " << shardConnectionString.toString() << " even though could not add database " << dbName; } } // Record in changelog BSONObjBuilder shardDetails; shardDetails.append("name", shardType.getName()); shardDetails.append("host", shardConnectionString.toString()); Grid::get(opCtx) ->catalogClient() ->logChange( opCtx, "addShard", "", shardDetails.obj(), ShardingCatalogClient::kMajorityWriteConcern) .transitional_ignore(); // Ensure the added shard is visible to this process. auto shardRegistry = Grid::get(opCtx)->shardRegistry(); if (!shardRegistry->getShard(opCtx, shardType.getName()).isOK()) { return {ErrorCodes::OperationFailed, "Could not find shard metadata for shard after adding it. This most likely " "indicates that the shard was removed immediately after it was added."}; } stopMonitoringGuard.Dismiss(); return shardType.getName(); }
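// Illustrative aside (not part of the server sources): addShard above arms stopMonitoringGuard so
// that any early return removes the ReplicaSetMonitor it registered, and only dismisses the guard
// once every step has succeeded. Below is a minimal standalone sketch of that dismissable RAII
// cleanup guard; ScopeGuardSketch is a hypothetical name, not the MakeGuard implementation.
#include <functional>
#include <iostream>
#include <utility>

// Runs the cleanup on scope exit unless dismiss() was called first.
class ScopeGuardSketch {
public:
    explicit ScopeGuardSketch(std::function<void()> cleanup) : _cleanup(std::move(cleanup)) {}
    ~ScopeGuardSketch() {
        if (_armed) {
            _cleanup();
        }
    }
    void dismiss() {
        _armed = false;
    }

private:
    std::function<void()> _cleanup;
    bool _armed = true;
};

bool addSomething(bool failPartWay) {
    ScopeGuardSketch undo([] { std::cout << "rolling back partial work\n"; });
    if (failPartWay) {
        return false;  // early return: the guard fires and undoes the partial work
    }
    undo.dismiss();  // success: keep the work and skip the cleanup
    return true;
}

int main() {
    addSomething(true);   // prints the rollback message
    addSomething(false);  // prints nothing
    return 0;
}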
void OplogReader::connectToSyncSource(OperationContext* txn, const OpTime& lastOpTimeFetched, ReplicationCoordinator* replCoord) { const Timestamp sentinelTimestamp(duration_cast<Seconds>(Milliseconds(curTimeMillis64())), 0); const OpTime sentinel(sentinelTimestamp, std::numeric_limits<long long>::max()); OpTime oldestOpTimeSeen = sentinel; invariant(conn() == NULL); while (true) { HostAndPort candidate = replCoord->chooseNewSyncSource(lastOpTimeFetched.getTimestamp()); if (candidate.empty()) { if (oldestOpTimeSeen == sentinel) { // If, in this invocation of connectToSyncSource(), we did not successfully // connect to any node ahead of us, // we apparently have no sync sources to connect to. // This situation is common; e.g. if there are no writes to the primary at // the moment. return; } // Connected to at least one member, but in all cases we were too stale to use them // as a sync source. error() << "too stale to catch up -- entering maintenance mode"; log() << "our last optime : " << lastOpTimeFetched; log() << "oldest available is " << oldestOpTimeSeen; log() << "See http://dochub.mongodb.org/core/resyncingaverystalereplicasetmember"; setMinValid(txn, {lastOpTimeFetched, oldestOpTimeSeen}); auto status = replCoord->setMaintenanceMode(true); if (!status.isOK()) { warning() << "Failed to transition into maintenance mode."; } bool worked = replCoord->setFollowerMode(MemberState::RS_RECOVERING); if (!worked) { warning() << "Failed to transition into " << MemberState(MemberState::RS_RECOVERING) << ". Current state: " << replCoord->getMemberState(); } return; } if (!connect(candidate)) { LOG(2) << "can't connect to " << candidate.toString() << " to read operations"; resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t::now() + Seconds(10)); continue; } // Read the first (oldest) op and confirm that it's not newer than our last // fetched op. Otherwise, we have fallen off the back of that source's oplog. BSONObj remoteOldestOp(findOne(rsOplogName.c_str(), Query())); OpTime remoteOldOpTime = fassertStatusOK(28776, OpTime::parseFromOplogEntry(remoteOldestOp)); // remoteOldOpTime may come from a very old config, so we cannot compare their terms. if (!lastOpTimeFetched.isNull() && lastOpTimeFetched.getTimestamp() < remoteOldOpTime.getTimestamp()) { // We're too stale to use this sync source. resetConnection(); replCoord->blacklistSyncSource(candidate, Date_t::now() + Minutes(1)); if (oldestOpTimeSeen.getTimestamp() > remoteOldOpTime.getTimestamp()) { warning() << "we are too stale to use " << candidate.toString() << " as a sync source"; oldestOpTimeSeen = remoteOldOpTime; } continue; } // TODO: If we were too stale (recovering with maintenance mode on), then turn it off, to // allow becoming secondary/etc. // Got a valid sync source. return; } // while (true) }
StatusWith<BalancerSettingsType> BalancerSettingsType::fromBSON(const BSONObj& obj) { BalancerSettingsType settings; { bool stopped; Status status = bsonExtractBooleanFieldWithDefault(obj, kStopped, false, &stopped); if (!status.isOK()) return status; if (stopped) { settings._mode = kOff; } else { std::string modeStr; status = bsonExtractStringFieldWithDefault(obj, kMode, kBalancerModes[kFull], &modeStr); if (!status.isOK()) return status; auto it = std::find(std::begin(kBalancerModes), std::end(kBalancerModes), modeStr); if (it == std::end(kBalancerModes)) { return Status(ErrorCodes::BadValue, "Invalid balancer mode"); } settings._mode = static_cast<BalancerMode>(it - std::begin(kBalancerModes)); } } { BSONElement activeWindowElem; Status status = bsonExtractTypedField(obj, kActiveWindow, Object, &activeWindowElem); if (status.isOK()) { const BSONObj balancingWindowObj = activeWindowElem.Obj(); if (balancingWindowObj.isEmpty()) { return Status(ErrorCodes::BadValue, "activeWindow not specified"); } // Check if both 'start' and 'stop' are present const std::string start = balancingWindowObj.getField("start").str(); const std::string stop = balancingWindowObj.getField("stop").str(); if (start.empty() || stop.empty()) { return Status(ErrorCodes::BadValue, str::stream() << "must specify both start and stop of balancing window: " << balancingWindowObj); } // Check that both 'start' and 'stop' are valid time-of-day boost::posix_time::ptime startTime; boost::posix_time::ptime stopTime; if (!toPointInTime(start, &startTime) || !toPointInTime(stop, &stopTime)) { return Status(ErrorCodes::BadValue, str::stream() << kActiveWindow << " format is " << " { start: \"hh:mm\" , stop: \"hh:mm\" }"); } // Check that start and stop designate different time points if (startTime == stopTime) { return Status(ErrorCodes::BadValue, str::stream() << "start and stop times must be different"); } settings._activeWindowStart = startTime; settings._activeWindowStop = stopTime; } else if (status != ErrorCodes::NoSuchKey) { return status; } } { auto secondaryThrottleStatus = MigrationSecondaryThrottleOptions::createFromBalancerConfig(obj); if (!secondaryThrottleStatus.isOK()) { return secondaryThrottleStatus.getStatus(); } settings._secondaryThrottle = std::move(secondaryThrottleStatus.getValue()); } { bool waitForDelete; Status status = bsonExtractBooleanFieldWithDefault(obj, kWaitForDelete, false, &waitForDelete); if (!status.isOK()) return status; settings._waitForDelete = waitForDelete; } return settings; }
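// Illustrative aside (not part of the server sources): fromBSON above only validates that the
// activeWindow's start and stop parse and differ; whether "now" actually falls inside the window,
// including windows that wrap past midnight such as { start: "23:00", stop: "6:00" }, is decided
// elsewhere. Below is a standalone sketch of that membership test on minutes-since-midnight; the
// semantics shown are an assumption for illustration, not the server's implementation.
#include <iostream>

// start, stop and now are minutes since midnight; start != stop is assumed, matching the
// validation performed in fromBSON above.
bool inBalancingWindow(int start, int stop, int now) {
    if (start < stop) {
        return start <= now && now < stop;  // same-day window, e.g. 09:00-17:00
    }
    return now >= start || now < stop;  // window wraps midnight, e.g. 23:00-06:00
}

int main() {
    std::cout << inBalancingWindow(23 * 60, 6 * 60, 1 * 60) << "\n";   // 1: 01:00 is inside
    std::cout << inBalancingWindow(23 * 60, 6 * 60, 12 * 60) << "\n";  // 0: noon is outside
    return 0;
}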
mongo::Status mongo::cloneCollectionAsCapped(OperationContext* opCtx, Database* db, const std::string& shortFrom, const std::string& shortTo, long long size, bool temp) { NamespaceString fromNss(db->name(), shortFrom); NamespaceString toNss(db->name(), shortTo); Collection* fromCollection = db->getCollection(opCtx, fromNss); if (!fromCollection) { if (db->getViewCatalog()->lookup(opCtx, fromNss.ns())) { return Status(ErrorCodes::CommandNotSupportedOnView, str::stream() << "cloneCollectionAsCapped not supported for views: " << fromNss.ns()); } return Status(ErrorCodes::NamespaceNotFound, str::stream() << "source collection " << fromNss.ns() << " does not exist"); } if (fromNss.isDropPendingNamespace()) { return Status(ErrorCodes::NamespaceNotFound, str::stream() << "source collection " << fromNss.ns() << " is currently in a drop-pending state."); } if (db->getCollection(opCtx, toNss)) { return Status(ErrorCodes::NamespaceExists, str::stream() << "cloneCollectionAsCapped failed - destination collection " << toNss.ns() << " already exists. source collection: " << fromNss.ns()); } // create new collection { auto options = fromCollection->getCatalogEntry()->getCollectionOptions(opCtx); // The capped collection will get its own new unique id, as the conversion isn't reversible, // so it can't be rolled back. options.uuid.reset(); options.capped = true; options.cappedSize = size; if (temp) options.temp = true; BSONObjBuilder cmd; cmd.append("create", toNss.coll()); cmd.appendElements(options.toBSON()); Status status = createCollection(opCtx, toNss.db().toString(), cmd.done()); if (!status.isOK()) return status; } Collection* toCollection = db->getCollection(opCtx, toNss); invariant(toCollection); // we created above // how much data to ignore because it won't fit anyway // datasize and extentSize can't be compared exactly, so add some padding to 'size' long long allocatedSpaceGuess = std::max(static_cast<long long>(size * 2), static_cast<long long>(toCollection->getRecordStore()->storageSize(opCtx) * 2)); long long excessSize = fromCollection->dataSize(opCtx) - allocatedSpaceGuess; auto exec = InternalPlanner::collectionScan(opCtx, fromNss.ns(), fromCollection, PlanExecutor::WRITE_CONFLICT_RETRY_ONLY, InternalPlanner::FORWARD); Snapshotted<BSONObj> objToClone; RecordId loc; PlanExecutor::ExecState state = PlanExecutor::FAILURE; // suppress uninitialized warnings DisableDocumentValidation validationDisabler(opCtx); int retries = 0; // non-zero when retrying our last document. while (true) { if (!retries) { state = exec->getNextSnapshotted(&objToClone, &loc); } switch (state) { case PlanExecutor::IS_EOF: return Status::OK(); case PlanExecutor::ADVANCED: { if (excessSize > 0) { // 4x is for padding, power of 2, etc... excessSize -= (4 * objToClone.value().objsize()); continue; } break; } default: // Unreachable as: // 1) We require a read lock (at a minimum) on the "from" collection // and won't yield, preventing collection drop and PlanExecutor::DEAD // 2) PlanExecutor::FAILURE is only returned on PlanStage::FAILURE. The // CollectionScan PlanStage does not have a FAILURE scenario. // 3) All other PlanExecutor states are handled above MONGO_UNREACHABLE; } try { // Make sure we are working with the latest version of the document. if (objToClone.snapshotId() != opCtx->recoveryUnit()->getSnapshotId() && !fromCollection->findDoc(opCtx, loc, &objToClone)) { // doc was deleted so don't clone it. 
retries = 0; continue; } WriteUnitOfWork wunit(opCtx); OpDebug* const nullOpDebug = nullptr; uassertStatusOK(toCollection->insertDocument( opCtx, InsertStatement(objToClone.value()), nullOpDebug, true)); wunit.commit(); // Go to the next document retries = 0; } catch (const WriteConflictException&) { CurOp::get(opCtx)->debug().writeConflicts++; retries++; // logAndBackoff expects this to be 1 on first call. WriteConflictException::logAndBackoff(retries, "cloneCollectionAsCapped", fromNss.ns()); // Can't use writeConflictRetry since we need to save/restore exec around call to // abandonSnapshot. exec->saveState(); opCtx->recoveryUnit()->abandonSnapshot(); auto restoreStatus = exec->restoreState(); // Handles any WCEs internally. if (!restoreStatus.isOK()) { return restoreStatus; } } } MONGO_UNREACHABLE; }
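// Illustrative aside (not part of the server sources): the copy loop above retries the same
// document after a WriteConflictException, backing off before each retry instead of advancing the
// scan. Below is a minimal standalone sketch of retry-with-backoff around an operation that may
// conflict; the exception type and the sleep-based backoff are stand-ins for
// WriteConflictException and logAndBackoff, not the real helpers.
#include <chrono>
#include <iostream>
#include <stdexcept>
#include <thread>

struct SketchWriteConflict : std::runtime_error {
    SketchWriteConflict() : std::runtime_error("write conflict") {}
};

// Retries op() on SketchWriteConflict, sleeping a little longer after each failed attempt.
template <typename Op>
void retryOnWriteConflict(Op op) {
    for (int attempt = 1;; ++attempt) {
        try {
            op();
            return;  // success: the caller moves on to the next document
        } catch (const SketchWriteConflict&) {
            std::this_thread::sleep_for(std::chrono::milliseconds(attempt));  // crude backoff
        }
    }
}

int main() {
    int failuresLeft = 2;
    retryOnWriteConflict([&] {
        if (failuresLeft-- > 0) {
            throw SketchWriteConflict();  // simulate two conflicting attempts
        }
        std::cout << "document copied\n";
    });
    return 0;
}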