/**
 * Works out which read preference applies to 'cmdObj'.
 *
 * If 'cmdObj' has an object-typed QueryRequest::kUnwrappedReadPrefField, the read preference is
 * parsed (via ReadPreferenceSetting::fromBSON) from the QueryRequest::kWrappedReadPrefField
 * object nested inside it, and any parse error is returned. If the field is absent, the result
 * is SecondaryPreferred when 'isSlaveOk' is true and PrimaryOnly otherwise, with an empty TagSet.
 * Any other extraction failure (e.g. the field has the wrong type) is returned as-is.
 */
StatusWith<ReadPreferenceSetting> ClusterFind::extractUnwrappedReadPref(const BSONObj& cmdObj,
                                                                        const bool isSlaveOk) {
    BSONElement unwrappedElt;
    auto extractStatus = bsonExtractTypedField(
        cmdObj, QueryRequest::kUnwrappedReadPrefField, BSONType::Object, &unwrappedElt);

    if (!extractStatus.isOK()) {
        if (extractStatus != ErrorCodes::NoSuchKey) {
            return extractStatus;
        }

        // No explicit read preference was supplied, so fall back on the slave ok bit.
        ReadPreference fallbackMode = mongo::ReadPreference::PrimaryOnly;
        if (isSlaveOk) {
            fallbackMode = mongo::ReadPreference::SecondaryPreferred;
        }
        return ReadPreferenceSetting(fallbackMode, TagSet());
    }

    // When the outer field is present it must wrap a nested object holding the read preference.
    BSONObj unwrappedObj = unwrappedElt.Obj();
    BSONElement wrappedElt = unwrappedObj[QueryRequest::kWrappedReadPrefField];
    invariant(wrappedElt.type() == BSONType::Object);

    auto parsed = ReadPreferenceSetting::fromBSON(wrappedElt.Obj());
    if (!parsed.isOK()) {
        return parsed.getStatus();
    }
    return parsed.getValue();
}
/**
 * Returns a connection to a node selected with a SecondaryPreferred read preference and an
 * empty TagSet. Raises user assertion 16369 if selectNodeUsingTags() yields no connection.
 */
DBClientConnection& DBClientReplicaSet::slaveConn() {
    shared_ptr<ReadPreferenceSetting> secondaryPreferred(
        new ReadPreferenceSetting(ReadPreference_SecondaryPreferred, TagSet()));

    DBClientConnection* selected = selectNodeUsingTags(secondaryPreferred);
    const bool haveNode = (selected != NULL);

    // The message expression is left inside the uassert so it is only built where the
    // assertion macro chooses to evaluate it.
    uassert(16369,
            str::stream() << "No good nodes available for set: " << _getMonitor()->getName(),
            haveNode);

    return *selected;
}
/**
 * Builds a ReadPreferenceSetting from 'readPrefObj'.
 *
 * The object must contain a string field kModeFieldName that parseReadPreferenceMode() accepts.
 * It may contain an array field kTagsFieldName; when that field is missing, the default tag set
 * for the mode is used. Returns a non-OK status if the mode is missing or invalid, if the tags
 * field has the wrong type, or if non-empty tags are combined with the PrimaryOnly mode.
 */
StatusWith<ReadPreferenceSetting> ReadPreferenceSetting::fromBSON(const BSONObj& readPrefObj) {
    // --- mode ---
    std::string modeName;
    auto modeStatus = bsonExtractStringField(readPrefObj, kModeFieldName, &modeName);
    if (!modeStatus.isOK()) {
        return modeStatus;
    }

    auto parsedMode = parseReadPreferenceMode(modeName);
    if (!parsedMode.isOK()) {
        return parsedMode.getStatus();
    }
    ReadPreference mode = std::move(parsedMode.getValue());

    // --- tags ---
    TagSet tags;
    BSONElement tagsElem;
    auto tagsStatus =
        bsonExtractTypedField(readPrefObj, kTagsFieldName, mongo::Array, &tagsElem);

    if (!tagsStatus.isOK()) {
        if (ErrorCodes::NoSuchKey == tagsStatus) {
            // No tags supplied at all: use the mode's default.
            tags = defaultTagSetForMode(mode);
            return ReadPreferenceSetting(mode, tags);
        }
        return tagsStatus;
    }

    tags = TagSet{BSONArray(tagsElem.Obj().getOwned())};

    // Per the read preference spec, the default wildcard tagset '[{}]' means the same as not
    // passing a TagSet, and an empty TagSet with a non-primary mode is equivalent to the
    // wildcard. Both cases therefore collapse to the mode's default tag set.
    const bool isDefaultOrEmpty = (tags == TagSet() || tags == TagSet::primaryOnly());
    if (isDefaultOrEmpty) {
        tags = defaultTagSetForMode(mode);
    } else if (ReadPreference::PrimaryOnly == mode && (tags != TagSet::primaryOnly())) {
        // A user-supplied TagSet has to be compatible with the requested mode.
        return Status(ErrorCodes::BadValue,
                      "Only empty tags are allowed with primary read preference");
    }

    return ReadPreferenceSetting(mode, tags);
}
/**
 * Builds a ReadPreferenceSetting from 'readPrefObj'.
 *
 * Fields read:
 *  - kModeFieldName (string, required): must be accepted by parseReadPreferenceMode().
 *  - kTagsFieldName (array, optional): defaults to the mode's default tag set when missing.
 *  - kMaxStalenessSecondsFieldName (integer, optional): defaults to 0, which skips all of the
 *    staleness range checks below.
 *
 * Returns a non-OK status when a field is missing/mistyped, when non-empty tags are combined
 * with the PrimaryOnly mode, or when a non-zero max staleness is negative, too large, below
 * kMinimalMaxStalenessValue, or combined with the PrimaryOnly mode.
 */
StatusWith<ReadPreferenceSetting> ReadPreferenceSetting::fromInnerBSON(const BSONObj& readPrefObj) {
    // --- mode ---
    std::string modeName;
    auto modeStatus = bsonExtractStringField(readPrefObj, kModeFieldName, &modeName);
    if (!modeStatus.isOK()) {
        return modeStatus;
    }

    auto parsedMode = parseReadPreferenceMode(modeName);
    if (!parsedMode.isOK()) {
        return parsedMode.getStatus();
    }
    ReadPreference mode = std::move(parsedMode.getValue());

    // --- tags ---
    TagSet tags;
    BSONElement tagsElem;
    auto tagsStatus =
        bsonExtractTypedField(readPrefObj, kTagsFieldName, mongo::Array, &tagsElem);

    if (tagsStatus.isOK()) {
        tags = TagSet{BSONArray(tagsElem.Obj().getOwned())};

        // Per the read preference spec, the default wildcard tagset '[{}]' means the same as
        // not passing a TagSet, and an empty TagSet with a non-primary mode is equivalent to
        // the wildcard. Both cases collapse to the mode's default tag set.
        const bool isDefaultOrEmpty = (tags == TagSet() || tags == TagSet::primaryOnly());
        if (isDefaultOrEmpty) {
            tags = defaultTagSetForMode(mode);
        } else if (ReadPreference::PrimaryOnly == mode && (tags != TagSet::primaryOnly())) {
            // A user-supplied TagSet has to be compatible with the requested mode.
            return Status(ErrorCodes::BadValue,
                          "Only empty tags are allowed with primary read preference");
        }
    } else if (ErrorCodes::NoSuchKey == tagsStatus) {
        tags = defaultTagSetForMode(mode);
    } else {
        return tagsStatus;
    }

    // --- max staleness ---
    long long maxStalenessSeconds = 0;
    auto stalenessStatus = bsonExtractIntegerFieldWithDefault(
        readPrefObj, kMaxStalenessSecondsFieldName, 0, &maxStalenessSeconds);
    if (!stalenessStatus.isOK()) {
        return stalenessStatus;
    }

    // Zero means "not set"; every validation below applies to non-zero values only.
    if (maxStalenessSeconds != 0) {
        if (maxStalenessSeconds < 0) {
            return Status(ErrorCodes::BadValue,
                          str::stream() << kMaxStalenessSecondsFieldName
                                        << " must be a non-negative integer");
        }

        if (maxStalenessSeconds >= Seconds::max().count()) {
            return Status(ErrorCodes::BadValue,
                          str::stream() << kMaxStalenessSecondsFieldName
                                        << " value can not exceed "
                                        << Seconds::max().count());
        }

        if (maxStalenessSeconds < kMinimalMaxStalenessValue.count()) {
            return Status(ErrorCodes::MaxStalenessOutOfRange,
                          str::stream() << kMaxStalenessSecondsFieldName
                                        << " value can not be less than "
                                        << kMinimalMaxStalenessValue.count());
        }

        if (mode == ReadPreference::PrimaryOnly) {
            return Status(ErrorCodes::BadValue,
                          str::stream() << kMaxStalenessSecondsFieldName
                                        << " can not be set for the primary mode");
        }
    }

    return ReadPreferenceSetting(mode, tags, Seconds(maxStalenessSeconds));
}
/**
 * Handles an OP_QUERY style find received by mongos and sends the reply for 'request'.
 *
 * Steps, in order:
 *  1. Checks find authorization for the namespace and logs the check for auditing.
 *  2. Rejects commands that reach this path and the 'exhaust' query option.
 *  3. If 'useClusterClientCursor' is set, runs the query (or explain) through ClusterFind and
 *     replies with the first batch.
 *  4. Otherwise runs the legacy path built on ParallelSortClusteredCursor: a multi-shard (or
 *     zero-shard) result is managed by a ShardedClientCursor, while a single-shard result
 *     forwards the shard's own reply.
 *
 * Failures are reported by throwing (uassert/uasserted/verify/UserException).
 *
 * @param txn      operation context; used for the client, cursor init and shard lookup.
 * @param request  the incoming request; also used to send the reply.
 */
void Strategy::queryOp(OperationContext* txn, Request& request) {
    verify(!NamespaceString(request.getns()).isCommand());

    Timer queryTimer;

    globalOpCounters.gotQuery();

    QueryMessage q(request.d());

    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForFind(ns, false);
    // The audit entry is written before the status is asserted, so failed checks are logged too.
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor
    // code path.
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {
        // Determine the default read preference mode based on the value of the slaveOk flag.
        ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk)
            ? ReadPreference::SecondaryPreferred
            : ReadPreference::PrimaryOnly;
        ReadPreferenceSetting readPreference(readPreferenceOption, TagSet());

        // An explicit read preference embedded in the query overrides the flag-derived default.
        BSONElement rpElem;
        auto readPrefExtractStatus = bsonExtractTypedField(
            q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem);

        if (readPrefExtractStatus.isOK()) {
            auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
            uassertStatusOK(parsedRps.getStatus());
            readPreference = parsedRps.getValue();
        } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
            uassertStatusOK(readPrefExtractStatus);
        }

        auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop());
        uassertStatusOK(canonicalQuery.getStatus());

        // If the $explain flag was set, we must run the operation on the shards as an explain
        // command rather than a find command.
        if (canonicalQuery.getValue()->getParsed().isExplain()) {
            const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed();
            BSONObj findCommand = lpq.asFindCommand();

            // We default to allPlansExecution verbosity.
            auto verbosity = ExplainCommon::EXEC_ALL_PLANS;

            const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly);
            rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference);

            BSONObjBuilder explainBuilder;
            uassertStatusOK(ClusterFind::runExplain(
                txn, findCommand, lpq, verbosity, metadata, &explainBuilder));

            BSONObj explainObj = explainBuilder.done();
            replyToQuery(0,  // query result flags
                         request.p(),
                         request.m(),
                         static_cast<const void*>(explainObj.objdata()),
                         explainObj.objsize(),
                         1,  // numResults
                         0,  // startingFrom
                         CursorId(0));
            return;
        }

        // Do the work to generate the first batch of results. This blocks waiting to get responses
        // from the shard(s).
        std::vector<BSONObj> batch;

        // 0 means the cursor is exhausted and otherwise we assume that a cursor with the returned
        // id can be retrieved via the ClusterCursorManager.
        auto cursorId =
            ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
        uassertStatusOK(cursorId.getStatus());

        // TODO: this constant should be shared between mongos and mongod, and should
        // not be inside ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        // Fill out the response buffer.
        int numResults = 0;
        for (const auto& obj : batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            numResults++;
        }

        replyToQuery(0,  // query result flags
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     0,  // startingFrom
                     cursorId.getValue());
        return;
    }

    QuerySpec qSpec(string(q.ns), q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions);

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query);
    uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK());

    if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) {
        return;
    }

    // The cursor is owned by a unique_ptr from the moment it is created, so every early return
    // and every exception below frees it without a manual try/catch + delete. No null check is
    // needed after 'new': a failed allocation throws instead of returning null.
    unique_ptr<ParallelSortClusteredCursor> cursor(
        new ParallelSortClusteredCursor(qSpec, CommandInfo()));

    // TODO:  Move out to Request itself, not strategy based
    cursor->init(txn);

    if (qSpec.isExplain()) {
        BSONObjBuilder explain_builder;
        cursor->explain(explain_builder);
        explain_builder.appendNumber("executionTimeMillis",
                                     static_cast<long long>(queryTimer.millis()));
        BSONObj b = explain_builder.obj();

        replyToQuery(0, request.p(), request.m(), b);
        return;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache

    if (cursor->getNumQueryShards() != 1) {
        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.

        // NOTE(review): ownership of the raw cursor is handed over here; this assumes
        // ShardedClientCursor deletes the cursor it is given — confirm against its destructor.
        ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor.release()));

        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if (hasMore) {
            LOG(5) << "storing cursor : " << cc->getId();

            // Remaining time budget for the stored cursor, derived from $maxTimeMS.
            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if (maxTimeMS.getValue() == 0) {  // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            } else if (cursorLeftoverMillis <= 0) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store(cc, cursorLeftoverMillis);
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cc->getId() : 0);
    } else {
        // Only one shard is used

        // Remote cursors are stored remotely, we shouldn't need this around; 'cursor' is
        // destroyed automatically when this function returns.

        ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId());
        verify(shard.get());
        DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId());

        // Implicitly stores the cursor in the cache
        request.reply(*(shardCursor->getMessage()), shardCursor->originalHost());

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
void DBClientReplicaSet::_auth( const BSONObj& params ) { // We prefer to authenticate against a primary, but otherwise a secondary is ok too // Empty tag matches every secondary shared_ptr<ReadPreferenceSetting> readPref( new ReadPreferenceSetting( ReadPreference_PrimaryPreferred, TagSet() ) ); LOG(3) << "dbclient_rs authentication of " << _getMonitor()->getName() << endl; // NOTE that we retry MAX_RETRY + 1 times, since we're always primary preferred we don't // fallback to the primary. Status lastNodeStatus = Status::OK(); for ( size_t retry = 0; retry < MAX_RETRY + 1; retry++ ) { try { DBClientConnection* conn = selectNodeUsingTags( readPref ); if ( conn == NULL ) { break; } conn->auth( params ); // Cache the new auth information since we now validated it's good _auths[params[saslCommandUserDBFieldName].str()] = params.getOwned(); // Ensure the only child connection open is the one we authenticated against - other // child connections may not have full authentication information. // NOTE: _lastSlaveOkConn may or may not be the same as _master dassert(_lastSlaveOkConn.get() == conn || _master.get() == conn); if ( conn != _lastSlaveOkConn.get() ) { _lastSlaveOkHost = HostAndPort(); _lastSlaveOkConn.reset(); } if ( conn != _master.get() ) { _masterHost = HostAndPort(); _master.reset(); } return; } catch ( const DBException &ex ) { // We care if we can't authenticate (i.e. bad password) in credential params. if ( isAuthenticationException( ex ) ) { throw; } StringBuilder errMsgB; errMsgB << "can't authenticate against replica set node " << _lastSlaveOkHost.toString(); lastNodeStatus = ex.toStatus( errMsgB.str() ); LOG(1) << lastNodeStatus.reason() << endl; invalidateLastSlaveOkCache(); } } if ( lastNodeStatus.isOK() ) { StringBuilder assertMsgB; assertMsgB << "Failed to authenticate, no good nodes in " << _getMonitor()->getName(); uasserted( ErrorCodes::NodeNotFound, assertMsgB.str() ); } else { uasserted( lastNodeStatus.code(), lastNodeStatus.reason() ); } }