/**
 * Returns true if request is a query for sharded indexes.
 */
static bool doShardedIndexQuery(OperationContext* txn, Request& r, const QuerySpec& qSpec) {
    // Extract the ns field from the query, which may be embedded within the "query" or
    // "$query" field.
    auto nsField = qSpec.filter()["ns"];
    if (nsField.eoo()) {
        return false;
    }

    const NamespaceString indexNSSQuery(nsField.str());

    auto status = grid.catalogCache()->getDatabase(txn, indexNSSQuery.db().toString());
    if (!status.isOK()) {
        return false;
    }

    shared_ptr<DBConfig> config = status.getValue();
    if (!config->isSharded(indexNSSQuery.ns())) {
        return false;
    }

    // If you are querying on system.indexes, we need to make sure we go to a shard
    // that actually has chunks. This is not a perfect solution (what if you just
    // look at all indexes), but better than doing nothing.
    ShardPtr shard;
    ChunkManagerPtr cm;
    config->getChunkManagerOrPrimary(indexNSSQuery.ns(), cm, shard);
    if (cm) {
        set<ShardId> shardIds;
        cm->getAllShardIds(&shardIds);
        verify(shardIds.size() > 0);
        shard = grid.shardRegistry()->getShard(*shardIds.begin());
    }

    ShardConnection dbcon(shard->getConnString(), r.getns());
    DBClientBase& c = dbcon.conn();

    string actualServer;

    Message response;
    bool ok = c.call(r.m(), response, true, &actualServer);
    uassert(10200, "mongos: error calling db", ok);

    {
        QueryResult::View qr = response.singleData().view2ptr();
        if (qr.getResultFlags() & ResultFlag_ShardConfigStale) {
            dbcon.done();

            // Version is zero b/c this is deprecated codepath
            throw RecvStaleConfigException(r.getns(),
                                           "Strategy::doQuery",
                                           ChunkVersion(0, 0, OID()),
                                           ChunkVersion(0, 0, OID()));
        }
    }

    r.reply(response, actualServer.size() ? actualServer : c.getServerAddress());
    dbcon.done();

    return true;
}
/**
 * Returns true if request is a query for sharded indexes.
 */
static bool doShardedIndexQuery( Request& r, const QuerySpec& qSpec ) {
    // Extract the ns field from the query, which may be embedded within the "query" or
    // "$query" field.
    string indexNSQuery(qSpec.filter()["ns"].str());
    DBConfigPtr config = grid.getDBConfig( r.getns() );

    if ( !config->isSharded( indexNSQuery )) {
        return false;
    }

    // if you are querying on system.indexes, we need to make sure we go to a shard
    // that actually has chunks. This is not a perfect solution (what if you just
    // look at all indexes), but better than doing nothing.
    ShardPtr shard;
    ChunkManagerPtr cm;
    config->getChunkManagerOrPrimary( indexNSQuery, cm, shard );
    if ( cm ) {
        set<Shard> shards;
        cm->getAllShards( shards );
        verify( shards.size() > 0 );
        shard.reset( new Shard( *shards.begin() ) );
    }

    ShardConnection dbcon( *shard , r.getns() );
    DBClientBase& c = dbcon.conn();

    string actualServer;

    Message response;
    bool ok = c.call( r.m(), response, true , &actualServer );
    uassert( 10200 , "mongos: error calling db", ok );

    {
        QueryResult* qr = (QueryResult*) response.singleData();
        if ( qr->resultFlags() & ResultFlag_ShardConfigStale ) {
            dbcon.done();

            // Version is zero b/c this is deprecated codepath
            throw RecvStaleConfigException( r.getns(),
                                            "Strategy::doQuery",
                                            ChunkVersion( 0, 0, OID() ),
                                            ChunkVersion( 0, 0, OID() ));
        }
    }

    r.reply( response , actualServer.size() ? actualServer : c.getServerAddress() );
    dbcon.done();

    return true;
}
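// Both versions of doShardedIndexQuery above release the shard connection and throw a
// stale-config exception when the reply carries ResultFlag_ShardConfigStale, so that a caller
// higher up can refresh its routing table and retry. The following is a minimal, self-contained
// sketch of that flag-then-throw pattern; ReplyFlags, StaleRoutingInfo, and checkReplyFlags are
// hypothetical names used only for illustration, not part of the code above.
#include <cstdint>
#include <stdexcept>

namespace example {

enum ReplyFlags : std::uint32_t {
    kShardConfigStale = 1u << 2,  // the shard noticed our routing metadata is out of date
};

struct StaleRoutingInfo : std::runtime_error {
    explicit StaleRoutingInfo(const char* what) : std::runtime_error(what) {}
};

// Inspect the flag word of a reply; throw so the caller can refresh metadata and retry.
inline void checkReplyFlags(std::uint32_t flags) {
    if (flags & kShardConfigStale) {
        throw StaleRoutingInfo("shard reported stale config; refresh routing info and retry");
    }
}

}  // namespace example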
void remove( const string& name ) {
    scoped_lock lk( _mutex );

    for ( ShardMap::iterator i = _lookup.begin(); i != _lookup.end(); ) {
        ShardPtr s = i->second;
        if ( s->getName() == name ) {
            _lookup.erase(i++);
        }
        else {
            ++i;
        }
    }

    for ( ShardMap::iterator i = _rsLookup.begin(); i != _rsLookup.end(); ) {
        ShardPtr s = i->second;
        if ( s->getName() == name ) {
            _rsLookup.erase(i++);
        }
        else {
            ++i;
        }
    }
}
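// remove() relies on the classic pre-C++11 idiom map.erase(it++), which advances the iterator
// before the erased node is destroyed. A minimal standalone sketch of the same "erase while
// iterating" idea follows; since C++11 the clearer form is it = map.erase(it), shown here.
// The names are illustrative only.
#include <map>
#include <memory>
#include <string>

namespace example {

struct Entry {
    std::string name;
};

using EntryMap = std::map<std::string, std::shared_ptr<Entry>>;

// Erase every element whose value matches 'name' while iterating the map safely.
inline void removeByName(EntryMap& m, const std::string& name) {
    for (auto it = m.begin(); it != m.end();) {
        if (it->second->name == name) {
            it = m.erase(it);  // erase() returns the next valid iterator
        } else {
            ++it;
        }
    }
}

}  // namespace example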
void _installHost( const string& host , const ShardPtr& s ) {
    _lookup[host] = s;

    const ConnectionString& cs = s->getAddress();
    if ( cs.type() == ConnectionString::SET ) {
        if ( cs.getSetName().size() ) {
            scoped_lock lk( _rsMutex );
            _rsLookup[ cs.getSetName() ] = s;
        }

        vector<HostAndPort> servers = cs.getServers();
        for ( unsigned i = 0; i < servers.size(); i++ ) {
            _lookup[ servers[i].toString() ] = s;
        }
    }
}
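// _installHost() registers one shared Shard object under several keys: the host string it was
// installed with, the replica set name, and every member host, so that any of those strings
// resolves to the same instance. A minimal sketch of that aliasing pattern, with hypothetical
// names, is shown below.
#include <map>
#include <memory>
#include <string>
#include <vector>

namespace example {

struct Target {
    std::string setName;
    std::vector<std::string> memberHosts;
};

using TargetPtr = std::shared_ptr<Target>;

// Make the same shared instance reachable by its set name and by every member host.
inline void installAliases(std::map<std::string, TargetPtr>& lookup, const TargetPtr& t) {
    if (!t->setName.empty()) {
        lookup[t->setName] = t;
    }
    for (const auto& host : t->memberHosts) {
        lookup[host] = t;  // all keys point at the same shared_ptr
    }
}

}  // namespace example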
bool run( const string& dbname,
          BSONObj& cmdObj,
          int,
          string& errmsg,
          BSONObjBuilder& result,
          bool ) {

    string ns;
    if ( !FieldParser::extract( cmdObj, nsField, &ns, &errmsg ) ) {
        return false;
    }

    if ( ns.size() == 0 ) {
        errmsg = "no namespace specified";
        return false;
    }

    vector<BSONObj> bounds;
    if ( !FieldParser::extract( cmdObj, boundsField, &bounds, &errmsg ) ) {
        return false;
    }

    if ( bounds.size() == 0 ) {
        errmsg = "no bounds were specified";
        return false;
    }

    if ( bounds.size() != 2 ) {
        errmsg = "only a min and max bound may be specified";
        return false;
    }

    BSONObj minKey = bounds[0];
    BSONObj maxKey = bounds[1];

    if ( minKey.isEmpty() ) {
        errmsg = "no min key specified";
        return false;
    }

    if ( maxKey.isEmpty() ) {
        errmsg = "no max key specified";
        return false;
    }

    ShardPtr mergeShard = guessMergeShard( NamespaceString( ns ), minKey );

    if ( !mergeShard ) {
        errmsg = (string)"could not find shard for merge range starting at "
                         + minKey.toString();
        return false;
    }

    BSONObjBuilder remoteCmdObjB;
    remoteCmdObjB.append( cmdObj[ MergeChunksPassCommand::nsField() ] );
    remoteCmdObjB.append( cmdObj[ MergeChunksPassCommand::boundsField() ] );
    remoteCmdObjB.append( MergeChunksPassCommand::configField(),
                          configServer.getPrimary().getAddress().toString() );
    remoteCmdObjB.append( MergeChunksPassCommand::shardNameField(),
                          mergeShard->getName() );

    BSONObj remoteResult;

    // Throws, but handled at level above. Don't want to rewrap to preserve exception
    // formatting.
    ScopedDbConnection conn( mergeShard->getAddress() );
    bool ok = conn->runCommand( "admin", remoteCmdObjB.obj(), remoteResult );
    conn.done();

    // Always refresh our chunks afterwards
    refreshChunkCache( NamespaceString( ns ) );

    result.appendElements( remoteResult );
    return ok;
}
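// The run() implementation above rejects the request early with a human-readable errmsg when
// the namespace or the [min, max] bounds are missing or malformed, and only then forwards the
// command to the owning shard. Below is a minimal, self-contained sketch of that
// validate-and-report pattern; Key and validateBounds are hypothetical stand-ins for the BSON
// types used above.
#include <string>
#include <vector>

namespace example {

using Key = std::string;  // stand-in for a BSON key document

// Return false and fill 'errmsg' if the bounds are not exactly a non-empty [min, max] pair.
inline bool validateBounds(const std::vector<Key>& bounds, std::string& errmsg) {
    if (bounds.empty()) {
        errmsg = "no bounds were specified";
        return false;
    }
    if (bounds.size() != 2) {
        errmsg = "only a min and max bound may be specified";
        return false;
    }
    if (bounds[0].empty()) {
        errmsg = "no min key specified";
        return false;
    }
    if (bounds[1].empty()) {
        errmsg = "no max key specified";
        return false;
    }
    return true;
}

}  // namespace example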
// Useful for ensuring our shard data will not be modified while we use it
Shard findCopy( const string& ident ) {
    ShardPtr found = find( ident );

    scoped_lock lk( _mutex );

    massert( 13128 , (string)"can't find shard for: " + ident , found.get() );
    return *found.get();
}
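// findCopy() returns the shard *by value* so the caller works on a private snapshot that cannot
// be mutated underneath it by another thread. A minimal sketch of that copy-under-lock pattern
// in standard C++ follows; Registry and Record are hypothetical names used for illustration.
#include <map>
#include <mutex>
#include <stdexcept>
#include <string>

namespace example {

struct Record {
    std::string name;
    std::string host;
};

class Registry {
public:
    // Return a copy taken while holding the lock; later updates cannot affect the caller.
    Record findCopy(const std::string& ident) const {
        std::lock_guard<std::mutex> lk(_mutex);
        auto it = _records.find(ident);
        if (it == _records.end()) {
            throw std::out_of_range("can't find record for: " + ident);
        }
        return it->second;  // copied out under the lock
    }

private:
    mutable std::mutex _mutex;
    std::map<std::string, Record> _records;
};

}  // namespace example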
void Strategy::queryOp(OperationContext* txn, Request& request) {
    verify(!NamespaceString(request.getns()).isCommand());

    Timer queryTimer;

    globalOpCounters.gotQuery();

    QueryMessage q(request.d());

    NamespaceString ns(q.ns);
    ClientBasic* client = txn->getClient();
    AuthorizationSession* authSession = AuthorizationSession::get(client);
    Status status = authSession->checkAuthForFind(ns, false);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions;

    if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd"))
        throw UserException(8010, "something is wrong, shouldn't see a command here");

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns +
                      " " + q.query.toString());
    }

    // Spigot which controls whether OP_QUERY style find on mongos uses the new
    // ClusterClientCursor code path.
    // TODO: Delete the spigot and always use the new code.
    if (useClusterClientCursor) {
        // Determine the default read preference mode based on the value of the slaveOk flag.
        ReadPreference readPreferenceOption = (q.queryOptions & QueryOption_SlaveOk)
            ? ReadPreference::SecondaryPreferred
            : ReadPreference::PrimaryOnly;
        ReadPreferenceSetting readPreference(readPreferenceOption, TagSet());

        BSONElement rpElem;
        auto readPrefExtractStatus = bsonExtractTypedField(
            q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem);

        if (readPrefExtractStatus.isOK()) {
            auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj());
            uassertStatusOK(parsedRps.getStatus());
            readPreference = parsedRps.getValue();
        } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) {
            uassertStatusOK(readPrefExtractStatus);
        }

        auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop());
        uassertStatusOK(canonicalQuery.getStatus());

        // If the $explain flag was set, we must run the operation on the shards as an explain
        // command rather than a find command.
        if (canonicalQuery.getValue()->getParsed().isExplain()) {
            const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed();
            BSONObj findCommand = lpq.asFindCommand();

            // We default to allPlansExecution verbosity.
            auto verbosity = ExplainCommon::EXEC_ALL_PLANS;

            const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly);
            rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference);

            BSONObjBuilder explainBuilder;
            uassertStatusOK(ClusterFind::runExplain(
                txn, findCommand, lpq, verbosity, metadata, &explainBuilder));

            BSONObj explainObj = explainBuilder.done();
            replyToQuery(0,  // query result flags
                         request.p(),
                         request.m(),
                         static_cast<const void*>(explainObj.objdata()),
                         explainObj.objsize(),
                         1,  // numResults
                         0,  // startingFrom
                         CursorId(0));
            return;
        }

        // Do the work to generate the first batch of results. This blocks waiting to get
        // responses from the shard(s).
        std::vector<BSONObj> batch;

        // A cursor id of 0 means the cursor is exhausted; otherwise we assume that a cursor
        // with the returned id can be retrieved via the ClusterCursorManager.
        auto cursorId =
            ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch);
        uassertStatusOK(cursorId.getStatus());

        // TODO: this constant should be shared between mongos and mongod, and should
        // not be inside ShardedClientCursor.
        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);

        // Fill out the response buffer.
        int numResults = 0;
        for (const auto& obj : batch) {
            buffer.appendBuf((void*)obj.objdata(), obj.objsize());
            numResults++;
        }

        replyToQuery(0,  // query result flags
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     numResults,
                     0,  // startingFrom
                     cursorId.getValue());
        return;
    }

    QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions);

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query);
    uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK());

    if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) {
        return;
    }

    ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo());
    verify(cursor);

    // TODO: Move out to Request itself, not strategy based
    try {
        cursor->init(txn);

        if (qSpec.isExplain()) {
            BSONObjBuilder explain_builder;
            cursor->explain(explain_builder);
            explain_builder.appendNumber("executionTimeMillis",
                                         static_cast<long long>(queryTimer.millis()));
            BSONObj b = explain_builder.obj();

            replyToQuery(0, request.p(), request.m(), b);
            delete (cursor);
            return;
        }
    } catch (...) {
        delete cursor;
        throw;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache
    if (cursor->getNumQueryShards() != 1) {
        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.
        ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor));

        BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE);
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if (hasMore) {
            LOG(5) << "storing cursor : " << cc->getId();

            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if (maxTimeMS.getValue() == 0) {  // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            } else if (cursorLeftoverMillis <= 0) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store(cc, cursorLeftoverMillis);
        }

        replyToQuery(0,
                     request.p(),
                     request.m(),
                     buffer.buf(),
                     buffer.len(),
                     docCount,
                     startFrom,
                     hasMore ? cc->getId() : 0);
    } else {
        // Only one shard is used

        // Remote cursors are stored remotely, we shouldn't need this around.
        unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor);

        ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId());
        verify(shard.get());
        DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId());

        // Implicitly stores the cursor in the cache
        request.reply(*(shardCursor->getMessage()), shardCursor->originalHost());

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
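// The sharded-cursor branch above converts the user's maxTimeMS budget into the time left for a
// stored cursor: 0 is a sentinel for "no limit", and a budget that has already been spent maps
// to an "expired" sentinel. A minimal standalone sketch of that computation follows; the
// constant names below are hypothetical stand-ins for kMaxTimeCursorNoTimeLimit and
// kMaxTimeCursorTimeLimitExpired.
namespace example {

constexpr int kNoTimeLimit = -1;      // cursor never times out
constexpr int kTimeLimitExpired = 0;  // cursor is already out of time

// Compute how much of the maxTimeMS budget is left after 'elapsedMillis' of work.
inline int cursorTimeLeftMillis(int maxTimeMillis, int elapsedMillis) {
    if (maxTimeMillis == 0) {  // 0 represents "no limit"
        return kNoTimeLimit;
    }
    const int leftover = maxTimeMillis - elapsedMillis;
    return leftover > 0 ? leftover : kTimeLimitExpired;
}

}  // namespace example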
/**
 * Updates the remote cached version on the remote shard host (primary, in the case of replica
 * sets) if needed with a fully-qualified shard version for the given namespace:
 *   config server(s) + shard name + shard version
 *
 * If no remote cached version has ever been set, an initial shard version is sent.
 *
 * If the namespace is empty and no version has ever been sent, the config server + shard name
 * is sent to the remote shard host to initialize the connection as coming from mongos.
 * NOTE: This initialization is *best-effort only*. Operations which wish to correctly version
 * must send the namespace.
 *
 * Config servers are special and are not (unless otherwise a shard) kept up to date with this
 * protocol. This is safe so long as config servers only contain unversioned collections.
 *
 * It is an error to call checkShardVersion with an unversionable connection (isVersionableCB).
 *
 * @return true if we contacted the remote host
 */
bool checkShardVersion(DBClientBase* conn_in,
                       const string& ns,
                       ChunkManagerPtr refManager,
                       bool authoritative,
                       int tryNumber) {
    // TODO: cache, optimize, etc...

    // Empty namespaces are special - we require initialization but not versioning
    if (ns.size() == 0) {
        return initShardVersionEmptyNS(conn_in);
    }

    auto status = grid.catalogCache()->getDatabase(nsToDatabase(ns));
    if (!status.isOK()) {
        return false;
    }

    shared_ptr<DBConfig> conf = status.getValue();

    DBClientBase* conn = getVersionable(conn_in);
    verify(conn);  // errors thrown above

    unsigned long long officialSequenceNumber = 0;

    ShardPtr primary;
    ChunkManagerPtr manager;
    if (authoritative)
        conf->getChunkManagerIfExists(ns, true);

    conf->getChunkManagerOrPrimary(ns, manager, primary);

    if (manager) {
        officialSequenceNumber = manager->getSequenceNumber();
    }

    const auto shard = grid.shardRegistry()->getShard(conn->getServerAddress());
    uassert(ErrorCodes::ShardNotFound,
            str::stream() << conn->getServerAddress() << " is not recognized as a shard",
            shard);

    // Check this manager against the reference manager
    if (manager) {
        if (refManager && !refManager->compatibleWith(*manager, shard->getId())) {
            const ChunkVersion refVersion(refManager->getVersion(shard->getId()));
            const ChunkVersion currentVersion(manager->getVersion(shard->getId()));

            string msg(str::stream()
                       << "manager (" << currentVersion.toString() << " : "
                       << manager->getSequenceNumber() << ") "
                       << "not compatible with reference manager (" << refVersion.toString()
                       << " : " << refManager->getSequenceNumber() << ") "
                       << "on shard " << shard->getId() << " ("
                       << shard->getConnString().toString() << ")");

            throw SendStaleConfigException(ns, msg, refVersion, currentVersion);
        }
    } else if (refManager) {
        string msg(str::stream() << "not sharded ("
                                 << ((manager.get() == 0) ? string("<none>")
                                                          : str::stream()
                                             << manager->getSequenceNumber())
                                 << ") but has reference manager ("
                                 << refManager->getSequenceNumber() << ") "
                                 << "on conn " << conn->getServerAddress() << " ("
                                 << conn_in->getServerAddress() << ")");

        throw SendStaleConfigException(
            ns, msg, refManager->getVersion(shard->getId()), ChunkVersion::UNSHARDED());
    }

    // Do not send setShardVersion to collections on the config servers - this causes problems
    // when config servers are also shards and get SSV with conflicting names.
    // TODO: Make config servers regular shards
    if (primary && primary->getId() == "config") {
        return false;
    }

    // Has the ChunkManager been reloaded since the last time we updated the shard version over
    // this connection? If we've never updated the shard version, do so now.
    unsigned long long sequenceNumber = 0;
    if (connectionShardStatus.getSequence(conn, ns, &sequenceNumber)) {
        if (sequenceNumber == officialSequenceNumber) {
            return false;
        }
    }

    ChunkVersion version = ChunkVersion(0, 0, OID());
    if (manager) {
        version = manager->getVersion(shard->getId());
    }

    LOG(1) << "setting shard version of " << version << " for " << ns << " on shard "
           << shard->toString();

    LOG(3) << "last version sent with chunk manager iteration " << sequenceNumber
           << ", current chunk manager iteration is " << officialSequenceNumber;

    BSONObj result;
    if (setShardVersion(*conn,
                        ns,
                        grid.catalogManager()->connectionString().toString(),
                        version,
                        manager.get(),
                        authoritative,
                        result)) {
        LOG(1) << " setShardVersion success: " << result;
        connectionShardStatus.setSequence(conn, ns, officialSequenceNumber);
        return true;
    }

    LOG(1) << " setShardVersion failed!\n" << result << endl;

    if (result["need_authoritative"].trueValue())
        massert(10428, "need_authoritative set but in authoritative mode already", !authoritative);

    if (!authoritative) {
        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, 1, tryNumber + 1);
        return true;
    }

    if (result["reloadConfig"].trueValue()) {
        if (result["version"].timestampTime() == Date_t()) {
            warning() << "reloading full configuration for " << conf->name()
                      << ", connection state indicates significant version changes";

            // reload db
            conf->reload();
        } else {
            // reload config
            conf->getChunkManager(ns, true);
        }
    }

    const int maxNumTries = 7;
    if (tryNumber < maxNumTries) {
        LOG(tryNumber < (maxNumTries / 2) ? 1 : 0)
            << "going to retry checkShardVersion shard: " << shard->toString() << " " << result;

        sleepmillis(10 * tryNumber);

        // use the original connection and get a fresh versionable connection
        // since conn can be invalidated (or worse, freed) after the failure
        checkShardVersion(conn_in, ns, refManager, true, tryNumber + 1);
        return true;
    }

    string errmsg = str::stream() << "setShardVersion failed shard: " << shard->toString()
                                  << " " << result;
    log() << " " << errmsg << endl;
    massert(10429, errmsg, 0);
    return true;
}
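// When setShardVersion fails, checkShardVersion() above retries itself up to maxNumTries = 7
// times, sleeping 10 ms * tryNumber between attempts (a simple linear backoff). The following
// standalone sketch shows the same bounded retry-with-backoff shape in standard C++; the names
// and the attempt() callback are illustrative only, not part of the function above.
#include <chrono>
#include <functional>
#include <thread>

namespace example {

// Run 'attempt' until it succeeds or 'maxTries' is reached, backing off linearly in between.
inline bool retryWithBackoff(const std::function<bool()>& attempt, int maxTries = 7) {
    for (int tryNumber = 1; tryNumber <= maxTries; ++tryNumber) {
        if (attempt()) {
            return true;
        }
        // Linear backoff: wait a little longer after every failed attempt.
        std::this_thread::sleep_for(std::chrono::milliseconds(10 * tryNumber));
    }
    return false;  // out of retries; the caller decides whether to fail hard
}

}  // namespace example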
void Strategy::queryOp( Request& r ) {
    verify( !NamespaceString( r.getns() ).isCommand() );

    Timer queryTimer;

    QueryMessage q( r.d() );

    NamespaceString ns(q.ns);
    ClientBasic* client = ClientBasic::getCurrent();
    AuthorizationSession* authSession = client->getAuthorizationSession();
    Status status = authSession->checkAuthForQuery(ns, q.query);
    audit::logQueryAuthzCheck(client, ns, q.query, status.code());
    uassertStatusOK(status);

    LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn
           << " options: " << q.queryOptions << endl;

    if ( q.ntoreturn == 1 && strstr(q.ns, ".$cmd") )
        throw UserException( 8010 , "something is wrong, shouldn't see a command here" );

    if (q.queryOptions & QueryOption_Exhaust) {
        uasserted(18526,
                  string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns
                  + " " + q.query.toString());
    }

    QuerySpec qSpec( (string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions );

    // Parse "$maxTimeMS".
    StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery( q.query );
    uassert( 17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK() );

    if ( _isSystemIndexes( q.ns ) && doShardedIndexQuery( r, qSpec )) {
        return;
    }

    ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor( qSpec, CommandInfo() );
    verify( cursor );

    // TODO: Move out to Request itself, not strategy based
    try {
        cursor->init();

        if ( qSpec.isExplain() ) {
            BSONObjBuilder explain_builder;
            cursor->explain( explain_builder );
            explain_builder.appendNumber( "executionTimeMillis",
                                          static_cast<long long>(queryTimer.millis()) );
            BSONObj b = explain_builder.obj();

            replyToQuery( 0 , r.p() , r.m() , b );
            delete( cursor );
            return;
        }
    }
    catch(...) {
        delete cursor;
        throw;
    }

    // TODO: Revisit all of this when we revisit the sharded cursor cache
    if (cursor->getNumQueryShards() != 1) {

        // More than one shard (or zero), manage with a ShardedClientCursor
        // NOTE: We may also have *zero* shards here when the returnPartial flag is set.
        // Currently the code in ShardedClientCursor handles this.

        ShardedClientCursorPtr cc (new ShardedClientCursor( q , cursor ));

        BufBuilder buffer( ShardedClientCursor::INIT_REPLY_BUFFER_SIZE );
        int docCount = 0;
        const int startFrom = cc->getTotalSent();
        bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount);

        if ( hasMore ) {
            LOG(5) << "storing cursor : " << cc->getId() << endl;

            int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis();
            if ( maxTimeMS.getValue() == 0 ) { // 0 represents "no limit".
                cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit;
            }
            else if ( cursorLeftoverMillis <= 0 ) {
                cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired;
            }

            cursorCache.store( cc, cursorLeftoverMillis );
        }

        replyToQuery( 0, r.p(), r.m(), buffer.buf(), buffer.len(),
                      docCount, startFrom, hasMore ? cc->getId() : 0 );
    }
    else {
        // Only one shard is used

        // Remote cursors are stored remotely, we shouldn't need this around.
        scoped_ptr<ParallelSortClusteredCursor> cursorDeleter( cursor );

        ShardPtr shard = cursor->getQueryShard();
        verify( shard.get() );
        DBClientCursorPtr shardCursor = cursor->getShardCursor(*shard);

        // Implicitly stores the cursor in the cache
        r.reply( *(shardCursor->getMessage()) , shardCursor->originalHost() );

        // We don't want to kill the cursor remotely if there's still data left
        shardCursor->decouple();
    }
}
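// Both versions of queryOp() allocate the ParallelSortClusteredCursor with a raw new, guard the
// init/explain phase with a manual try { ... } catch (...) { delete cursor; throw; }, and only
// hand the pointer to a smart pointer later, in the single-shard branch. Below is a minimal
// sketch of the alternative: owning the object with a unique_ptr from the start so no manual
// delete is needed on any path. ClusteredCursor and runQuery are hypothetical names used only
// for illustration.
#include <memory>

namespace example {

struct ClusteredCursor {
    void init() { /* may throw, e.g. while establishing remote cursors */ }
    int shardCount() const { return 1; }
};

inline void runQuery() {
    // Ownership is established immediately; any exception below destroys the cursor for us.
    auto cursor = std::make_unique<ClusteredCursor>();
    cursor->init();

    if (cursor->shardCount() != 1) {
        // ... hand ownership on with std::move(cursor) if another object must keep it alive
    } else {
        // ... use cursor.get() for the single-shard passthrough; it is freed when scope ends
    }
}

}  // namespace example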