void replyToQuery( int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj ) { Message *resp = new Message(); replyToQuery( queryResultFlags, *resp, obj ); dbresponse.response = resp; dbresponse.responseTo = m.header()->id; }
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); curop.debug().ns = ns; curop.debug().ntoreturn = ntoreturn; curop.debug().cursorid = cursorid; shared_ptr<AssertionException> ex; scoped_ptr<Timer> timer; int pass = 0; bool exhaust = false; QueryResult* msgdata = 0; OpTime last; while( 1 ) { try { const NamespaceString nsString( ns ); uassert( 16258, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid() ); if (str::startsWith(ns, "local.oplog.")){ if (pass == 0) { mutex::scoped_lock lk(OpTime::m); last = OpTime::getLast(lk); } else { last.waitForDifferent(1000/*ms*/); } } Client::ReadContext ctx(ns); // call this readlocked so state can't change replVerifyReadsOk(); msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust); } catch ( AssertionException& e ) { ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; break; } if (msgdata == 0) { // this should only happen with QueryOption_AwaitData exhaust = false; massert(13073, "shutting down", !inShutdown() ); if ( ! timer ) { timer.reset( new Timer() ); } else { if ( timer->seconds() >= 4 ) { // after about 4 seconds, return. pass stops at 1000 normally. // we want to return occasionally so slave can checkpoint. pass = 10000; } } pass++; if (debug) sleepmillis(20); else sleepmillis(2); // note: the 1100 is beacuse of the waitForDifferent above // should eventually clean this up a bit curop.setExpectedLatencyMs( 1100 + timer->millis() ); continue; } break; }; if (ex) { exhaust = false; BSONObjBuilder err; ex->getInfo().append( err ); BSONObj errObj = err.done(); log() << errObj << endl; curop.debug().exceptionInfo = ex->getInfo(); if (ex->getCode() == 13436) { replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj); curop.debug().responseLength = dbresponse.response->header()->dataLen(); curop.debug().nreturned = 1; return ok; } msgdata = emptyMoreResult(cursorid); } Message *resp = new Message(); resp->setData(msgdata, true); curop.debug().responseLength = resp->header()->dataLen(); curop.debug().nreturned = msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.header()->id; if( exhaust ) { curop.debug().exhaust = true; dbresponse.exhaust = ns; } return ok; }
bool handleSpecialNamespaces( Request& r , QueryMessage& q ) { const char * ns = r.getns(); ns = strstr( r.getns() , ".$cmd.sys." ); if ( ! ns ) return false; ns += 10; BSONObjBuilder b; vector<Shard> shards; if ( strcmp( ns , "inprog" ) == 0 ) { Shard::getAllShards( shards ); BSONArrayBuilder arr( b.subarrayStart( "inprog" ) ); for ( unsigned i=0; i<shards.size(); i++ ) { Shard shard = shards[i]; ScopedDbConnection conn( shard ); BSONObj temp = conn->findOne( r.getns() , BSONObj() ); if ( temp["inprog"].isABSONObj() ) { BSONObjIterator i( temp["inprog"].Obj() ); while ( i.more() ) { BSONObjBuilder x; BSONObjIterator j( i.next().Obj() ); while( j.more() ) { BSONElement e = j.next(); if ( str::equals( e.fieldName() , "opid" ) ) { stringstream ss; ss << shard.getName() << ':' << e.numberInt(); x.append( "opid" , ss.str() ); } else if ( str::equals( e.fieldName() , "client" ) ) { x.appendAs( e , "client_s" ); } else { x.append( e ); } } arr.append( x.obj() ); } } conn.done(); } arr.done(); } else if ( strcmp( ns , "killop" ) == 0 ) { BSONElement e = q.query["op"]; if ( strstr( r.getns() , "admin." ) != 0 ) { b.append( "err" , "unauthorized" ); } else if ( e.type() != String ) { b.append( "err" , "bad op" ); b.append( e ); } else { b.append( e ); string s = e.String(); string::size_type i = s.find( ':' ); if ( i == string::npos ) { b.append( "err" , "bad opid" ); } else { string shard = s.substr( 0 , i ); int opid = atoi( s.substr( i + 1 ).c_str() ); b.append( "shard" , shard ); b.append( "shardid" , opid ); log() << "want to kill op: " << e << endl; Shard s(shard); ScopedDbConnection conn( s ); conn->findOne( r.getns() , BSON( "op" << opid ) ); conn.done(); } } } else if ( strcmp( ns , "unlock" ) == 0 ) { b.append( "err" , "can't do unlock through mongos" ); } else { log( LL_WARNING ) << "unknown sys command [" << ns << "]" << endl; return false; } BSONObj x = b.done(); replyToQuery(0, r.p(), r.m(), x); return true; }
bool handleSpecialNamespaces( Request& r , QueryMessage& q ) { const char * ns = strstr( r.getns() , ".$cmd.sys." ); if ( ! ns ) return false; ns += 10; BSONObjBuilder b; vector<Shard> shards; ClientBasic* client = ClientBasic::getCurrent(); AuthorizationSession* authSession = client->getAuthorizationSession(); if ( strcmp( ns , "inprog" ) == 0 ) { const bool isAuthorized = authSession->isAuthorizedForActionsOnResource( ResourcePattern::forClusterResource(), ActionType::inprog); audit::logInProgAuthzCheck( client, q.query, isAuthorized ? ErrorCodes::OK : ErrorCodes::Unauthorized); uassert(ErrorCodes::Unauthorized, "not authorized to run inprog", isAuthorized); Shard::getAllShards( shards ); BSONArrayBuilder arr( b.subarrayStart( "inprog" ) ); for ( unsigned i=0; i<shards.size(); i++ ) { Shard shard = shards[i]; ScopedDbConnection conn(shard.getConnString()); BSONObj temp = conn->findOne( r.getns() , q.query ); if ( temp["inprog"].isABSONObj() ) { BSONObjIterator i( temp["inprog"].Obj() ); while ( i.more() ) { BSONObjBuilder x; BSONObjIterator j( i.next().Obj() ); while( j.more() ) { BSONElement e = j.next(); if ( str::equals( e.fieldName() , "opid" ) ) { stringstream ss; ss << shard.getName() << ':' << e.numberInt(); x.append( "opid" , ss.str() ); } else if ( str::equals( e.fieldName() , "client" ) ) { x.appendAs( e , "client_s" ); } else { x.append( e ); } } arr.append( x.obj() ); } } conn.done(); } arr.done(); } else if ( strcmp( ns , "killop" ) == 0 ) { const bool isAuthorized = authSession->isAuthorizedForActionsOnResource( ResourcePattern::forClusterResource(), ActionType::killop); audit::logKillOpAuthzCheck( client, q.query, isAuthorized ? ErrorCodes::OK : ErrorCodes::Unauthorized); uassert(ErrorCodes::Unauthorized, "not authorized to run killop", isAuthorized); BSONElement e = q.query["op"]; if ( e.type() != String ) { b.append( "err" , "bad op" ); b.append( e ); } else { b.append( e ); string s = e.String(); string::size_type i = s.find( ':' ); if ( i == string::npos ) { b.append( "err" , "bad opid" ); } else { string shard = s.substr( 0 , i ); int opid = atoi( s.substr( i + 1 ).c_str() ); b.append( "shard" , shard ); b.append( "shardid" , opid ); log() << "want to kill op: " << e << endl; Shard s(shard); ScopedDbConnection conn(s.getConnString()); conn->findOne( r.getns() , BSON( "op" << opid ) ); conn.done(); } } } else if ( strcmp( ns , "unlock" ) == 0 ) { b.append( "err" , "can't do unlock through mongos" ); } else { warning() << "unknown sys command [" << ns << "]" << endl; return false; } BSONObj x = b.done(); replyToQuery(0, r.p(), r.m(), x); return true; }
virtual void queryOp( Request& r ) { QueryMessage q( r.d() ); LOG(3) << "single query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options : " << q.queryOptions << endl; NamespaceString nss( r.getns() ); // Regular queries are handled in strategy_shard.cpp verify( nss.isCommand() || nss.isSpecialCommand() ); if ( handleSpecialNamespaces( r , q ) ) return; int loops = 5; while ( true ) { BSONObjBuilder builder; try { BSONObj cmdObj = q.query; { BSONElement e = cmdObj.firstElement(); if (e.type() == Object && (e.fieldName()[0] == '$' ? str::equals("query", e.fieldName()+1) : str::equals("query", e.fieldName()))) { // Extract the embedded query object. if (cmdObj.hasField(Query::ReadPrefField.name())) { // The command has a read preference setting. We don't want // to lose this information so we copy this to a new field // called $queryOptions.$readPreference BSONObjBuilder finalCmdObjBuilder; finalCmdObjBuilder.appendElements(e.embeddedObject()); BSONObjBuilder queryOptionsBuilder( finalCmdObjBuilder.subobjStart("$queryOptions")); queryOptionsBuilder.append(cmdObj[Query::ReadPrefField.name()]); queryOptionsBuilder.done(); cmdObj = finalCmdObjBuilder.obj(); } else { cmdObj = e.embeddedObject(); } } } Command::runAgainstRegistered(q.ns, cmdObj, builder, q.queryOptions); BSONObj x = builder.done(); replyToQuery(0, r.p(), r.m(), x); return; } catch ( StaleConfigException& e ) { if ( loops <= 0 ) throw e; loops--; log() << "retrying command: " << q.query << endl; // For legacy reasons, ns may not actually be set in the exception :-( string staleNS = e.getns(); if( staleNS.size() == 0 ) staleNS = q.ns; ShardConnection::checkMyConnectionVersions( staleNS ); if( loops < 4 ) versionManager.forceRemoteCheckShardVersionCB( staleNS ); } catch ( AssertionException& e ) { Command::appendCommandStatus(builder, e.toStatus()); BSONObj x = builder.done(); replyToQuery(0, r.p(), r.m(), x); return; } } }
void Strategy::clientCommandOp(OperationContext* txn, Request& request) { QueryMessage q(request.d()); LOG(3) << "command: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options: " << q.queryOptions; if (q.queryOptions & QueryOption_Exhaust) { uasserted(18527, string("the 'exhaust' query option is invalid for mongos commands: ") + q.ns + " " + q.query.toString()); } NamespaceString nss(request.getns()); // Regular queries are handled in strategy_shard.cpp verify(nss.isCommand() || nss.isSpecialCommand()); if (handleSpecialNamespaces(txn, request, q)) return; int loops = 5; bool cmChangeAttempted = false; while (true) { BSONObjBuilder builder; try { BSONObj cmdObj = q.query; { BSONElement e = cmdObj.firstElement(); if (e.type() == Object && (e.fieldName()[0] == '$' ? str::equals("query", e.fieldName() + 1) : str::equals("query", e.fieldName()))) { // Extract the embedded query object. if (cmdObj.hasField(Query::ReadPrefField.name())) { // The command has a read preference setting. We don't want // to lose this information so we copy this to a new field // called $queryOptions.$readPreference BSONObjBuilder finalCmdObjBuilder; finalCmdObjBuilder.appendElements(e.embeddedObject()); BSONObjBuilder queryOptionsBuilder( finalCmdObjBuilder.subobjStart("$queryOptions")); queryOptionsBuilder.append(cmdObj[Query::ReadPrefField.name()]); queryOptionsBuilder.done(); cmdObj = finalCmdObjBuilder.obj(); } else { cmdObj = e.embeddedObject(); } } } Command::runAgainstRegistered(txn, q.ns, cmdObj, builder, q.queryOptions); BSONObj x = builder.done(); replyToQuery(0, request.p(), request.m(), x); return; } catch (const StaleConfigException& e) { if (loops <= 0) throw e; loops--; log() << "retrying command: " << q.query; // For legacy reasons, ns may not actually be set in the exception :-( string staleNS = e.getns(); if (staleNS.size() == 0) staleNS = q.ns; ShardConnection::checkMyConnectionVersions(txn, staleNS); if (loops < 4) versionManager.forceRemoteCheckShardVersionCB(txn, staleNS); } catch (const DBException& e) { if (e.getCode() == ErrorCodes::IncompatibleCatalogManager) { fassert(28791, !cmChangeAttempted); cmChangeAttempted = true; grid.forwardingCatalogManager()->waitForCatalogManagerChange(txn); } else { Command::appendCommandStatus(builder, e.toStatus()); BSONObj x = builder.done(); replyToQuery(0, request.p(), request.m(), x); return; } } } }
void Strategy::getMore(OperationContext* txn, Request& request) { Timer getMoreTimer; const char* ns = request.getns(); const int ntoreturn = request.d().pullInt(); const long long id = request.d().pullInt64(); // TODO: Handle stale config exceptions here from coll being dropped or sharded during op // for now has same semantics as legacy request const NamespaceString nss(ns); auto statusGetDb = grid.catalogCache()->getDatabase(txn, nss.db().toString()); if (statusGetDb == ErrorCodes::DatabaseNotFound) { cursorCache.remove(id); replyToQuery(ResultFlag_CursorNotFound, request.p(), request.m(), 0, 0, 0); return; } uassertStatusOK(statusGetDb); // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor // code path. // // TODO: Delete the spigot and always use the new code. if (useClusterClientCursor) { boost::optional<long long> batchSize; if (ntoreturn) { batchSize = ntoreturn; } GetMoreRequest getMoreRequest(NamespaceString(ns), id, batchSize, boost::none); auto cursorResponse = ClusterFind::runGetMore(txn, getMoreRequest); if (cursorResponse == ErrorCodes::CursorNotFound) { replyToQuery(ResultFlag_CursorNotFound, request.p(), request.m(), 0, 0, 0); return; } uassertStatusOK(cursorResponse.getStatus()); // Build the response document. // // TODO: this constant should be shared between mongos and mongod, and should not be inside // ShardedClientCursor. BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int numResults = 0; for (const auto& obj : cursorResponse.getValue().batch) { buffer.appendBuf((void*)obj.objdata(), obj.objsize()); ++numResults; } replyToQuery(0, request.p(), request.m(), buffer.buf(), buffer.len(), numResults, cursorResponse.getValue().numReturnedSoFar.value_or(0), cursorResponse.getValue().cursorId); return; } shared_ptr<DBConfig> config = statusGetDb.getValue(); ShardPtr primary; ChunkManagerPtr info; config->getChunkManagerOrPrimary(txn, ns, info, primary); // // TODO: Cleanup cursor cache, consolidate into single codepath // const string host = cursorCache.getRef(id); ShardedClientCursorPtr cursor = cursorCache.get(id); int cursorMaxTimeMS = cursorCache.getMaxTimeMS(id); // Cursor ids should not overlap between sharded and unsharded cursors massert(17012, str::stream() << "duplicate sharded and unsharded cursor id " << id << " detected for " << ns << ", duplicated on host " << host, NULL == cursorCache.get(id).get() || host.empty()); ClientBasic* client = ClientBasic::getCurrent(); NamespaceString nsString(ns); AuthorizationSession* authSession = AuthorizationSession::get(client); Status status = authSession->checkAuthForGetMore(nsString, id, false); audit::logGetMoreAuthzCheck(client, nsString, id, status.code()); uassertStatusOK(status); if (!host.empty()) { LOG(3) << "single getmore: " << ns; // we used ScopedDbConnection because we don't get about config versions // not deleting data is handled elsewhere // and we don't want to call setShardVersion ScopedDbConnection conn(host); Message response; bool ok = conn->callRead(request.m(), response); uassert(10204, "dbgrid: getmore: error calling db", ok); bool hasMore = (response.singleData().getCursor() != 0); if (!hasMore) { cursorCache.removeRef(id); } request.reply(response, "" /*conn->getServerAddress() */); conn.done(); return; } else if (cursor) { if (cursorMaxTimeMS == kMaxTimeCursorTimeLimitExpired) { cursorCache.remove(id); uasserted(ErrorCodes::ExceededTimeLimit, "operation exceeded time limit"); } // TODO: Try to match logic of mongod, where on subsequent getMore() we pull lots more data? BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int docCount = 0; const int startFrom = cursor->getTotalSent(); bool hasMore = cursor->sendNextBatch(ntoreturn, buffer, docCount); if (hasMore) { // still more data cursor->accessed(); if (cursorMaxTimeMS != kMaxTimeCursorNoTimeLimit) { // Update remaining amount of time in cursor cache. int cursorLeftoverMillis = cursorMaxTimeMS - getMoreTimer.millis(); if (cursorLeftoverMillis <= 0) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.updateMaxTimeMS(id, cursorLeftoverMillis); } } else { // we've exhausted the cursor cursorCache.remove(id); } replyToQuery(0, request.p(), request.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cursor->getId() : 0); return; } else { LOG(3) << "could not find cursor " << id << " in cache for " << ns; replyToQuery(ResultFlag_CursorNotFound, request.p(), request.m(), 0, 0, 0); return; } }
virtual void queryOp( Request& r ) { QueryMessage q( r.d() ); LOG(3) << "single query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options : " << q.queryOptions << endl; if ( r.isCommand() ) { if ( handleSpecialNamespaces( r , q ) ) return; int loops = 5; while ( true ) { BSONObjBuilder builder; try { BSONObj cmdObj = q.query; { BSONElement e = cmdObj.firstElement(); if ( e.type() == Object && (e.fieldName()[0] == '$' ? str::equals("query", e.fieldName()+1) : str::equals("query", e.fieldName()))) cmdObj = e.embeddedObject(); } bool ok = Command::runAgainstRegistered(q.ns, cmdObj, builder, q.queryOptions); if ( ok ) { BSONObj x = builder.done(); replyToQuery(0, r.p(), r.m(), x); return; } break; } catch ( StaleConfigException& e ) { if ( loops <= 0 ) throw e; loops--; log() << "retrying command: " << q.query << endl; // For legacy reasons, ns may not actually be set in the exception :-( string staleNS = e.getns(); if( staleNS.size() == 0 ) staleNS = q.ns; ShardConnection::checkMyConnectionVersions( staleNS ); if( loops < 4 ) versionManager.forceRemoteCheckShardVersionCB( staleNS ); } catch ( AssertionException& e ) { e.getInfo().append( builder , "assertion" , "assertionCode" ); builder.append( "errmsg" , "db assertion failure" ); builder.append( "ok" , 0 ); BSONObj x = builder.done(); replyToQuery(0, r.p(), r.m(), x); return; } } string commandName = q.query.firstElementFieldName(); uassert(13390, "unrecognized command: " + commandName, _commandsSafeToPass.count(commandName) != 0); } doQuery( r , r.primaryShard() ); }
void Strategy::queryOp(OperationContext* txn, Request& request) { verify(!NamespaceString(request.getns()).isCommand()); Timer queryTimer; globalOpCounters.gotQuery(); QueryMessage q(request.d()); NamespaceString ns(q.ns); ClientBasic* client = txn->getClient(); AuthorizationSession* authSession = AuthorizationSession::get(client); Status status = authSession->checkAuthForFind(ns, false); audit::logQueryAuthzCheck(client, ns, q.query, status.code()); uassertStatusOK(status); LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options: " << q.queryOptions; if (q.ntoreturn == 1 && strstr(q.ns, ".$cmd")) throw UserException(8010, "something is wrong, shouldn't see a command here"); if (q.queryOptions & QueryOption_Exhaust) { uasserted(18526, string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns + " " + q.query.toString()); } // Spigot which controls whether OP_QUERY style find on mongos uses the new ClusterClientCursor // code path. // TODO: Delete the spigot and always use the new code. if (useClusterClientCursor) { ReadPreferenceSetting readPreference(ReadPreference::PrimaryOnly, TagSet::primaryOnly()); BSONElement rpElem; auto readPrefExtractStatus = bsonExtractTypedField( q.query, LiteParsedQuery::kWrappedReadPrefField, mongo::Object, &rpElem); if (readPrefExtractStatus.isOK()) { auto parsedRps = ReadPreferenceSetting::fromBSON(rpElem.Obj()); uassertStatusOK(parsedRps.getStatus()); readPreference = parsedRps.getValue(); } else if (readPrefExtractStatus != ErrorCodes::NoSuchKey) { uassertStatusOK(readPrefExtractStatus); } auto canonicalQuery = CanonicalQuery::canonicalize(q, WhereCallbackNoop()); uassertStatusOK(canonicalQuery.getStatus()); // If the $explain flag was set, we must run the operation on the shards as an explain // command rather than a find command. if (canonicalQuery.getValue()->getParsed().isExplain()) { const LiteParsedQuery& lpq = canonicalQuery.getValue()->getParsed(); BSONObj findCommand = lpq.asFindCommand(); // We default to allPlansExecution verbosity. auto verbosity = ExplainCommon::EXEC_ALL_PLANS; const bool secondaryOk = (readPreference.pref != ReadPreference::PrimaryOnly); rpc::ServerSelectionMetadata metadata(secondaryOk, readPreference); BSONObjBuilder explainBuilder; uassertStatusOK(ClusterFind::runExplain( txn, findCommand, lpq, verbosity, metadata, &explainBuilder)); BSONObj explainObj = explainBuilder.done(); replyToQuery(0, // query result flags request.p(), request.m(), static_cast<const void*>(explainObj.objdata()), explainObj.objsize(), 1, // numResults 0, // startingFrom CursorId(0)); return; } // Do the work to generate the first batch of results. This blocks waiting to get responses // from the shard(s). std::vector<BSONObj> batch; // 0 means the cursor is exhausted and // otherwise we assume that a cursor with the returned id can be retrieved via the // ClusterCursorManager auto cursorId = ClusterFind::runQuery(txn, *canonicalQuery.getValue(), readPreference, &batch); uassertStatusOK(cursorId.getStatus()); // TODO: this constant should be shared between mongos and mongod, and should // not be inside ShardedClientCursor. BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); // Fill out the response buffer. int numResults = 0; for (const auto& obj : batch) { buffer.appendBuf((void*)obj.objdata(), obj.objsize()); numResults++; } replyToQuery(0, // query result flags request.p(), request.m(), buffer.buf(), buffer.len(), numResults, 0, // startingFrom cursorId.getValue()); return; } QuerySpec qSpec((string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions); // Parse "$maxTimeMS". StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery(q.query); uassert(17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK()); if (_isSystemIndexes(q.ns) && doShardedIndexQuery(txn, request, qSpec)) { return; } ParallelSortClusteredCursor* cursor = new ParallelSortClusteredCursor(qSpec, CommandInfo()); verify(cursor); // TODO: Move out to Request itself, not strategy based try { cursor->init(txn); if (qSpec.isExplain()) { BSONObjBuilder explain_builder; cursor->explain(explain_builder); explain_builder.appendNumber("executionTimeMillis", static_cast<long long>(queryTimer.millis())); BSONObj b = explain_builder.obj(); replyToQuery(0, request.p(), request.m(), b); delete (cursor); return; } } catch (...) { delete cursor; throw; } // TODO: Revisit all of this when we revisit the sharded cursor cache if (cursor->getNumQueryShards() != 1) { // More than one shard (or zero), manage with a ShardedClientCursor // NOTE: We may also have *zero* shards here when the returnPartial flag is set. // Currently the code in ShardedClientCursor handles this. ShardedClientCursorPtr cc(new ShardedClientCursor(q, cursor)); BufBuilder buffer(ShardedClientCursor::INIT_REPLY_BUFFER_SIZE); int docCount = 0; const int startFrom = cc->getTotalSent(); bool hasMore = cc->sendNextBatch(q.ntoreturn, buffer, docCount); if (hasMore) { LOG(5) << "storing cursor : " << cc->getId(); int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis(); if (maxTimeMS.getValue() == 0) { // 0 represents "no limit". cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit; } else if (cursorLeftoverMillis <= 0) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.store(cc, cursorLeftoverMillis); } replyToQuery(0, request.p(), request.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cc->getId() : 0); } else { // Only one shard is used // Remote cursors are stored remotely, we shouldn't need this around. unique_ptr<ParallelSortClusteredCursor> cursorDeleter(cursor); ShardPtr shard = grid.shardRegistry()->getShard(txn, cursor->getQueryShardId()); verify(shard.get()); DBClientCursorPtr shardCursor = cursor->getShardCursor(shard->getId()); // Implicitly stores the cursor in the cache request.reply(*(shardCursor->getMessage()), shardCursor->originalHost()); // We don't want to kill the cursor remotely if there's still data left shardCursor->decouple(); } }
void Strategy::getMore( Request& r ) { Timer getMoreTimer; const char *ns = r.getns(); // TODO: Handle stale config exceptions here from coll being dropped or sharded during op // for now has same semantics as legacy request DBConfigPtr config = grid.getDBConfig( ns ); ShardPtr primary; ChunkManagerPtr info; config->getChunkManagerOrPrimary( ns, info, primary ); // // TODO: Cleanup cursor cache, consolidate into single codepath // int ntoreturn = r.d().pullInt(); long long id = r.d().pullInt64(); string host = cursorCache.getRef( id ); ShardedClientCursorPtr cursor = cursorCache.get( id ); int cursorMaxTimeMS = cursorCache.getMaxTimeMS( id ); // Cursor ids should not overlap between sharded and unsharded cursors massert( 17012, str::stream() << "duplicate sharded and unsharded cursor id " << id << " detected for " << ns << ", duplicated on host " << host, NULL == cursorCache.get( id ).get() || host.empty() ); ClientBasic* client = ClientBasic::getCurrent(); NamespaceString nsString(ns); AuthorizationSession* authSession = client->getAuthorizationSession(); Status status = authSession->checkAuthForGetMore( nsString, id ); audit::logGetMoreAuthzCheck( client, nsString, id, status.code() ); uassertStatusOK(status); if( !host.empty() ){ LOG(3) << "single getmore: " << ns << endl; // we used ScopedDbConnection because we don't get about config versions // not deleting data is handled elsewhere // and we don't want to call setShardVersion ScopedDbConnection conn(host); Message response; bool ok = conn->callRead( r.m() , response); uassert( 10204 , "dbgrid: getmore: error calling db", ok); bool hasMore = (response.singleData().getCursor() != 0); if ( !hasMore ) { cursorCache.removeRef( id ); } r.reply( response , "" /*conn->getServerAddress() */ ); conn.done(); return; } else if ( cursor ) { if ( cursorMaxTimeMS == kMaxTimeCursorTimeLimitExpired ) { cursorCache.remove( id ); uasserted( ErrorCodes::ExceededTimeLimit, "operation exceeded time limit" ); } // TODO: Try to match logic of mongod, where on subsequent getMore() we pull lots more data? BufBuilder buffer( ShardedClientCursor::INIT_REPLY_BUFFER_SIZE ); int docCount = 0; const int startFrom = cursor->getTotalSent(); bool hasMore = cursor->sendNextBatch( r, ntoreturn, buffer, docCount ); if ( hasMore ) { // still more data cursor->accessed(); if ( cursorMaxTimeMS != kMaxTimeCursorNoTimeLimit ) { // Update remaining amount of time in cursor cache. int cursorLeftoverMillis = cursorMaxTimeMS - getMoreTimer.millis(); if ( cursorLeftoverMillis <= 0 ) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.updateMaxTimeMS( id, cursorLeftoverMillis ); } } else { // we've exhausted the cursor cursorCache.remove( id ); } replyToQuery( 0, r.p(), r.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cursor->getId() : 0 ); return; } else { LOG( 3 ) << "could not find cursor " << id << " in cache for " << ns << endl; replyToQuery( ResultFlag_CursorNotFound , r.p() , r.m() , 0 , 0 , 0 ); return; } }
void Strategy::queryOp( Request& r ) { verify( !NamespaceString( r.getns() ).isCommand() ); Timer queryTimer; QueryMessage q( r.d() ); NamespaceString ns(q.ns); ClientBasic* client = ClientBasic::getCurrent(); AuthorizationSession* authSession = client->getAuthorizationSession(); Status status = authSession->checkAuthForQuery(ns, q.query); audit::logQueryAuthzCheck(client, ns, q.query, status.code()); uassertStatusOK(status); LOG(3) << "query: " << q.ns << " " << q.query << " ntoreturn: " << q.ntoreturn << " options: " << q.queryOptions << endl; if ( q.ntoreturn == 1 && strstr(q.ns, ".$cmd") ) throw UserException( 8010 , "something is wrong, shouldn't see a command here" ); if (q.queryOptions & QueryOption_Exhaust) { uasserted(18526, string("the 'exhaust' query option is invalid for mongos queries: ") + q.ns + " " + q.query.toString()); } QuerySpec qSpec( (string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions ); // Parse "$maxTimeMS". StatusWith<int> maxTimeMS = LiteParsedQuery::parseMaxTimeMSQuery( q.query ); uassert( 17233, maxTimeMS.getStatus().reason(), maxTimeMS.isOK() ); if ( _isSystemIndexes( q.ns ) && doShardedIndexQuery( r, qSpec )) { return; } ParallelSortClusteredCursor * cursor = new ParallelSortClusteredCursor( qSpec, CommandInfo() ); verify( cursor ); // TODO: Move out to Request itself, not strategy based try { cursor->init(); if ( qSpec.isExplain() ) { BSONObjBuilder explain_builder; cursor->explain( explain_builder ); explain_builder.appendNumber( "millis", static_cast<long long>(queryTimer.millis()) ); BSONObj b = explain_builder.obj(); replyToQuery( 0 , r.p() , r.m() , b ); delete( cursor ); return; } } catch(...) { delete cursor; throw; } // TODO: Revisit all of this when we revisit the sharded cursor cache if (cursor->getNumQueryShards() != 1) { // More than one shard (or zero), manage with a ShardedClientCursor // NOTE: We may also have *zero* shards here when the returnPartial flag is set. // Currently the code in ShardedClientCursor handles this. ShardedClientCursorPtr cc (new ShardedClientCursor( q , cursor )); BufBuilder buffer( ShardedClientCursor::INIT_REPLY_BUFFER_SIZE ); int docCount = 0; const int startFrom = cc->getTotalSent(); bool hasMore = cc->sendNextBatch( r, q.ntoreturn, buffer, docCount ); if ( hasMore ) { LOG(5) << "storing cursor : " << cc->getId() << endl; int cursorLeftoverMillis = maxTimeMS.getValue() - queryTimer.millis(); if ( maxTimeMS.getValue() == 0 ) { // 0 represents "no limit". cursorLeftoverMillis = kMaxTimeCursorNoTimeLimit; } else if ( cursorLeftoverMillis <= 0 ) { cursorLeftoverMillis = kMaxTimeCursorTimeLimitExpired; } cursorCache.store( cc, cursorLeftoverMillis ); } replyToQuery( 0, r.p(), r.m(), buffer.buf(), buffer.len(), docCount, startFrom, hasMore ? cc->getId() : 0 ); } else{ // Only one shard is used // Remote cursors are stored remotely, we shouldn't need this around. scoped_ptr<ParallelSortClusteredCursor> cursorDeleter( cursor ); ShardPtr shard = cursor->getQueryShard(); verify( shard.get() ); DBClientCursorPtr shardCursor = cursor->getShardCursor(*shard); // Implicitly stores the cursor in the cache r.reply( *(shardCursor->getMessage()) , shardCursor->originalHost() ); // We don't want to kill the cursor remotely if there's still data left shardCursor->decouple(); } }
virtual void queryOp( Request& r ){ QueryMessage q( r.d() ); log(3) << "shard query: " << q.ns << " " << q.query << endl; if ( q.ntoreturn == 1 && strstr(q.ns, ".$cmd") ) throw UserException( 8010 , "something is wrong, shouldn't see a command here" ); ChunkManagerPtr info = r.getChunkManager(); assert( info ); Query query( q.query ); vector<shared_ptr<ChunkRange> > shards; info->getChunksForQuery( shards , query.getFilter() ); set<ServerAndQuery> servers; for ( vector<shared_ptr<ChunkRange> >::iterator i = shards.begin(); i != shards.end(); i++ ){ shared_ptr<ChunkRange> c = *i; //servers.insert( ServerAndQuery( c->getShard().getConnString() , BSONObj() ) ); // ERH ERH ERH servers.insert( ServerAndQuery( c->getShard().getConnString() , c->getFilter() ) ); } if ( logLevel > 4 ){ StringBuilder ss; ss << " shard query servers: " << servers.size() << '\n'; for ( set<ServerAndQuery>::iterator i = servers.begin(); i!=servers.end(); i++ ){ const ServerAndQuery& s = *i; ss << " " << s.toString() << '\n'; } log() << ss.str(); } ClusteredCursor * cursor = 0; BSONObj sort = query.getSort(); if ( sort.isEmpty() ){ // 1. no sort, can just hit them in serial cursor = new SerialServerClusteredCursor( servers , q ); } else { int shardKeyOrder = info->getShardKey().canOrder( sort ); if ( shardKeyOrder ){ // 2. sort on shard key, can do in serial intelligently set<ServerAndQuery> buckets; for ( vector<shared_ptr<ChunkRange> >::iterator i = shards.begin(); i != shards.end(); i++ ){ shared_ptr<ChunkRange> s = *i; buckets.insert( ServerAndQuery( s->getShard().getConnString() , s->getFilter() , s->getMin() ) ); } cursor = new SerialServerClusteredCursor( buckets , q , shardKeyOrder ); } else { // 3. sort on non-sharded key, pull back a portion from each server and iterate slowly cursor = new ParallelSortClusteredCursor( servers , q , sort ); } } assert( cursor ); log(5) << " cursor type: " << cursor->type() << endl; shardedCursorTypes.hit( cursor->type() ); if ( query.isExplain() ){ BSONObj explain = cursor->explain(); replyToQuery( 0 , r.p() , r.m() , explain ); delete( cursor ); return; } ShardedClientCursorPtr cc (new ShardedClientCursor( q , cursor )); if ( ! cc->sendNextBatch( r ) ){ return; } log(6) << "storing cursor : " << cc->getId() << endl; cursorCache.store( cc ); }
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); curop.debug().ns = ns; curop.debug().ntoreturn = ntoreturn; curop.debug().cursorid = cursorid; shared_ptr<AssertionException> ex; scoped_ptr<Timer> timer; int pass = 0; bool exhaust = false; QueryResult* msgdata = 0; GTID last; bool isOplog = false; while( 1 ) { bool isCursorAuthorized = false; try { uassert( 16258, str::stream() << "Invalid ns [" << ns << "]", NamespaceString::isValid(ns) ); Status status = cc().getAuthorizationManager()->checkAuthForGetMore(ns); uassert(16543, status.reason(), status.isOK()); // I (Zardosht), am not crazy about this, but I cannot think of // better alternatives at the moment. The high level goal is to find // a way to do a wait without having a read lock held // via Client::ReadContext. Unfortunately, we can't get the exact position // of the cursor without accessing it, which required a read lock. // So, we do this, which is a good estimate. // // Note this is similar to what vanilla MongoDB does. // // in the first pass, we extract the minimum live GTID. This must be // greater than or equal to the existing cursor's position. // In the second pass, we wait for the GTID manager to have a // minumum live GTID greater than what we saw in the first pass. // This new GTID will be greater than the cursor's starting position, // and therefore the cursor should have more data to look at. // It is theoretically possible that one day, the cursor will still // return no new data because all new GTIDs in between these // two values aborted, but that is not possible right now. Any GTID // assigned is done so with the intent to commit, and tokumx // aborts if a coommit is not successful. if (str::startsWith(ns, "local.oplog.") && theReplSet){ isOplog = true; if (pass == 0) { last = theReplSet->gtidManager->getMinLiveGTID(); } else { theReplSet->gtidManager->waitForDifferentMinLive( last, 2000 // ms, this will be called twice ); } } LOCK_REASON(lockReason, "getMore"); Client::ReadContext ctx(ns, lockReason); // call this readlocked so state can't change replVerifyReadsOk(); msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, &isCursorAuthorized); } catch ( AssertionException& e ) { if ( isCursorAuthorized ) { // If a cursor with id 'cursorid' was authorized, it may have been advanced // before an exception terminated processGetMore. Erase the ClientCursor // because it may now be out of sync with the client's iteration state. // SERVER-7952 // TODO Temporary code, see SERVER-4563 for a cleanup overview. ClientCursor::erase( cursorid ); } ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; break; } pass++; if (msgdata == 0) { // this should only happen with QueryOption_AwaitData exhaust = false; massert(13073, "shutting down", !inShutdown() ); if (!isOplog) { if ( ! timer ) { timer.reset( new Timer() ); } else { if ( timer->seconds() >= 4 ) { // after about 4 seconds, return. pass stops at 1000 normally. // we want to return occasionally so slave can checkpoint. pass = 10000; } } if (debug) { sleepmillis(20); } else { sleepmillis(2); } } else { // in the case where we are the oplog, using // waitForDifferentMinLive is sufficient. That // waits for 4 seconds, as the timer above does. // So, we we don't need more than 2 passes. if (pass > 1) { pass = 10000; } } // should eventually clean this up a bit if (isOplog) { curop.setExpectedLatencyMs( 4100 ); } else { // not sure if this 1100 is still wise. curop.setExpectedLatencyMs( 1100 + timer->millis() ); } continue; } break; }; if (ex) { exhaust = false; BSONObjBuilder err; ex->getInfo().append( err ); BSONObj errObj = err.done(); if (!ex->interrupted()) { log() << errObj << endl; } curop.debug().exceptionInfo = ex->getInfo(); if (ex->getCode() == 13436) { replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj); curop.debug().responseLength = dbresponse.response->header()->dataLen(); curop.debug().nreturned = 1; return ok; } msgdata = emptyMoreResult(cursorid); } Message *resp = new Message(); resp->setData(msgdata, true); curop.debug().responseLength = resp->header()->dataLen(); curop.debug().nreturned = msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.header()->id; if( exhaust ) { curop.debug().exhaust = true; dbresponse.exhaustNS = ns; } return ok; }
// Returns false when request includes 'end' void assembleResponse( Message &m, DbResponse &dbresponse, const HostAndPort& remote ) { // before we lock... int op = m.operation(); bool isCommand = false; const char *ns = m.singleData()->_data + 4; if ( op == dbQuery ) { if( strstr(ns, ".$cmd") ) { isCommand = true; opwrite(m); if( strstr(ns, ".$cmd.sys.") ) { if( strstr(ns, "$cmd.sys.inprog") ) { inProgCmd(m, dbresponse); return; } if( strstr(ns, "$cmd.sys.killop") ) { killOp(m, dbresponse); return; } if( strstr(ns, "$cmd.sys.unlock") ) { // Reply to this deprecated operation with the standard "not locked" // error for legacy reasons. replyToQuery(0, m, dbresponse, BSON("ok" << 0 << "errmsg" << "not locked")); return; } } } else { opread(m); } } else if( op == dbGetMore ) { opread(m); } else { opwrite(m); } globalOpCounters.gotOp( op , isCommand ); Client& c = cc(); c.getAuthorizationManager()->startRequest(); // initialize the default OpSettings, OpSettings settings; c.setOpSettings(settings); auto_ptr<CurOp> nestedOp; CurOp* currentOpP = c.curop(); if ( currentOpP->active() ) { nestedOp.reset( new CurOp( &c , currentOpP ) ); currentOpP = nestedOp.get(); } CurOp& currentOp = *currentOpP; currentOp.reset(remote,op); OpDebug& debug = currentOp.debug(); debug.op = op; long long logThreshold = cmdLine.slowMS; bool shouldLog = logLevel >= 1; if ( op == dbQuery ) { try { checkPossiblyShardedMessageWithoutLock(m); receivedQuery(c, dbresponse, m); } catch (MustHandleShardedMessage &e) { e.handleShardedMessage(m, &dbresponse); return; } } else if ( op == dbGetMore ) { if ( ! receivedGetMore(dbresponse, m, currentOp) ) shouldLog = true; } else if ( op == dbMsg ) { // deprecated - replaced by commands char *p = m.singleData()->_data; int len = strlen(p); if ( len > 400 ) out() << curTimeMillis64() % 10000 << " long msg received, len:" << len << endl; Message *resp = new Message(); if ( strcmp( "end" , p ) == 0 ) resp->setData( opReply , "dbMsg end no longer supported" ); else resp->setData( opReply , "i am fine - dbMsg deprecated"); dbresponse.response = resp; dbresponse.responseTo = m.header()->id; } else { try { // The following operations all require authorization. // dbInsert, dbUpdate and dbDelete can be easily pre-authorized, // here, but dbKillCursors cannot. if ( op == dbKillCursors ) { currentOp.ensureStarted(); logThreshold = 10; receivedKillCursors(m); } else if ( !NamespaceString::isValid(ns) ) { // Only killCursors doesn't care about namespaces uassert( 16257, str::stream() << "Invalid ns [" << ns << "]", false ); } else if ( op == dbInsert ) { receivedInsert(m, currentOp); } else if ( op == dbUpdate ) { receivedUpdate(m, currentOp); } else if ( op == dbDelete ) { receivedDelete(m, currentOp); } else { mongo::log() << " operation isn't supported: " << op << endl; currentOp.done(); shouldLog = true; } } catch ( UserException& ue ) { tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << ue.toString() << endl; debug.exceptionInfo = ue.getInfo(); if (ue.getCode() == storage::ASSERT_IDS::LockDeadlock) { shouldLog = true; } } catch ( AssertionException& e ) { tlog(3) << " Caught Assertion in " << opToString(op) << ", continuing " << e.toString() << endl; debug.exceptionInfo = e.getInfo(); shouldLog = true; } } currentOp.ensureStarted(); currentOp.done(); debug.executionTime = currentOp.totalTimeMillis(); logThreshold += currentOp.getExpectedLatencyMs(); if ( (shouldLog || debug.executionTime > logThreshold) && !debug.vetoLog(currentOp) ) { mongo::tlog() << debug.report( currentOp ) << endl; } if ( currentOp.shouldDBProfile( debug.executionTime ) ) { // performance profiling is on if ( Lock::isReadLocked() ) { LOG(1) << "note: not profiling because recursive read lock" << endl; } else { LOCK_REASON(lockReason, "writing to system.profile collection"); try { Lock::DBRead lk( currentOp.getNS(), lockReason ); lockedDoProfile( c, op, currentOp ); } catch (RetryWithWriteLock &e) { Lock::DBWrite lk( currentOp.getNS(), lockReason ); lockedDoProfile( c, op, currentOp ); } } } debug.recordStats(); debug.reset(); } /* assembleResponse() */