std::string runQuery(OperationContext* opCtx, QueryMessage& q, const NamespaceString& nss, Message& result) { CurOp& curOp = *CurOp::get(opCtx); curOp.ensureStarted(); uassert(ErrorCodes::InvalidNamespace, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid()); invariant(!nss.isCommand()); // Set CurOp information. const auto upconvertedQuery = upconvertQueryEntry(q.query, nss, q.ntoreturn, q.ntoskip); beginQueryOp(opCtx, nss, upconvertedQuery, q.ntoreturn, q.ntoskip); // Parse the qm into a CanonicalQuery. const boost::intrusive_ptr<ExpressionContext> expCtx; auto cq = uassertStatusOKWithContext( CanonicalQuery::canonicalize(opCtx, q, expCtx, ExtensionsCallbackReal(opCtx, &nss), MatchExpressionParser::kAllowAllSpecialFeatures), "Can't canonicalize query"); invariant(cq.get()); LOG(5) << "Running query:\n" << redact(cq->toString()); LOG(2) << "Running query: " << redact(cq->toStringShort()); // Parse, canonicalize, plan, transcribe, and get a plan executor. AutoGetCollectionForReadCommand ctx(opCtx, nss, AutoGetCollection::ViewMode::kViewsForbidden); Collection* const collection = ctx.getCollection(); { const QueryRequest& qr = cq->getQueryRequest(); // Allow the query to run on secondaries if the read preference permits it. If no read // preference was specified, allow the query to run iff slaveOk has been set. const bool slaveOK = qr.hasReadPref() ? uassertStatusOK(ReadPreferenceSetting::fromContainingBSON(q.query)) .canRunOnSecondary() : qr.isSlaveOk(); uassertStatusOK( repl::ReplicationCoordinator::get(opCtx)->checkCanServeReadsFor(opCtx, nss, slaveOK)); } // We have a parsed query. Time to get the execution plan for it. auto exec = uassertStatusOK(getExecutorLegacyFind(opCtx, collection, nss, std::move(cq))); const QueryRequest& qr = exec->getCanonicalQuery()->getQueryRequest(); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (qr.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages( exec.get(), collection, ExplainOptions::Verbosity::kExecAllPlans, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // Set query result fields. QueryResult::View qr = bb.buf(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curOp.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(bb.release()); return ""; } // Handle query option $maxTimeMS (not used with commands). if (qr.getMaxTimeMS() > 0) { uassert(40116, "Illegal attempt to set operation deadline within DBDirectClient", !opCtx->getClient()->isInDirectClient()); opCtx->setDeadlineAfterNowBy(Milliseconds{qr.getMaxTimeMS()}); } opCtx->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(FindCommon::kInitReplyBufferSize); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; BSONObj obj; PlanExecutor::ExecState state; // Get summary info about which plan the executor is using. { stdx::lock_guard<Client> lk(*opCtx->getClient()); curOp.setPlanSummary_inlock(Explain::getPlanSummary(exec.get())); } while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If we can't fit this result inside the current batch, then we stash it for later. if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) { exec->enqueue(obj); break; } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; if (FindCommon::enoughForFirstBatch(qr, numResults)) { LOG(5) << "Enough for first batch, wantMore=" << qr.wantMore() << " ntoreturn=" << qr.getNToReturn().value_or(0) << " numResults=" << numResults; break; } } // Caller expects exceptions thrown in certain cases. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { error() << "Plan executor error during find: " << PlanExecutor::statestr(state) << ", stats: " << redact(Explain::getWinningPlanStats(exec.get())); uassertStatusOKWithContext(WorkingSetCommon::getMemberObjectStatus(obj), "Executor error during OP_QUERY find"); MONGO_UNREACHABLE; } // Before saving the cursor, ensure that whatever plan we established happened with the expected // collection version auto css = CollectionShardingState::get(opCtx, nss); css->checkShardVersionOrThrow(opCtx); // Fill out CurOp based on query results. If we have a cursorid, we will fill out CurOp with // this cursorid later. long long ccId = 0; if (shouldSaveCursor(opCtx, collection, state, exec.get())) { // We won't use the executor until it's getMore'd. exec->saveState(); exec->detachFromOperationContext(); // Allocate a new ClientCursor and register it with the cursor manager. ClientCursorPin pinnedCursor = collection->getCursorManager()->registerCursor( opCtx, {std::move(exec), nss, AuthorizationSession::get(opCtx->getClient())->getAuthenticatedUserNames(), opCtx->recoveryUnit()->getReadConcernLevel(), upconvertedQuery}); ccId = pinnedCursor.getCursor()->cursorid(); LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults << " results"; // TODO document if (qr.isExhaust()) { curOp.debug().exhaust = true; } pinnedCursor.getCursor()->setPos(numResults); // We assume that cursors created through a DBDirectClient are always used from their // original OperationContext, so we do not need to move time to and from the cursor. if (!opCtx->getClient()->isInDirectClient()) { // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). pinnedCursor.getCursor()->setLeftoverMaxTimeMicros(opCtx->getRemainingMaxTimeMicros()); } endQueryOp(opCtx, collection, *pinnedCursor.getCursor()->getExecutor(), numResults, ccId); } else { LOG(5) << "Not caching executor but returning " << numResults << " results."; endQueryOp(opCtx, collection, *exec, numResults, ccId); } // Fill out the output buffer's header. QueryResult::View queryResultView = bb.buf(); queryResultView.setCursorId(ccId); queryResultView.setResultFlagsToOk(); queryResultView.msgdata().setLen(bb.len()); queryResultView.msgdata().setOperation(opReply); queryResultView.setStartingFrom(0); queryResultView.setNReturned(numResults); // Add the results from the query into the output buffer. result.setData(bb.release()); // curOp.debug().exhaust is set above. return curOp.debug().exhaust ? nss.ns() : ""; }
std::string runQuery(OperationContext* txn, QueryMessage& q, const NamespaceString& nss, Message& result) { CurOp& curOp = *CurOp::get(txn); uassert(ErrorCodes::InvalidNamespace, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid()); invariant(!nss.isCommand()); // Set CurOp information. beginQueryOp(txn, nss, q.query, q.ntoreturn, q.ntoskip); // Parse the qm into a CanonicalQuery. auto statusWithCQ = CanonicalQuery::canonicalize(txn, q, ExtensionsCallbackReal(txn, &nss)); if (!statusWithCQ.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << statusWithCQ.getStatus().toString()); } unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); invariant(cq.get()); LOG(5) << "Running query:\n" << redact(cq->toString()); LOG(2) << "Running query: " << redact(cq->toStringShort()); // Parse, canonicalize, plan, transcribe, and get a plan executor. AutoGetCollectionOrViewForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); if (ctx.getView()) { uasserted(ErrorCodes::CommandNotSupportedOnView, str::stream() << "Namespace " << nss.ns() << " is a view. Legacy find operations are not supported on views. " << "Only clients which support the find command can be used to query views."); } // We have a parsed query. Time to get the execution plan for it. std::unique_ptr<PlanExecutor> exec = uassertStatusOK( getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO)); const QueryRequest& qr = exec->getCanonicalQuery()->getQueryRequest(); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (qr.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), collection, ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // Set query result fields. QueryResult::View qr = bb.buf(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curOp.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(bb.release()); return ""; } // Handle query option $maxTimeMS (not used with commands). if (qr.getMaxTimeMS() > 0) { uassert(40116, "Illegal attempt to set operation deadline within DBDirectClient", !txn->getClient()->isInDirectClient()); txn->setDeadlineAfterNowBy(Milliseconds{qr.getMaxTimeMS()}); } txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = qr.isSlaveOk() || qr.hasReadPref(); Status serveReadsStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, slaveOK); uassertStatusOK(serveReadsStatus); // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(FindCommon::kInitReplyBufferSize); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; BSONObj obj; PlanExecutor::ExecState state; // Get summary info about which plan the executor is using. { stdx::lock_guard<Client> lk(*txn->getClient()); curOp.setPlanSummary_inlock(Explain::getPlanSummary(exec.get())); } while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If we can't fit this result inside the current batch, then we stash it for later. if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) { exec->enqueue(obj); break; } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (qr.isOplogReplay()) { BSONElement e = obj["ts"]; if (Date == e.type() || bsonTimestamp == e.type()) { slaveReadTill = e.timestamp(); } } if (FindCommon::enoughForFirstBatch(qr, numResults)) { LOG(5) << "Enough for first batch, wantMore=" << qr.wantMore() << " ntoreturn=" << qr.getNToReturn().value_or(0) << " numResults=" << numResults; break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { error() << "Plan executor error during find: " << PlanExecutor::statestr(state) << ", stats: " << redact(Explain::getWinningPlanStats(exec.get())); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // Before saving the cursor, ensure that whatever plan we established happened with the expected // collection version auto css = CollectionShardingState::get(txn, nss); css->checkShardVersionOrThrow(txn); // Fill out CurOp based on query results. If we have a cursorid, we will fill out CurOp with // this cursorid later. long long ccId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // We won't use the executor until it's getMore'd. exec->saveState(); exec->detachFromOperationContext(); // Allocate a new ClientCursor and register it with the cursor manager. ClientCursorPin pinnedCursor = collection->getCursorManager()->registerCursor( {exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), qr.getOptions(), qr.getFilter()}); ccId = pinnedCursor.getCursor()->cursorid(); LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults << " results"; // TODO document if (qr.isOplogReplay() && !slaveReadTill.isNull()) { pinnedCursor.getCursor()->slaveReadTill(slaveReadTill); } // TODO document if (qr.isExhaust()) { curOp.debug().exhaust = true; } pinnedCursor.getCursor()->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). pinnedCursor.getCursor()->setLeftoverMaxTimeMicros(txn->getRemainingMaxTimeMicros()); endQueryOp(txn, collection, *pinnedCursor.getCursor()->getExecutor(), numResults, ccId); } else { LOG(5) << "Not caching executor but returning " << numResults << " results."; endQueryOp(txn, collection, *exec, numResults, ccId); } // Fill out the output buffer's header. QueryResult::View queryResultView = bb.buf(); queryResultView.setCursorId(ccId); queryResultView.setResultFlagsToOk(); queryResultView.msgdata().setLen(bb.len()); queryResultView.msgdata().setOperation(opReply); queryResultView.setStartingFrom(0); queryResultView.setNReturned(numResults); // Add the results from the query into the output buffer. result.setData(bb.release()); // curOp.debug().exhaust is set above. return curOp.debug().exhaust ? nss.ns() : ""; }