void run() { OldClientWriteContext ctx(&_txn, nss.ns()); insert(BSON("a" << 1 << "b" << 1)); Collection* collection = ctx.getCollection(); BSONObj filterObj = fromjson("{_id: {$gt: 0}, b: {$gt: 0}}"); PlanExecutor* exec = makeCollScanExec(collection, filterObj); // Make a client cursor from the runner. ClientCursor* cc = new ClientCursor(collection->getCursorManager(), exec, nss.ns(), false, 0, BSONObj()); ClientCursorPin ccPin(collection->getCursorManager(), cc->cursorid()); // If the cursor is pinned, it sticks around, // even after invalidation. ASSERT_EQUALS(1U, numCursors()); const std::string invalidateReason("InvalidatePinned Test"); collection->getCursorManager()->invalidateAll(false, invalidateReason); ASSERT_EQUALS(1U, numCursors()); // The invalidation should have killed the runner. BSONObj objOut; ASSERT_EQUALS(PlanExecutor::DEAD, exec->getNext(&objOut, NULL)); ASSERT(WorkingSetCommon::isValidStatusMemberObject(objOut)); const Status status = WorkingSetCommon::getMemberObjectStatus(objOut); ASSERT(status.reason().find(invalidateReason) != string::npos); // Deleting the underlying cursor should cause the // number of cursors to return to 0. ccPin.deleteUnderlying(); ASSERT_EQUALS(0U, numCursors()); }
// static long long DeleteStage::getNumDeleted(const PlanExecutor& exec) { invariant(exec.getRootStage()->isEOF()); invariant(exec.getRootStage()->stageType() == STAGE_DELETE); DeleteStage* deleteStage = static_cast<DeleteStage*>(exec.getRootStage()); const DeleteStats* deleteStats = static_cast<const DeleteStats*>(deleteStage->getSpecificStats()); return deleteStats->docsDeleted; }
/** * Uses 'cursor' and 'request' to fill out 'nextBatch' with the batch of result documents to * be returned by this getMore. * * Returns the number of documents in the batch in *numResults, which must be initialized to * zero by the caller. Returns the final ExecState returned by the cursor in *state. * * Returns an OK status if the batch was successfully generated, and a non-OK status if the * PlanExecutor encounters a failure. */ Status generateBatch(ClientCursor* cursor, const GetMoreRequest& request, BSONArrayBuilder* nextBatch, PlanExecutor::ExecState* state, int* numResults) { PlanExecutor* exec = cursor->getExecutor(); const bool isAwaitData = isCursorAwaitData(cursor); // If an awaitData getMore is killed during this process due to our max time expiring at // an interrupt point, we just continue as normal and return rather than reporting a // timeout to the user. BSONObj obj; try { while (PlanExecutor::ADVANCED == (*state = exec->getNext(&obj, NULL))) { // If adding this object will cause us to exceed the BSON size limit, then we // stash it for later. if (nextBatch->len() + obj.objsize() > BSONObjMaxUserSize && *numResults > 0) { exec->enqueue(obj); break; } // Add result to output buffer. nextBatch->append(obj); (*numResults)++; if (enoughForGetMore(request.batchSize.value_or(0), *numResults, nextBatch->len())) { break; } } } catch (const UserException& except) { if (isAwaitData && except.getCode() == ErrorCodes::ExceededTimeLimit) { // We ignore exceptions from interrupt points due to max time expiry for // awaitData cursors. } else { throw; } } if (PlanExecutor::FAILURE == *state) { const std::unique_ptr<PlanStageStats> stats(exec->getStats()); error() << "GetMore executor error, stats: " << Explain::statsToBSON(*stats); return Status(ErrorCodes::OperationFailed, str::stream() << "GetMore executor error: " << WorkingSetCommon::toStatusString(obj)); } else if (PlanExecutor::DEAD == *state) { return Status(ErrorCodes::OperationFailed, str::stream() << "Plan executor killed during getMore command, " << "ns: " << request.nss.ns()); } return Status::OK(); }
// static void Explain::getSummaryStats(const PlanExecutor& exec, PlanSummaryStats* statsOut) { invariant(NULL != statsOut); PlanStage* root = exec.getRootStage(); // We can get some of the fields we need from the common stats stored in the // root stage of the plan tree. const CommonStats* common = root->getCommonStats(); statsOut->nReturned = common->advanced; statsOut->executionTimeMillis = common->executionTimeMillis; // The other fields are aggregations over the stages in the plan tree. We flatten // the tree into a list and then compute these aggregations. std::vector<const PlanStage*> stages; flattenExecTree(root, &stages); for (size_t i = 0; i < stages.size(); i++) { statsOut->totalKeysExamined += getKeysExamined(stages[i]->stageType(), stages[i]->getSpecificStats()); statsOut->totalDocsExamined += getDocsExamined(stages[i]->stageType(), stages[i]->getSpecificStats()); if (STAGE_IDHACK == stages[i]->stageType()) { statsOut->isIdhack = true; } if (STAGE_SORT == stages[i]->stageType()) { statsOut->hasSortStage = true; } } }
int countResults(const IndexScanParams& params, BSONObj filterObj = BSONObj()) { Client::ReadContext ctx(ns()); PlanExecutor runner; StatusWithMatchExpression swme = MatchExpressionParser::parse(filterObj); verify(swme.isOK()); auto_ptr<MatchExpression> filterExpr(swme.getValue()); runner.setRoot(new IndexScan(params, runner.getWorkingSet(), filterExpr.get())); int count = 0; for (BSONObj obj; Runner::RUNNER_ADVANCED == runner.getNext(&obj); ) { ++count; } return count; }
void endQueryOp(OperationContext* txn, Collection* collection, const PlanExecutor& exec, int dbProfilingLevel, long long numResults, CursorId cursorId) { auto curop = CurOp::get(txn); // Fill out basic curop query exec properties. curop->debug().nreturned = numResults; curop->debug().cursorid = (0 == cursorId ? -1 : cursorId); curop->debug().cursorExhausted = (0 == cursorId); // Fill out curop based on explain summary statistics. PlanSummaryStats summaryStats; Explain::getSummaryStats(exec, &summaryStats); curop->debug().hasSortStage = summaryStats.hasSortStage; curop->debug().keysExamined = summaryStats.totalKeysExamined; curop->debug().docsExamined = summaryStats.totalDocsExamined; curop->debug().idhack = summaryStats.isIdhack; if (collection) { collection->infoCache()->notifyOfQuery(txn, summaryStats.indexesUsed); } const logger::LogComponent commandLogComponent = logger::LogComponent::kCommand; const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1); // Set debug information for consumption by the profiler and slow query log. if (dbProfilingLevel > 0 || curop->elapsedMillis() > serverGlobalParams.slowMS || logger::globalLogDomain()->shouldLog(commandLogComponent, logLevelOne)) { // Generate plan summary string. stdx::lock_guard<Client>(*txn->getClient()); curop->setPlanSummary_inlock(Explain::getPlanSummary(&exec)); } // Set debug information for consumption by the profiler only. if (dbProfilingLevel > 0) { // Get BSON stats. unique_ptr<PlanStageStats> execStats(exec.getStats()); BSONObjBuilder statsBob; Explain::statsToBSON(*execStats, &statsBob); curop->debug().execStats.set(statsBob.obj()); // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj. if (curop->debug().execStats.tooBig() && !curop->getPlanSummary().empty()) { BSONObjBuilder bob; bob.append("summary", curop->getPlanSummary()); curop->debug().execStats.set(bob.done()); } } }
/** * Runs a query using the following steps: * --Parsing. * --Acquire locks. * --Plan query, obtaining an executor that can run it. * --Generate the first batch. * --Save state for getMore, transferring ownership of the executor to a ClientCursor. * --Generate response to send to the client. */ bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { const NamespaceString nss(parseNs(dbname, cmdObj)); if (!nss.isValid() || nss.isCommand() || nss.isSpecialCommand()) { return appendCommandStatus(result, {ErrorCodes::InvalidNamespace, str::stream() << "Invalid collection name: " << nss.ns()}); } // Although it is a command, a find command gets counted as a query. globalOpCounters.gotQuery(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus( result, Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()")); } // Parse the command BSON to a QueryRequest. const bool isExplain = false; auto qrStatus = QueryRequest::makeFromFindCommand(nss, cmdObj, isExplain); if (!qrStatus.isOK()) { return appendCommandStatus(result, qrStatus.getStatus()); } auto& qr = qrStatus.getValue(); // Validate term before acquiring locks, if provided. if (auto term = qr->getReplicationTerm()) { auto replCoord = repl::ReplicationCoordinator::get(txn); Status status = replCoord->updateTerm(txn, *term); // Note: updateTerm returns ok if term stayed the same. if (!status.isOK()) { return appendCommandStatus(result, status); } } // Fill out curop information. // // We pass negative values for 'ntoreturn' and 'ntoskip' to indicate that these values // should be omitted from the log line. Limit and skip information is already present in the // find command parameters, so these fields are redundant. const int ntoreturn = -1; const int ntoskip = -1; beginQueryOp(txn, nss, cmdObj, ntoreturn, ntoskip); // Finish the parsing step by using the QueryRequest to create a CanonicalQuery. ExtensionsCallbackReal extensionsCallback(txn, &nss); auto statusWithCQ = CanonicalQuery::canonicalize(txn, std::move(qr), extensionsCallback); if (!statusWithCQ.isOK()) { return appendCommandStatus(result, statusWithCQ.getStatus()); } std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); // Acquire locks. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); // Get the execution plan for the query. auto statusWithPlanExecutor = getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); { stdx::lock_guard<Client>(*txn->getClient()); CurOp::get(txn)->setPlanSummary_inlock(Explain::getPlanSummary(exec.get())); } if (!collection) { // No collection. Just fill out curop indicating that there were zero results and // there is no ClientCursor id, and then return. const long long numResults = 0; const CursorId cursorId = 0; endQueryOp(txn, collection, *exec, numResults, cursorId); appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result); return true; } const QueryRequest& originalQR = exec->getCanonicalQuery()->getQueryRequest(); // Stream query results, adding them to a BSONArray as we go. CursorResponseBuilder firstBatch(/*isInitialResponse*/ true, &result); BSONObj obj; PlanExecutor::ExecState state = PlanExecutor::ADVANCED; long long numResults = 0; while (!FindCommon::enoughForFirstBatch(originalQR, numResults) && PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If we can't fit this result inside the current batch, then we stash it for later. if (!FindCommon::haveSpaceForNext(obj, numResults, firstBatch.bytesUsed())) { exec->enqueue(obj); break; } // Add result to output buffer. firstBatch.append(obj); numResults++; } // Throw an assertion if query execution fails for any reason. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { firstBatch.abandon(); error() << "Plan executor error during find command: " << PlanExecutor::statestr(state) << ", stats: " << Explain::getWinningPlanStats(exec.get()); return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, str::stream() << "Executor error during find command: " << WorkingSetCommon::toStatusString(obj))); } // Before saving the cursor, ensure that whatever plan we established happened with the // expected collection version auto css = CollectionShardingState::get(txn, nss); css->checkShardVersionOrThrow(txn); // Set up the cursor for getMore. CursorId cursorId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // Register the execution plan inside a ClientCursor. Ownership of the PlanExecutor is // transferred to the ClientCursor. // // First unregister the PlanExecutor so it can be re-registered with ClientCursor. exec->deregisterExec(); // Create a ClientCursor containing this plan executor. We don't have to worry about // leaking it as it's inserted into a global map by its ctor. ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), originalQR.getOptions(), cmdObj.getOwned()); cursorId = cursor->cursorid(); invariant(!exec); PlanExecutor* cursorExec = cursor->getExecutor(); // State will be restored on getMore. cursorExec->saveState(); cursorExec->detachFromOperationContext(); cursor->setLeftoverMaxTimeMicros(txn->getRemainingMaxTimeMicros()); cursor->setPos(numResults); // Fill out curop based on the results. endQueryOp(txn, collection, *cursorExec, numResults, cursorId); } else { endQueryOp(txn, collection, *exec, numResults, cursorId); } // Generate the response object to send to the client. firstBatch.done(cursorId, nss.ns()); return true; }
/** * Called by db/instance.cpp. This is the getMore entry point. * * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls * when this method returns an empty result, incrementing pass on each call. * Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'. */ QueryResult::View newGetMore(OperationContext* txn, const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized, bool fromDBDirectClient) { // For testing, we may want to fail if we receive a getmore. if (MONGO_FAIL_POINT(failReceivedGetmore)) { invariant(0); } exhaust = false; // This is a read lock. const NamespaceString nss(ns); scoped_ptr<AutoGetCollectionForRead> ctx(new AutoGetCollectionForRead(txn, nss)); Collection* collection = ctx->getCollection(); uassert( 17356, "collection dropped between getMore calls", collection ); QLOG() << "Running getMore, cursorid: " << cursorid << endl; // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), the only state where // reads are allowed is PRIMARY (or master in master/slave). This function uasserts if // reads are not okay. Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, nss, true); uassertStatusOK(status); // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. ClientCursorPin ccPin(collection, cursorid); ClientCursor* cc = ccPin.c(); // If we're not being called from DBDirectClient we want to associate the RecoveryUnit // used to create the execution machinery inside the cursor with our OperationContext. // If we throw or otherwise exit this method in a disorderly fashion, we must ensure // that further calls to getMore won't fail, and that the provided OperationContext // has a valid RecoveryUnit. As such, we use RAII to accomplish this. // // This must be destroyed before the ClientCursor is destroyed. std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper; // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; const int InitialBufSize = 512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(InitialBufSize); bb.skip(sizeof(QueryResult::Value)); if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Quote: check for spoofing of the ns such that it does not match the one originally // there for the cursor uassert(17011, "auth error", str::equals(ns, cc->ns().c_str())); *isCursorAuthorized = true; // Restore the RecoveryUnit if we need to. if (fromDBDirectClient) { if (cc->hasRecoveryUnit()) invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit()); } else { if (!cc->hasRecoveryUnit()) { // Start using a new RecoveryUnit cc->setOwnedRecoveryUnit( getGlobalEnvironment()->getGlobalStorageEngine()->newRecoveryUnit(txn)); } // Swap RecoveryUnit(s) between the ClientCursor and OperationContext. ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn)); } // Reset timeout timer on the cursor since the cursor is still in use. cc->setIdleTime(0); // TODO: fail point? // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if (0 == pass) { cc->updateSlaveLocation(txn, curop); } if (cc->isAggCursor) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } CollectionMetadataPtr collMetadata = cc->getCollMetadata(); // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); // What gives us results. PlanExecutor* exec = cc->getExecutor(); const int queryOptions = cc->queryOptions(); // Get results out of the executor. exec->restoreState(txn); BSONObj obj; PlanExecutor::ExecState state; while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (queryOptions & QueryOption_OplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } if ((ntoreturn && numResults >= ntoreturn) || bb.len() > MaxBytesToReturnToClientAtOnce) { break; } } // We save the client cursor when there might be more results, and hence we may receive // another getmore. If we receive a EOF or an error, or 'exec' is dead, then we know // that we will not be producing more results. We indicate that the cursor is closed by // sending a cursorId of 0 back to the client. // // On the other hand, if we retrieve all results necessary for this batch, then // 'saveClientCursor' is true and we send a valid cursorId back to the client. In // this case, there may or may not actually be more results (for example, the next call // to getNext(...) might just return EOF). bool saveClientCursor = false; if (PlanExecutor::DEAD == state || PlanExecutor::EXEC_ERROR == state) { // Propagate this error to caller. if (PlanExecutor::EXEC_ERROR == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17406, "getMore executor error: " + WorkingSetCommon::toStatusString(obj)); } // If we're dead there's no way to get more results. saveClientCursor = false; // In the old system tailable capped cursors would be killed off at the // cursorid level. If a tailable capped cursor is nuked the cursorid // would vanish. // // In the new system they die and are cleaned up later (or time out). // So this is where we get to remove the cursorid. if (0 == numResults) { resultFlags = ResultFlag_CursorNotFound; } } else if (PlanExecutor::IS_EOF == state) { // EOF is also end of the line unless it's tailable. saveClientCursor = queryOptions & QueryOption_CursorTailable; } else { verify(PlanExecutor::ADVANCED == state); saveClientCursor = true; } if (!saveClientCursor) { ruSwapper.reset(); ccPin.deleteUnderlying(); // cc is now invalid, as is the executor cursorid = 0; cc = NULL; QLOG() << "getMore NOT saving client cursor, ended with state " << PlanExecutor::statestr(state) << endl; } else { // Continue caching the ClientCursor. cc->incPos(numResults); exec->saveState(); QLOG() << "getMore saving client cursor ended with state " << PlanExecutor::statestr(state) << endl; if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) { if (!fromDBDirectClient) { // Don't stash the RU. Get a new one on the next getMore. ruSwapper.reset(); delete cc->releaseOwnedRecoveryUnit(); } if ((queryOptions & QueryOption_AwaitData) && (numResults == 0) && (pass < 1000)) { // Bubble up to the AwaitData handling code in receivedGetMore which will // try again. return NULL; } } // Possibly note slave's position in the oplog. if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = (queryOptions & QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } } QueryResult::View qr = bb.buf(); qr.msgdata().setLen(bb.len()); qr.msgdata().setOperation(opReply); qr.setResultFlags(resultFlags); qr.setCursorId(cursorid); qr.setStartingFrom(startingResult); qr.setNReturned(numResults); bb.decouple(); QLOG() << "getMore returned " << numResults << " results\n"; return qr; }
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) { NamespaceString ns( dbname, cmdObj[name].String() ); AutoGetCollectionForRead ctx(txn, ns.ns()); Collection* collection = ctx.getCollection(); if ( !collection ) return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, str::stream() << "ns does not exist: " << ns.ns() ) ); size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() ); if ( numCursors == 0 || numCursors > 10000 ) return appendCommandStatus( result, Status( ErrorCodes::BadValue, str::stream() << "numCursors has to be between 1 and 10000" << " was: " << numCursors ) ); OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn)); if (iterators.size() < numCursors) { numCursors = iterators.size(); } OwnedPointerVector<PlanExecutor> execs; for ( size_t i = 0; i < numCursors; i++ ) { WorkingSet* ws = new WorkingSet(); MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection); PlanExecutor* rawExec; // Takes ownership of 'ws' and 'mis'. Status execStatus = PlanExecutor::make(txn, ws, mis, collection, PlanExecutor::YIELD_AUTO, &rawExec); invariant(execStatus.isOK()); auto_ptr<PlanExecutor> curExec(rawExec); // The PlanExecutor was registered on construction due to the YIELD_AUTO policy. // We have to deregister it, as it will be registered with ClientCursor. curExec->deregisterExec(); // Need to save state while yielding locks between now and getMore(). curExec->saveState(); execs.push_back(curExec.release()); } // transfer iterators to executors using a round-robin distribution. // TODO consider using a common work queue once invalidation issues go away. for (size_t i = 0; i < iterators.size(); i++) { PlanExecutor* theExec = execs[i % execs.size()]; MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage()); // This wasn't called above as they weren't assigned yet iterators[i]->saveState(); mis->addIterator(iterators.releaseAt(i)); } { BSONArrayBuilder bucketsBuilder; for (size_t i = 0; i < execs.size(); i++) { // transfer ownership of an executor to the ClientCursor (which manages its own // lifetime). ClientCursor* cc = new ClientCursor( collection->getCursorManager(), execs.releaseAt(i), ns.ns() ); BSONObjBuilder threadResult; appendCursorResponseObject( cc->cursorid(), ns.ns(), BSONArray(), &threadResult ); threadResult.appendBool( "ok", 1 ); bucketsBuilder.append( threadResult.obj() ); } result.appendArray( "cursors", bucketsBuilder.obj() ); } return true; }
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) { NamespaceString ns( dbname, cmdObj[name].String() ); AutoGetCollectionForRead ctx(txn, ns.ns()); Collection* collection = ctx.getCollection(); if ( !collection ) return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, str::stream() << "ns does not exist: " << ns.ns() ) ); size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() ); if ( numCursors == 0 || numCursors > 10000 ) return appendCommandStatus( result, Status( ErrorCodes::BadValue, str::stream() << "numCursors has to be between 1 and 10000" << " was: " << numCursors ) ); OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn)); if (iterators.size() < numCursors) { numCursors = iterators.size(); } OwnedPointerVector<PlanExecutor> execs; for ( size_t i = 0; i < numCursors; i++ ) { WorkingSet* ws = new WorkingSet(); MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection); // Takes ownership of 'ws' and 'mis'. auto_ptr<PlanExecutor> curExec(new PlanExecutor(txn, ws, mis, collection)); // Each of the plan executors should yield automatically. We pass "false" to // indicate that 'curExec' should not register itself, as it will get registered // by ClientCursor instead. curExec->setYieldPolicy(PlanExecutor::YIELD_AUTO, false); // Need to save state while yielding locks between now and newGetMore. curExec->saveState(); execs.push_back(curExec.release()); } // transfer iterators to executors using a round-robin distribution. // TODO consider using a common work queue once invalidation issues go away. for (size_t i = 0; i < iterators.size(); i++) { PlanExecutor* theExec = execs[i % execs.size()]; MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage()); mis->addIterator(iterators.releaseAt(i)); } { BSONArrayBuilder bucketsBuilder; for (size_t i = 0; i < execs.size(); i++) { // transfer ownership of an executor to the ClientCursor (which manages its own // lifetime). ClientCursor* cc = new ClientCursor( collection, execs.releaseAt(i) ); // we are mimicking the aggregation cursor output here // that is why there are ns, ok and empty firstBatch BSONObjBuilder threadResult; { BSONObjBuilder cursor; cursor.appendArray( "firstBatch", BSONObj() ); cursor.append( "ns", ns ); cursor.append( "id", cc->cursorid() ); threadResult.append( "cursor", cursor.obj() ); } threadResult.appendBool( "ok", 1 ); bucketsBuilder.append( threadResult.obj() ); } result.appendArray( "cursors", bucketsBuilder.obj() ); } return true; }
/** * Called by db/instance.cpp. This is the getMore entry point. * * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls * when this method returns an empty result, incrementing pass on each call. * Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'. */ QueryResult::View getMore(OperationContext* txn, const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized) { // For testing, we may want to fail if we receive a getmore. if (MONGO_FAIL_POINT(failReceivedGetmore)) { invariant(0); } exhaust = false; const NamespaceString nss(ns); // Depending on the type of cursor being operated on, we hold locks for the whole getMore, // or none of the getMore, or part of the getMore. The three cases in detail: // // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore. // 2) Cursor owned by global cursor manager: we don't lock anything. These cursors don't // own any collection state. // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and // "unpinCollLock". This is because agg cursors handle locking internally (hence the // release), but the pin and unpin of the cursor must occur under the collection lock. // We don't use our AutoGetCollectionForRead "ctx" to relock, because // AutoGetCollectionForRead checks the sharding version (and we want the relock for the // unpin to succeed even if the sharding version has changed). // // Note that we declare our locks before our ClientCursorPin, in order to ensure that the // pin's destructor is called before the lock destructors (so that the unpin occurs under // the lock). boost::scoped_ptr<AutoGetCollectionForRead> ctx; boost::scoped_ptr<Lock::DBLock> unpinDBLock; boost::scoped_ptr<Lock::CollectionLock> unpinCollLock; CursorManager* cursorManager; CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager(); if (globalCursorManager->ownsCursorId(cursorid)) { cursorManager = globalCursorManager; } else { ctx.reset(new AutoGetCollectionForRead(txn, nss)); Collection* collection = ctx->getCollection(); uassert( 17356, "collection dropped between getMore calls", collection ); cursorManager = collection->getCursorManager(); } LOG(5) << "Running getMore, cursorid: " << cursorid << endl; // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), the only state where // reads are allowed is PRIMARY (or master in master/slave). This function uasserts if // reads are not okay. Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, nss, true); uassertStatusOK(status); // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. ClientCursorPin ccPin(cursorManager, cursorid); ClientCursor* cc = ccPin.c(); // If we're not being called from DBDirectClient we want to associate the RecoveryUnit // used to create the execution machinery inside the cursor with our OperationContext. // If we throw or otherwise exit this method in a disorderly fashion, we must ensure // that further calls to getMore won't fail, and that the provided OperationContext // has a valid RecoveryUnit. As such, we use RAII to accomplish this. // // This must be destroyed before the ClientCursor is destroyed. std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper; // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; const int InitialBufSize = 512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(InitialBufSize); bb.skip(sizeof(QueryResult::Value)); if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Check for spoofing of the ns such that it does not match the one originally // there for the cursor. uassert(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace " << ns << ", but cursor " << cursorid << " belongs to namespace " << cc->ns(), ns == cc->ns()); *isCursorAuthorized = true; // Restore the RecoveryUnit if we need to. if (txn->getClient()->isInDirectClient()) { if (cc->hasRecoveryUnit()) invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit()); } else { if (!cc->hasRecoveryUnit()) { // Start using a new RecoveryUnit cc->setOwnedRecoveryUnit( getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit()); } // Swap RecoveryUnit(s) between the ClientCursor and OperationContext. ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn)); } // Reset timeout timer on the cursor since the cursor is still in use. cc->setIdleTime(0); // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if (0 == pass) { cc->updateSlaveLocation(txn); } if (cc->isAggCursor()) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); // What gives us results. PlanExecutor* exec = cc->getExecutor(); const int queryOptions = cc->queryOptions(); // Get results out of the executor. exec->restoreState(txn); BSONObj obj; PlanExecutor::ExecState state; while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (queryOptions & QueryOption_OplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || bsonTimestamp == e.type()) { slaveReadTill = e.timestamp(); } } if (enoughForGetMore(ntoreturn, numResults, bb.len())) { break; } } if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) { // Propagate this error to caller. if (PlanExecutor::FAILURE == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17406, "getMore executor error: " + WorkingSetCommon::toStatusString(obj)); } // In the old system tailable capped cursors would be killed off at the // cursorid level. If a tailable capped cursor is nuked the cursorid // would vanish. // // In the new system they die and are cleaned up later (or time out). // So this is where we get to remove the cursorid. if (0 == numResults) { resultFlags = ResultFlag_CursorNotFound; } } const bool shouldSaveCursor = shouldSaveCursorGetMore(state, exec, isCursorTailable(cc)); // In order to deregister a cursor, we need to be holding the DB + collection lock and // if the cursor is aggregation, we release these locks. if (cc->isAggCursor()) { invariant(NULL == ctx.get()); unpinDBLock.reset(new Lock::DBLock(txn->lockState(), nss.db(), MODE_IS)); unpinCollLock.reset(new Lock::CollectionLock(txn->lockState(), nss.ns(), MODE_IS)); } // Our two possible ClientCursorPin cleanup paths are: // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin. // 2) If the cursor is going to be saved, we simply let the pin go out of scope. In // this case, the pin's destructor will be invoked, which will call release() on the // pin. Because our ClientCursorPin is declared after our lock is declared, this // will happen under the lock. if (!shouldSaveCursor) { ruSwapper.reset(); ccPin.deleteUnderlying(); // cc is now invalid, as is the executor cursorid = 0; cc = NULL; curop.debug().cursorExhausted = true; LOG(5) << "getMore NOT saving client cursor, ended with state " << PlanExecutor::statestr(state) << endl; } else { // Continue caching the ClientCursor. cc->incPos(numResults); exec->saveState(); LOG(5) << "getMore saving client cursor ended with state " << PlanExecutor::statestr(state) << endl; if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) { if (!txn->getClient()->isInDirectClient()) { // Don't stash the RU. Get a new one on the next getMore. ruSwapper->dismiss(); } if ((queryOptions & QueryOption_AwaitData) && (numResults == 0) && (pass < 1000)) { // Bubble up to the AwaitData handling code in receivedGetMore which will // try again. return NULL; } } // Possibly note slave's position in the oplog. if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = (queryOptions & QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } } QueryResult::View qr = bb.buf(); qr.msgdata().setLen(bb.len()); qr.msgdata().setOperation(opReply); qr.setResultFlags(resultFlags); qr.setCursorId(cursorid); qr.setStartingFrom(startingResult); qr.setNReturned(numResults); bb.decouple(); LOG(5) << "getMore returned " << numResults << " results\n"; return qr; }
/** * Called by db/instance.cpp. This is the getMore entry point. */ QueryResult::View getMore(OperationContext* txn, const char* ns, int ntoreturn, long long cursorid, bool* exhaust, bool* isCursorAuthorized) { invariant(ntoreturn >= 0); CurOp& curop = *CurOp::get(txn); // For testing, we may want to fail if we receive a getmore. if (MONGO_FAIL_POINT(failReceivedGetmore)) { invariant(0); } *exhaust = false; const NamespaceString nss(ns); // Depending on the type of cursor being operated on, we hold locks for the whole getMore, // or none of the getMore, or part of the getMore. The three cases in detail: // // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore. // 2) Cursor owned by global cursor manager: we don't lock anything. These cursors don't own // any collection state. These cursors are generated either by the listCollections or // listIndexes commands, as these special cursor-generating commands operate over catalog // data rather than targeting the data within a collection. // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and // "unpinCollLock". This is because agg cursors handle locking internally (hence the // release), but the pin and unpin of the cursor must occur under the collection lock. // We don't use our AutoGetCollectionForRead "ctx" to relock, because // AutoGetCollectionForRead checks the sharding version (and we want the relock for the // unpin to succeed even if the sharding version has changed). // // Note that we declare our locks before our ClientCursorPin, in order to ensure that the // pin's destructor is called before the lock destructors (so that the unpin occurs under // the lock). unique_ptr<AutoGetCollectionForRead> ctx; unique_ptr<Lock::DBLock> unpinDBLock; unique_ptr<Lock::CollectionLock> unpinCollLock; CursorManager* cursorManager; if (nss.isListIndexesCursorNS() || nss.isListCollectionsCursorNS()) { // List collections and list indexes are special cursor-generating commands whose // cursors are managed globally, as they operate over catalog data rather than targeting // the data within a collection. cursorManager = CursorManager::getGlobalCursorManager(); } else { ctx = stdx::make_unique<AutoGetCollectionForRead>(txn, nss); Collection* collection = ctx->getCollection(); uassert(17356, "collection dropped between getMore calls", collection); cursorManager = collection->getCursorManager(); } LOG(5) << "Running getMore, cursorid: " << cursorid << endl; // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), the only state where // reads are allowed is PRIMARY (or master in master/slave). This function uasserts if // reads are not okay. Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, true); uassertStatusOK(status); // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. ClientCursorPin ccPin(cursorManager, cursorid); ClientCursor* cc = ccPin.c(); // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; const int InitialBufSize = 512 + sizeof(QueryResult::Value) + FindCommon::kMaxBytesToReturnToClientAtOnce; BufBuilder bb(InitialBufSize); bb.skip(sizeof(QueryResult::Value)); if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Check for spoofing of the ns such that it does not match the one originally // there for the cursor. uassert(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace " << ns << ", but cursor " << cursorid << " belongs to namespace " << cc->ns(), ns == cc->ns()); *isCursorAuthorized = true; if (cc->isReadCommitted()) uassertStatusOK(txn->recoveryUnit()->setReadFromMajorityCommittedSnapshot()); // Reset timeout timer on the cursor since the cursor is still in use. cc->setIdleTime(0); // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // Ensure that the original query or command object is available in the slow query log, // profiler, and currentOp. curop.debug().query = cc->getQuery(); { stdx::lock_guard<Client> lk(*txn->getClient()); curop.setQuery_inlock(cc->getQuery()); } cc->updateSlaveLocation(txn); if (cc->isAggCursor()) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); uint64_t notifierVersion = 0; std::shared_ptr<CappedInsertNotifier> notifier; if (isCursorAwaitData(cc)) { invariant(ctx->getCollection()->isCapped()); // Retrieve the notifier which we will wait on until new data arrives. We make sure // to do this in the lock because once we drop the lock it is possible for the // collection to become invalid. The notifier itself will outlive the collection if // the collection is dropped, as we keep a shared_ptr to it. notifier = ctx->getCollection()->getCappedInsertNotifier(); // Must get the version before we call generateBatch in case a write comes in after // that call and before we call wait on the notifier. notifierVersion = notifier->getVersion(); } PlanExecutor* exec = cc->getExecutor(); exec->reattachToOperationContext(txn); exec->restoreState(); PlanExecutor::ExecState state; generateBatch(ntoreturn, cc, &bb, &numResults, &slaveReadTill, &state); // If this is an await data cursor, and we hit EOF without generating any results, then // we block waiting for new data to arrive. if (isCursorAwaitData(cc) && state == PlanExecutor::IS_EOF && numResults == 0) { // Save the PlanExecutor and drop our locks. exec->saveState(); ctx.reset(); // Block waiting for data for up to 1 second. Seconds timeout(1); notifier->wait(notifierVersion, timeout); notifier.reset(); // Set expected latency to match wait time. This makes sure the logs aren't spammed // by awaitData queries that exceed slowms due to blocking on the CappedInsertNotifier. curop.setExpectedLatencyMs(durationCount<Milliseconds>(timeout)); // Reacquiring locks. ctx = make_unique<AutoGetCollectionForRead>(txn, nss); exec->restoreState(); // We woke up because either the timed_wait expired, or there was more data. Either // way, attempt to generate another batch of results. generateBatch(ntoreturn, cc, &bb, &numResults, &slaveReadTill, &state); } // We have to do this before re-acquiring locks in the agg case because // shouldSaveCursorGetMore() can make a network call for agg cursors. // // TODO: Getting rid of PlanExecutor::isEOF() in favor of PlanExecutor::IS_EOF would mean // that this network operation is no longer necessary. const bool shouldSaveCursor = shouldSaveCursorGetMore(state, exec, isCursorTailable(cc)); // In order to deregister a cursor, we need to be holding the DB + collection lock and // if the cursor is aggregation, we release these locks. if (cc->isAggCursor()) { invariant(NULL == ctx.get()); unpinDBLock = make_unique<Lock::DBLock>(txn->lockState(), nss.db(), MODE_IS); unpinCollLock = make_unique<Lock::CollectionLock>(txn->lockState(), nss.ns(), MODE_IS); } // Our two possible ClientCursorPin cleanup paths are: // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin. // 2) If the cursor is going to be saved, we simply let the pin go out of scope. In // this case, the pin's destructor will be invoked, which will call release() on the // pin. Because our ClientCursorPin is declared after our lock is declared, this // will happen under the lock. if (!shouldSaveCursor) { ccPin.deleteUnderlying(); // cc is now invalid, as is the executor cursorid = 0; cc = NULL; curop.debug().cursorExhausted = true; LOG(5) << "getMore NOT saving client cursor, ended with state " << PlanExecutor::statestr(state) << endl; } else { // Continue caching the ClientCursor. cc->incPos(numResults); exec->saveState(); exec->detachFromOperationContext(); LOG(5) << "getMore saving client cursor ended with state " << PlanExecutor::statestr(state) << endl; // Possibly note slave's position in the oplog. if ((cc->queryOptions() & QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } *exhaust = cc->queryOptions() & QueryOption_Exhaust; // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } } QueryResult::View qr = bb.buf(); qr.msgdata().setLen(bb.len()); qr.msgdata().setOperation(opReply); qr.setResultFlags(resultFlags); qr.setCursorId(cursorid); qr.setStartingFrom(startingResult); qr.setNReturned(numResults); bb.decouple(); LOG(5) << "getMore returned " << numResults << " results\n"; return qr; }
/** * Runs a query using the following steps: * 1) Parsing. * 2) Acquire locks. * 3) Plan query, obtaining an executor that can run it. * 4) Setup a cursor for the query, which may be used on subsequent getMores. * 5) Generate the first batch. * 6) Save state for getMore. * 7) Generate response to send to the client. * * TODO: Rather than using the sharding version available in thread-local storage (i.e. the * call to ShardingState::needCollectionMetadata() below), shard version information * should be passed as part of the command parameter. */ bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { const std::string fullns = parseNs(dbname, cmdObj); const NamespaceString nss(fullns); if (!nss.isValid()) { return appendCommandStatus(result, {ErrorCodes::InvalidNamespace, str::stream() << "Invalid collection name: " << nss.ns()}); } // Although it is a command, a find command gets counted as a query. globalOpCounters.gotQuery(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus( result, Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()")); } // 1a) Parse the command BSON to a LiteParsedQuery. const bool isExplain = false; auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain); if (!lpqStatus.isOK()) { return appendCommandStatus(result, lpqStatus.getStatus()); } auto& lpq = lpqStatus.getValue(); // Validate term, if provided. if (auto term = lpq->getReplicationTerm()) { auto replCoord = repl::ReplicationCoordinator::get(txn); Status status = replCoord->updateTerm(*term); // Note: updateTerm returns ok if term stayed the same. if (!status.isOK()) { return appendCommandStatus(result, status); } } // Fill out curop information. long long ntoreturn = lpq->getBatchSize().value_or(0); beginQueryOp(txn, nss, cmdObj, ntoreturn, lpq->getSkip()); // 1b) Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery. WhereCallbackReal whereCallback(txn, nss.db()); auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), whereCallback); if (!statusWithCQ.isOK()) { return appendCommandStatus(result, statusWithCQ.getStatus()); } std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); // 2) Acquire locks. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; ShardingState* const shardingState = ShardingState::get(txn); // It is possible that the sharding version will change during yield while we are // retrieving a plan executor. If this happens we will throw an error and mongos will // retry. const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns()); // 3) Get the execution plan for the query. auto statusWithPlanExecutor = getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); // TODO: Currently, chunk ranges are kept around until all ClientCursors created while // the chunk belonged on this node are gone. Separating chunk lifetime management from // ClientCursor should allow this check to go away. if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // Version changed while retrieving a PlanExecutor. Terminate the operation, // signaling that mongos should retry. throw SendStaleConfigException(nss.ns(), "version changed during find command", shardingVersionAtStart, shardingState->getVersion(nss.ns())); } if (!collection) { // No collection. Just fill out curop indicating that there were zero results and // there is no ClientCursor id, and then return. const long long numResults = 0; const CursorId cursorId = 0; endQueryOp(txn, *exec, dbProfilingLevel, numResults, cursorId); appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result); return true; } const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed(); // 4) If possible, register the execution plan inside a ClientCursor, and pin that // cursor. In this case, ownership of the PlanExecutor is transferred to the // ClientCursor, and 'exec' becomes null. // // First unregister the PlanExecutor so it can be re-registered with ClientCursor. exec->deregisterExec(); // Create a ClientCursor containing this plan executor. We don't have to worry // about leaking it as it's inserted into a global map by its ctor. ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), pq.getOptions(), pq.getFilter()); CursorId cursorId = cursor->cursorid(); ClientCursorPin ccPin(collection->getCursorManager(), cursorId); // On early return, get rid of the the cursor. ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin); invariant(!exec); PlanExecutor* cursorExec = cursor->getExecutor(); // 5) Stream query results, adding them to a BSONArray as we go. BSONArrayBuilder firstBatch; BSONObj obj; PlanExecutor::ExecState state; long long numResults = 0; while (!enoughForFirstBatch(pq, numResults, firstBatch.len()) && PlanExecutor::ADVANCED == (state = cursorExec->getNext(&obj, NULL))) { // If adding this object will cause us to exceed the BSON size limit, then we stash // it for later. if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) { cursorExec->enqueue(obj); break; } // Add result to output buffer. firstBatch.append(obj); numResults++; } // Throw an assertion if query execution fails for any reason. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { const std::unique_ptr<PlanStageStats> stats(cursorExec->getStats()); error() << "Plan executor error during find command: " << PlanExecutor::statestr(state) << ", stats: " << Explain::statsToBSON(*stats); return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, str::stream() << "Executor error during find command: " << WorkingSetCommon::toStatusString(obj))); } // 6) Set up the cursor for getMore. if (shouldSaveCursor(txn, collection, state, cursorExec)) { // State will be restored on getMore. cursorExec->saveState(); cursorExec->detachFromOperationContext(); cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros()); cursor->setPos(numResults); } else { cursorId = 0; } // Fill out curop based on the results. endQueryOp(txn, *cursorExec, dbProfilingLevel, numResults, cursorId); // 7) Generate the response object to send to the client. appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result); if (cursorId) { cursorFreer.Dismiss(); } return true; }
/** * Called by db/instance.cpp. This is the getMore entry point. */ Message getMore(OperationContext* opCtx, const char* ns, int ntoreturn, long long cursorid, bool* exhaust, bool* isCursorAuthorized) { invariant(ntoreturn >= 0); CurOp& curOp = *CurOp::get(opCtx); curOp.ensureStarted(); // For testing, we may want to fail if we receive a getmore. if (MONGO_FAIL_POINT(failReceivedGetmore)) { MONGO_UNREACHABLE; } *exhaust = false; const NamespaceString nss(ns); // Cursors come in one of two flavors: // - Cursors owned by the collection cursor manager, such as those generated via the find // command. For these cursors, we hold the appropriate collection lock for the duration of the // getMore using AutoGetCollectionForRead. // - Cursors owned by the global cursor manager, such as those generated via the aggregate // command. These cursors either hold no collection state or manage their collection state // internally, so we acquire no locks. // // While we only need to acquire locks in the case of a cursor which is *not* globally owned, we // need to create an AutoStatsTracker in either case. This is responsible for updating // statistics in CurOp and Top. We avoid using AutoGetCollectionForReadCommand because we may // need to drop and reacquire locks when the cursor is awaitData, but we don't want to update // the stats twice. // // Note that we acquire our locks before our ClientCursorPin, in order to ensure that the pin's // destructor is called before the lock's destructor (if there is one) so that the cursor // cleanup can occur under the lock. UninterruptibleLockGuard noInterrupt(opCtx->lockState()); boost::optional<AutoGetCollectionForRead> readLock; boost::optional<AutoStatsTracker> statsTracker; CursorManager* cursorManager; if (CursorManager::isGloballyManagedCursor(cursorid)) { cursorManager = CursorManager::getGlobalCursorManager(); if (boost::optional<NamespaceString> nssForCurOp = nss.isGloballyManagedNamespace() ? nss.getTargetNSForGloballyManagedNamespace() : nss) { AutoGetDb autoDb(opCtx, nssForCurOp->db(), MODE_IS); const auto profilingLevel = autoDb.getDb() ? boost::optional<int>{autoDb.getDb()->getProfilingLevel()} : boost::none; statsTracker.emplace(opCtx, *nssForCurOp, Top::LockType::NotLocked, profilingLevel); auto view = autoDb.getDb() ? autoDb.getDb()->getViewCatalog()->lookup(opCtx, nssForCurOp->ns()) : nullptr; uassert( ErrorCodes::CommandNotSupportedOnView, str::stream() << "Namespace " << nssForCurOp->ns() << " is a view. OP_GET_MORE operations are not supported on views. " << "Only clients which support the getMore command can be used to " "query views.", !view); } } else { readLock.emplace(opCtx, nss); const int doNotChangeProfilingLevel = 0; statsTracker.emplace(opCtx, nss, Top::LockType::ReadLocked, readLock->getDb() ? readLock->getDb()->getProfilingLevel() : doNotChangeProfilingLevel); Collection* collection = readLock->getCollection(); uassert( ErrorCodes::OperationFailed, "collection dropped between getMore calls", collection); cursorManager = collection->getCursorManager(); // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), we allow reads // whether we are PRIMARY or SECONDARY. uassertStatusOK( repl::ReplicationCoordinator::get(opCtx)->checkCanServeReadsFor(opCtx, nss, true)); } LOG(5) << "Running getMore, cursorid: " << cursorid; // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. auto ccPin = cursorManager->pinCursor(opCtx, cursorid); // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; const int InitialBufSize = 512 + sizeof(QueryResult::Value) + FindCommon::kMaxBytesToReturnToClientAtOnce; BufBuilder bb(InitialBufSize); bb.skip(sizeof(QueryResult::Value)); if (!ccPin.isOK()) { if (ccPin == ErrorCodes::CursorNotFound) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { uassertStatusOK(ccPin.getStatus()); } } else { ClientCursor* cc = ccPin.getValue().getCursor(); // Check for spoofing of the ns such that it does not match the one originally // there for the cursor. uassert(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace " << ns << ", but cursor " << cursorid << " belongs to namespace " << cc->nss().ns(), nss == cc->nss()); // A user can only call getMore on their own cursor. If there were multiple users // authenticated when the cursor was created, then at least one of them must be // authenticated in order to run getMore on the cursor. uassert(ErrorCodes::Unauthorized, str::stream() << "cursor id " << cursorid << " was not created by the authenticated user", AuthorizationSession::get(opCtx->getClient()) ->isCoauthorizedWith(cc->getAuthenticatedUsers())); *isCursorAuthorized = true; const auto replicationMode = repl::ReplicationCoordinator::get(opCtx)->getReplicationMode(); opCtx->recoveryUnit()->setReadConcernLevelAndReplicationMode(cc->getReadConcernLevel(), replicationMode); // TODO SERVER-33698: Remove kSnapshotReadConcern clause once we can guarantee that a // readConcern level snapshot getMore will have an established point-in-time WiredTiger // snapshot. if (replicationMode == repl::ReplicationCoordinator::modeReplSet && (cc->getReadConcernLevel() == repl::ReadConcernLevel::kMajorityReadConcern || cc->getReadConcernLevel() == repl::ReadConcernLevel::kSnapshotReadConcern)) { uassertStatusOK(opCtx->recoveryUnit()->obtainMajorityCommittedSnapshot()); } uassert(40548, "OP_GET_MORE operations are not supported on tailable aggregations. Only clients " "which support the getMore command can be used on tailable aggregations.", readLock || !cc->isAwaitData()); // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. if (cc->getLeftoverMaxTimeMicros() < Microseconds::max()) { uassert(40136, "Illegal attempt to set operation deadline within DBDirectClient", !opCtx->getClient()->isInDirectClient()); opCtx->setDeadlineAfterNowBy(cc->getLeftoverMaxTimeMicros()); } opCtx->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); uint64_t notifierVersion = 0; std::shared_ptr<CappedInsertNotifier> notifier; if (cc->isAwaitData()) { invariant(readLock->getCollection()->isCapped()); // Retrieve the notifier which we will wait on until new data arrives. We make sure // to do this in the lock because once we drop the lock it is possible for the // collection to become invalid. The notifier itself will outlive the collection if // the collection is dropped, as we keep a shared_ptr to it. notifier = readLock->getCollection()->getCappedInsertNotifier(); // Must get the version before we call generateBatch in case a write comes in after // that call and before we call wait on the notifier. notifierVersion = notifier->getVersion(); } PlanExecutor* exec = cc->getExecutor(); exec->reattachToOperationContext(opCtx); uassertStatusOK(exec->restoreState()); auto planSummary = Explain::getPlanSummary(exec); { stdx::lock_guard<Client> lk(*opCtx->getClient()); curOp.setPlanSummary_inlock(planSummary); // Ensure that the original query object is available in the slow query log, profiler // and currentOp. Upconvert _query to resemble a getMore command, and set the original // command or upconverted legacy query in the originatingCommand field. curOp.setOpDescription_inlock(upconvertGetMoreEntry(nss, cursorid, ntoreturn)); curOp.setOriginatingCommand_inlock(cc->getOriginatingCommandObj()); } PlanExecutor::ExecState state; // We report keysExamined and docsExamined to OpDebug for a given getMore operation. To // obtain these values we need to take a diff of the pre-execution and post-execution // metrics, as they accumulate over the course of a cursor's lifetime. PlanSummaryStats preExecutionStats; Explain::getSummaryStats(*exec, &preExecutionStats); generateBatch(ntoreturn, cc, &bb, &numResults, &state); // If this is an await data cursor, and we hit EOF without generating any results, then // we block waiting for new data to arrive. if (cc->isAwaitData() && state == PlanExecutor::IS_EOF && numResults == 0) { // Save the PlanExecutor and drop our locks. exec->saveState(); readLock.reset(); // Block waiting for data for up to 1 second. Time spent blocking is not counted towards // the total operation latency. curOp.pauseTimer(); Seconds timeout(1); notifier->waitUntil(notifierVersion, opCtx->getServiceContext()->getPreciseClockSource()->now() + timeout); notifier.reset(); curOp.resumeTimer(); // Reacquiring locks. readLock.emplace(opCtx, nss); uassertStatusOK(exec->restoreState()); // We woke up because either the timed_wait expired, or there was more data. Either // way, attempt to generate another batch of results. generateBatch(ntoreturn, cc, &bb, &numResults, &state); } PlanSummaryStats postExecutionStats; Explain::getSummaryStats(*exec, &postExecutionStats); postExecutionStats.totalKeysExamined -= preExecutionStats.totalKeysExamined; postExecutionStats.totalDocsExamined -= preExecutionStats.totalDocsExamined; curOp.debug().setPlanSummaryMetrics(postExecutionStats); // We do not report 'execStats' for aggregation or other globally managed cursors, both in // the original request and subsequent getMore. It would be useful to have this information // for an aggregation, but the source PlanExecutor could be destroyed before we know whether // we need execStats and we do not want to generate for all operations due to cost. if (!CursorManager::isGloballyManagedCursor(cursorid) && curOp.shouldDBProfile()) { BSONObjBuilder execStatsBob; Explain::getWinningPlanStats(exec, &execStatsBob); curOp.debug().execStats = execStatsBob.obj(); } // Our two possible ClientCursorPin cleanup paths are: // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin. // 2) If the cursor is going to be saved, we simply let the pin go out of scope. In this // case, the pin's destructor will be invoked, which will call release() on the pin. // Because our ClientCursorPin is declared after our lock is declared, this will happen // under the lock if any locking was necessary. if (!shouldSaveCursorGetMore(state, exec, cc->isTailable())) { ccPin.getValue().deleteUnderlying(); // cc is now invalid, as is the executor cursorid = 0; cc = nullptr; curOp.debug().cursorExhausted = true; LOG(5) << "getMore NOT saving client cursor, ended with state " << PlanExecutor::statestr(state); } else { // Continue caching the ClientCursor. cc->incPos(numResults); exec->saveState(); exec->detachFromOperationContext(); LOG(5) << "getMore saving client cursor ended with state " << PlanExecutor::statestr(state); *exhaust = cc->queryOptions() & QueryOption_Exhaust; // We assume that cursors created through a DBDirectClient are always used from their // original OperationContext, so we do not need to move time to and from the cursor. if (!opCtx->getClient()->isInDirectClient()) { // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros(opCtx->getRemainingMaxTimeMicros()); } } } QueryResult::View qr = bb.buf(); qr.msgdata().setLen(bb.len()); qr.msgdata().setOperation(opReply); qr.setResultFlags(resultFlags); qr.setCursorId(cursorid); qr.setStartingFrom(startingResult); qr.setNReturned(numResults); LOG(5) << "getMore returned " << numResults << " results\n"; return Message(bb.release()); }
/** * Runs a query using the following steps: * --Parsing. * --Acquire locks. * --Plan query, obtaining an executor that can run it. * --Generate the first batch. * --Save state for getMore, transferring ownership of the executor to a ClientCursor. * --Generate response to send to the client. */ bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { const std::string fullns = parseNs(dbname, cmdObj); const NamespaceString nss(fullns); if (!nss.isValid() || nss.isCommand() || nss.isSpecialCommand()) { return appendCommandStatus(result, {ErrorCodes::InvalidNamespace, str::stream() << "Invalid collection name: " << nss.ns()}); } // Although it is a command, a find command gets counted as a query. globalOpCounters.gotQuery(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus( result, Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()")); } // Parse the command BSON to a LiteParsedQuery. const bool isExplain = false; auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain); if (!lpqStatus.isOK()) { return appendCommandStatus(result, lpqStatus.getStatus()); } auto& lpq = lpqStatus.getValue(); // Validate term before acquiring locks, if provided. if (auto term = lpq->getReplicationTerm()) { auto replCoord = repl::ReplicationCoordinator::get(txn); Status status = replCoord->updateTerm(txn, *term); // Note: updateTerm returns ok if term stayed the same. if (!status.isOK()) { return appendCommandStatus(result, status); } } // Fill out curop information. // // We pass negative values for 'ntoreturn' and 'ntoskip' to indicate that these values // should be omitted from the log line. Limit and skip information is already present in the // find command parameters, so these fields are redundant. const int ntoreturn = -1; const int ntoskip = -1; beginQueryOp(txn, nss, cmdObj, ntoreturn, ntoskip); // Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery. ExtensionsCallbackReal extensionsCallback(txn, &nss); auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), extensionsCallback); if (!statusWithCQ.isOK()) { return appendCommandStatus(result, statusWithCQ.getStatus()); } std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); ShardingState* const shardingState = ShardingState::get(txn); if (OperationShardVersion::get(txn).hasShardVersion() && shardingState->enabled()) { ChunkVersion receivedVersion = OperationShardVersion::get(txn).getShardVersion(nss); ChunkVersion latestVersion; // Wait for migration completion to get the correct chunk version. const int maxTimeoutSec = 30; int timeoutSec = cq->getParsed().getMaxTimeMS() / 1000; if (!timeoutSec || timeoutSec > maxTimeoutSec) { timeoutSec = maxTimeoutSec; } if (!shardingState->waitTillNotInCriticalSection(timeoutSec)) { uasserted(ErrorCodes::LockTimeout, "Timeout while waiting for migration commit"); } // If the received version is newer than the version cached in 'shardingState', then we // have to refresh 'shardingState' from the config servers. We do this before acquiring // locks so that we don't hold locks while waiting on the network. uassertStatusOK(shardingState->refreshMetadataIfNeeded( txn, nss.ns(), receivedVersion, &latestVersion)); } // Acquire locks. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; // It is possible that the sharding version will change during yield while we are // retrieving a plan executor. If this happens we will throw an error and mongos will // retry. const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns()); // Get the execution plan for the query. auto statusWithPlanExecutor = getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); if (!collection) { // No collection. Just fill out curop indicating that there were zero results and // there is no ClientCursor id, and then return. const long long numResults = 0; const CursorId cursorId = 0; endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, cursorId); appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result); return true; } const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed(); // Stream query results, adding them to a BSONArray as we go. BSONArrayBuilder firstBatch; BSONObj obj; PlanExecutor::ExecState state = PlanExecutor::ADVANCED; long long numResults = 0; while (!FindCommon::enoughForFirstBatch(pq, numResults, firstBatch.len()) && PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If adding this object will cause us to exceed the BSON size limit, then we stash // it for later. if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) { exec->enqueue(obj); break; } // Add result to output buffer. firstBatch.append(obj); numResults++; } // Throw an assertion if query execution fails for any reason. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { const std::unique_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error during find command: " << PlanExecutor::statestr(state) << ", stats: " << Explain::statsToBSON(*stats); return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, str::stream() << "Executor error during find command: " << WorkingSetCommon::toStatusString(obj))); } // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the // chunk belonged on this node are gone. Separating chunk lifetime management from // ClientCursor should allow this check to go away. if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // Version changed while retrieving a PlanExecutor. Terminate the operation, // signaling that mongos should retry. throw SendStaleConfigException(nss.ns(), "version changed during find command", shardingVersionAtStart, shardingState->getVersion(nss.ns())); } // Set up the cursor for getMore. CursorId cursorId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // Register the execution plan inside a ClientCursor. Ownership of the PlanExecutor is // transferred to the ClientCursor. // // First unregister the PlanExecutor so it can be re-registered with ClientCursor. exec->deregisterExec(); // Create a ClientCursor containing this plan executor. We don't have to worry about // leaking it as it's inserted into a global map by its ctor. ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), pq.getOptions(), pq.getFilter()); cursorId = cursor->cursorid(); invariant(!exec); PlanExecutor* cursorExec = cursor->getExecutor(); // State will be restored on getMore. cursorExec->saveState(); cursorExec->detachFromOperationContext(); cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros()); cursor->setPos(numResults); // Fill out curop based on the results. endQueryOp(txn, collection, *cursorExec, dbProfilingLevel, numResults, cursorId); } else { endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, cursorId); } // Generate the response object to send to the client. appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result); return true; }
/** * Generates the next batch of results for a ClientCursor. * * TODO: Do we need to support some equivalent of OP_REPLY responseFlags? * * TODO: Is it possible to support awaitData? */ bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { // Counted as a getMore, not as a command. globalOpCounters.gotGetMore(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus(result, Status(ErrorCodes::IllegalOperation, "Cannot run getMore command from eval()")); } StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj); if (!parseStatus.isOK()) { return appendCommandStatus(result, parseStatus.getStatus()); } const GetMoreRequest& request = parseStatus.getValue(); // Depending on the type of cursor being operated on, we hold locks for the whole // getMore, or none of the getMore, or part of the getMore. The three cases in detail: // // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore. // 2) Cursor owned by global cursor manager: we don't lock anything. These cursors // don't own any collection state. // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and // "unpinCollLock". This is because agg cursors handle locking internally (hence the // release), but the pin and unpin of the cursor must occur under the collection // lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because // AutoGetCollectionForRead checks the sharding version (and we want the relock for // the unpin to succeed even if the sharding version has changed). // // Note that we declare our locks before our ClientCursorPin, in order to ensure that // the pin's destructor is called before the lock destructors (so that the unpin occurs // under the lock). std::unique_ptr<AutoGetCollectionForRead> ctx; std::unique_ptr<Lock::DBLock> unpinDBLock; std::unique_ptr<Lock::CollectionLock> unpinCollLock; CursorManager* cursorManager; CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager(); if (globalCursorManager->ownsCursorId(request.cursorid)) { cursorManager = globalCursorManager; } else { ctx.reset(new AutoGetCollectionForRead(txn, request.nss)); Collection* collection = ctx->getCollection(); if (!collection) { return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, "collection dropped between getMore calls")); } cursorManager = collection->getCursorManager(); } ClientCursorPin ccPin(cursorManager, request.cursorid); ClientCursor* cursor = ccPin.c(); if (!cursor) { // We didn't find the cursor. return appendCommandStatus(result, Status(ErrorCodes::CursorNotFound, str::stream() << "Cursor not found, cursor id: " << request.cursorid)); } if (request.nss.ns() != cursor->ns()) { return appendCommandStatus(result, Status(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace '" << request.nss.ns() << "', but cursor belongs to a different namespace")); } // On early return, get rid of the the cursor. ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin); if (!cursor->hasRecoveryUnit()) { // Start using a new RecoveryUnit. cursor->setOwnedRecoveryUnit( getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit()); } // Swap RecoveryUnit(s) between the ClientCursor and OperationContext. ScopedRecoveryUnitSwapper ruSwapper(cursor, txn); // Reset timeout timer on the cursor since the cursor is still in use. cursor->setIdleTime(0); // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. txn->getCurOp()->setMaxTimeMicros(cursor->getLeftoverMaxTimeMicros()); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if (cursor->isAggCursor()) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } PlanExecutor* exec = cursor->getExecutor(); exec->restoreState(txn); // TODO: Handle result sets larger than 16MB. BSONArrayBuilder nextBatch; BSONObj obj; PlanExecutor::ExecState state; int numResults = 0; while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. nextBatch.append(obj); numResults++; if (enoughForGetMore(request.batchSize, numResults, nextBatch.len())) { break; } } // If we are operating on an aggregation cursor, then we dropped our collection lock // earlier and need to reacquire it in order to clean up our ClientCursorPin. // // TODO: We need to ensure that this relock happens if we release the pin above in // response to PlanExecutor::getNext() throwing an exception. if (cursor->isAggCursor()) { invariant(NULL == ctx.get()); unpinDBLock.reset(new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS)); unpinCollLock.reset( new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS)); } // Fail the command if the PlanExecutor reports execution failure. if (PlanExecutor::FAILURE == state) { const std::unique_ptr<PlanStageStats> stats(exec->getStats()); error() << "GetMore executor error, stats: " << Explain::statsToBSON(*stats); return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, str::stream() << "GetMore executor error: " << WorkingSetCommon::toStatusString(obj))); } CursorId respondWithId = 0; if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) { respondWithId = request.cursorid; exec->saveState(); cursor->setLeftoverMaxTimeMicros(txn->getCurOp()->getRemainingMaxTimeMicros()); cursor->incPos(numResults); if (isCursorTailable(cursor) && state == PlanExecutor::IS_EOF) { // Rather than swapping their existing RU into the client cursor, tailable // cursors should get a new recovery unit. ruSwapper.dismiss(); } } else { txn->getCurOp()->debug().cursorExhausted = true; } appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result); if (respondWithId) { cursorFreer.Dismiss(); } return true; }
PlanStage::StageState SubplanStage::work(WorkingSetID* out) { ++_commonStats.works; // Adds the amount of time taken by work() to executionTimeMillis. ScopedTimer timer(&_commonStats.executionTimeMillis); if (_killed) { return PlanStage::DEAD; } if (isEOF()) { return PlanStage::IS_EOF; } if (SubplanStage::PLANNING == _state) { // Try to run as sub-plans. if (runSubplans()) { // If runSubplans returns true we expect something here. invariant(_child.get()); } else if (!_killed) { // Couldn't run as subplans so we'll just call normal getExecutor. PlanExecutor* exec; Status status = getExecutorAlwaysPlan(_collection, _query, _plannerParams, &exec); if (!status.isOK()) { // We utterly failed. _killed = true; // Propagate the error to the user wrapped in a BSONObj WorkingSetID id = _ws->allocate(); WorkingSetMember* member = _ws->get(id); member->state = WorkingSetMember::OWNED_OBJ; member->keyData.clear(); member->loc = DiskLoc(); BSONObjBuilder bob; bob.append("ok", status.isOK() ? 1.0 : 0.0); bob.append("code", status.code()); bob.append("errmsg", status.reason()); member->obj = bob.obj(); *out = id; return PlanStage::FAILURE; } else { scoped_ptr<PlanExecutor> cleanupExec(exec); _child.reset(exec->releaseStages()); } } // We can change state when we're either killed or we have an underlying runner. invariant(_killed || NULL != _child.get()); _state = SubplanStage::RUNNING; } if (_killed) { return PlanStage::DEAD; } if (isEOF()) { return PlanStage::IS_EOF; } // If we're here we should have planned already. invariant(SubplanStage::RUNNING == _state); invariant(_child.get()); return _child->work(out); }
bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { // Counted as a getMore, not as a command. globalOpCounters.gotGetMore(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus(result, Status(ErrorCodes::IllegalOperation, "Cannot run getMore command from eval()")); } StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj); if (!parseStatus.isOK()) { return appendCommandStatus(result, parseStatus.getStatus()); } const GetMoreRequest& request = parseStatus.getValue(); // Depending on the type of cursor being operated on, we hold locks for the whole // getMore, or none of the getMore, or part of the getMore. The three cases in detail: // // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore. // 2) Cursor owned by global cursor manager: we don't lock anything. These cursors // don't own any collection state. // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and // "unpinCollLock". This is because agg cursors handle locking internally (hence the // release), but the pin and unpin of the cursor must occur under the collection // lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because // AutoGetCollectionForRead checks the sharding version (and we want the relock for // the unpin to succeed even if the sharding version has changed). // // Note that we declare our locks before our ClientCursorPin, in order to ensure that // the pin's destructor is called before the lock destructors (so that the unpin occurs // under the lock). std::unique_ptr<AutoGetCollectionForRead> ctx; std::unique_ptr<Lock::DBLock> unpinDBLock; std::unique_ptr<Lock::CollectionLock> unpinCollLock; CursorManager* cursorManager; CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager(); if (globalCursorManager->ownsCursorId(request.cursorid)) { cursorManager = globalCursorManager; } else { ctx.reset(new AutoGetCollectionForRead(txn, request.nss)); Collection* collection = ctx->getCollection(); if (!collection) { return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, "collection dropped between getMore calls")); } cursorManager = collection->getCursorManager(); } ClientCursorPin ccPin(cursorManager, request.cursorid); ClientCursor* cursor = ccPin.c(); if (!cursor) { // We didn't find the cursor. return appendCommandStatus(result, Status(ErrorCodes::CursorNotFound, str::stream() << "Cursor not found, cursor id: " << request.cursorid)); } if (request.nss.ns() != cursor->ns()) { return appendCommandStatus(result, Status(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace '" << request.nss.ns() << "', but cursor belongs to a different namespace")); } const bool hasOwnMaxTime = CurOp::get(txn)->isMaxTimeSet(); // Validation related to awaitData. if (isCursorAwaitData(cursor)) { invariant(isCursorTailable(cursor)); if (!hasOwnMaxTime) { Status status(ErrorCodes::BadValue, str::stream() << "Must set maxTimeMS on a getMore if the initial " << "query had 'awaitData' set: " << cmdObj); return appendCommandStatus(result, status); } if (cursor->isAggCursor()) { Status status(ErrorCodes::BadValue, "awaitData cannot be set on an aggregation cursor"); return appendCommandStatus(result, status); } } // On early return, get rid of the cursor. ScopeGuard cursorFreer = MakeGuard(&GetMoreCmd::cleanupCursor, txn, &ccPin, request); if (!cursor->hasRecoveryUnit()) { // Start using a new RecoveryUnit. cursor->setOwnedRecoveryUnit( getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit()); } // Swap RecoveryUnit(s) between the ClientCursor and OperationContext. ScopedRecoveryUnitSwapper ruSwapper(cursor, txn); // Reset timeout timer on the cursor since the cursor is still in use. cursor->setIdleTime(0); // If there is no time limit set directly on this getMore command, but the operation // that spawned this cursor had a time limit set, then we have to apply any leftover // time to this getMore. if (!hasOwnMaxTime) { CurOp::get(txn)->setMaxTimeMicros(cursor->getLeftoverMaxTimeMicros()); } txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if (cursor->isAggCursor()) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } PlanExecutor* exec = cursor->getExecutor(); exec->restoreState(txn); // If we're tailing a capped collection, retrieve a monotonically increasing insert // counter. uint64_t lastInsertCount = 0; if (isCursorAwaitData(cursor)) { invariant(ctx->getCollection()->isCapped()); lastInsertCount = ctx->getCollection()->getCappedInsertNotifier()->getCount(); } CursorId respondWithId = 0; BSONArrayBuilder nextBatch; BSONObj obj; PlanExecutor::ExecState state; int numResults = 0; Status batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults); if (!batchStatus.isOK()) { return appendCommandStatus(result, batchStatus); } // If this is an await data cursor, and we hit EOF without generating any results, then // we block waiting for new oplog data to arrive. if (isCursorAwaitData(cursor) && state == PlanExecutor::IS_EOF && numResults == 0) { // Retrieve the notifier which we will wait on until new data arrives. We make sure // to do this in the lock because once we drop the lock it is possible for the // collection to become invalid. The notifier itself will outlive the collection if // the collection is dropped, as we keep a shared_ptr to it. auto notifier = ctx->getCollection()->getCappedInsertNotifier(); // Save the PlanExecutor and drop our locks. exec->saveState(); ctx.reset(); // Block waiting for data. Microseconds timeout(CurOp::get(txn)->getRemainingMaxTimeMicros()); notifier->waitForInsert(lastInsertCount, timeout); notifier.reset(); ctx.reset(new AutoGetCollectionForRead(txn, request.nss)); exec->restoreState(txn); // We woke up because either the timed_wait expired, or there was more data. Either // way, attempt to generate another batch of results. batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults); if (!batchStatus.isOK()) { return appendCommandStatus(result, batchStatus); } } if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) { respondWithId = request.cursorid; exec->saveState(); // If maxTimeMS was set directly on the getMore rather than being rolled over // from a previous find, then don't roll remaining micros over to the next // getMore. if (!hasOwnMaxTime) { cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros()); } cursor->incPos(numResults); if (isCursorTailable(cursor) && state == PlanExecutor::IS_EOF) { // Rather than swapping their existing RU into the client cursor, tailable // cursors should get a new recovery unit. ruSwapper.dismiss(); } } else { CurOp::get(txn)->debug().cursorExhausted = true; } appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result); if (respondWithId) { cursorFreer.Dismiss(); // If we are operating on an aggregation cursor, then we dropped our collection lock // earlier and need to reacquire it in order to clean up our ClientCursorPin. if (cursor->isAggCursor()) { invariant(NULL == ctx.get()); unpinDBLock.reset( new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS)); unpinCollLock.reset( new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS)); } } return true; }