/** * Called by db/instance.cpp. This is the getMore entry point. */ Message getMore(OperationContext* opCtx, const char* ns, int ntoreturn, long long cursorid, bool* exhaust, bool* isCursorAuthorized) { invariant(ntoreturn >= 0); CurOp& curOp = *CurOp::get(opCtx); curOp.ensureStarted(); // For testing, we may want to fail if we receive a getmore. if (MONGO_FAIL_POINT(failReceivedGetmore)) { MONGO_UNREACHABLE; } *exhaust = false; const NamespaceString nss(ns); // Cursors come in one of two flavors: // - Cursors owned by the collection cursor manager, such as those generated via the find // command. For these cursors, we hold the appropriate collection lock for the duration of the // getMore using AutoGetCollectionForRead. // - Cursors owned by the global cursor manager, such as those generated via the aggregate // command. These cursors either hold no collection state or manage their collection state // internally, so we acquire no locks. // // While we only need to acquire locks in the case of a cursor which is *not* globally owned, we // need to create an AutoStatsTracker in either case. This is responsible for updating // statistics in CurOp and Top. We avoid using AutoGetCollectionForReadCommand because we may // need to drop and reacquire locks when the cursor is awaitData, but we don't want to update // the stats twice. // // Note that we acquire our locks before our ClientCursorPin, in order to ensure that the pin's // destructor is called before the lock's destructor (if there is one) so that the cursor // cleanup can occur under the lock. UninterruptibleLockGuard noInterrupt(opCtx->lockState()); boost::optional<AutoGetCollectionForRead> readLock; boost::optional<AutoStatsTracker> statsTracker; CursorManager* cursorManager; if (CursorManager::isGloballyManagedCursor(cursorid)) { cursorManager = CursorManager::getGlobalCursorManager(); if (boost::optional<NamespaceString> nssForCurOp = nss.isGloballyManagedNamespace() ? nss.getTargetNSForGloballyManagedNamespace() : nss) { AutoGetDb autoDb(opCtx, nssForCurOp->db(), MODE_IS); const auto profilingLevel = autoDb.getDb() ? boost::optional<int>{autoDb.getDb()->getProfilingLevel()} : boost::none; statsTracker.emplace(opCtx, *nssForCurOp, Top::LockType::NotLocked, profilingLevel); auto view = autoDb.getDb() ? autoDb.getDb()->getViewCatalog()->lookup(opCtx, nssForCurOp->ns()) : nullptr; uassert( ErrorCodes::CommandNotSupportedOnView, str::stream() << "Namespace " << nssForCurOp->ns() << " is a view. OP_GET_MORE operations are not supported on views. " << "Only clients which support the getMore command can be used to " "query views.", !view); } } else { readLock.emplace(opCtx, nss); const int doNotChangeProfilingLevel = 0; statsTracker.emplace(opCtx, nss, Top::LockType::ReadLocked, readLock->getDb() ? readLock->getDb()->getProfilingLevel() : doNotChangeProfilingLevel); Collection* collection = readLock->getCollection(); uassert( ErrorCodes::OperationFailed, "collection dropped between getMore calls", collection); cursorManager = collection->getCursorManager(); // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), we allow reads // whether we are PRIMARY or SECONDARY. uassertStatusOK( repl::ReplicationCoordinator::get(opCtx)->checkCanServeReadsFor(opCtx, nss, true)); } LOG(5) << "Running getMore, cursorid: " << cursorid; // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. auto ccPin = cursorManager->pinCursor(opCtx, cursorid); // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; const int InitialBufSize = 512 + sizeof(QueryResult::Value) + FindCommon::kMaxBytesToReturnToClientAtOnce; BufBuilder bb(InitialBufSize); bb.skip(sizeof(QueryResult::Value)); if (!ccPin.isOK()) { if (ccPin == ErrorCodes::CursorNotFound) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { uassertStatusOK(ccPin.getStatus()); } } else { ClientCursor* cc = ccPin.getValue().getCursor(); // Check for spoofing of the ns such that it does not match the one originally // there for the cursor. uassert(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace " << ns << ", but cursor " << cursorid << " belongs to namespace " << cc->nss().ns(), nss == cc->nss()); // A user can only call getMore on their own cursor. If there were multiple users // authenticated when the cursor was created, then at least one of them must be // authenticated in order to run getMore on the cursor. uassert(ErrorCodes::Unauthorized, str::stream() << "cursor id " << cursorid << " was not created by the authenticated user", AuthorizationSession::get(opCtx->getClient()) ->isCoauthorizedWith(cc->getAuthenticatedUsers())); *isCursorAuthorized = true; const auto replicationMode = repl::ReplicationCoordinator::get(opCtx)->getReplicationMode(); opCtx->recoveryUnit()->setReadConcernLevelAndReplicationMode(cc->getReadConcernLevel(), replicationMode); // TODO SERVER-33698: Remove kSnapshotReadConcern clause once we can guarantee that a // readConcern level snapshot getMore will have an established point-in-time WiredTiger // snapshot. if (replicationMode == repl::ReplicationCoordinator::modeReplSet && (cc->getReadConcernLevel() == repl::ReadConcernLevel::kMajorityReadConcern || cc->getReadConcernLevel() == repl::ReadConcernLevel::kSnapshotReadConcern)) { uassertStatusOK(opCtx->recoveryUnit()->obtainMajorityCommittedSnapshot()); } uassert(40548, "OP_GET_MORE operations are not supported on tailable aggregations. Only clients " "which support the getMore command can be used on tailable aggregations.", readLock || !cc->isAwaitData()); // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. if (cc->getLeftoverMaxTimeMicros() < Microseconds::max()) { uassert(40136, "Illegal attempt to set operation deadline within DBDirectClient", !opCtx->getClient()->isInDirectClient()); opCtx->setDeadlineAfterNowBy(cc->getLeftoverMaxTimeMicros()); } opCtx->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); uint64_t notifierVersion = 0; std::shared_ptr<CappedInsertNotifier> notifier; if (cc->isAwaitData()) { invariant(readLock->getCollection()->isCapped()); // Retrieve the notifier which we will wait on until new data arrives. We make sure // to do this in the lock because once we drop the lock it is possible for the // collection to become invalid. The notifier itself will outlive the collection if // the collection is dropped, as we keep a shared_ptr to it. notifier = readLock->getCollection()->getCappedInsertNotifier(); // Must get the version before we call generateBatch in case a write comes in after // that call and before we call wait on the notifier. notifierVersion = notifier->getVersion(); } PlanExecutor* exec = cc->getExecutor(); exec->reattachToOperationContext(opCtx); uassertStatusOK(exec->restoreState()); auto planSummary = Explain::getPlanSummary(exec); { stdx::lock_guard<Client> lk(*opCtx->getClient()); curOp.setPlanSummary_inlock(planSummary); // Ensure that the original query object is available in the slow query log, profiler // and currentOp. Upconvert _query to resemble a getMore command, and set the original // command or upconverted legacy query in the originatingCommand field. curOp.setOpDescription_inlock(upconvertGetMoreEntry(nss, cursorid, ntoreturn)); curOp.setOriginatingCommand_inlock(cc->getOriginatingCommandObj()); } PlanExecutor::ExecState state; // We report keysExamined and docsExamined to OpDebug for a given getMore operation. To // obtain these values we need to take a diff of the pre-execution and post-execution // metrics, as they accumulate over the course of a cursor's lifetime. PlanSummaryStats preExecutionStats; Explain::getSummaryStats(*exec, &preExecutionStats); generateBatch(ntoreturn, cc, &bb, &numResults, &state); // If this is an await data cursor, and we hit EOF without generating any results, then // we block waiting for new data to arrive. if (cc->isAwaitData() && state == PlanExecutor::IS_EOF && numResults == 0) { // Save the PlanExecutor and drop our locks. exec->saveState(); readLock.reset(); // Block waiting for data for up to 1 second. Time spent blocking is not counted towards // the total operation latency. curOp.pauseTimer(); Seconds timeout(1); notifier->waitUntil(notifierVersion, opCtx->getServiceContext()->getPreciseClockSource()->now() + timeout); notifier.reset(); curOp.resumeTimer(); // Reacquiring locks. readLock.emplace(opCtx, nss); uassertStatusOK(exec->restoreState()); // We woke up because either the timed_wait expired, or there was more data. Either // way, attempt to generate another batch of results. generateBatch(ntoreturn, cc, &bb, &numResults, &state); } PlanSummaryStats postExecutionStats; Explain::getSummaryStats(*exec, &postExecutionStats); postExecutionStats.totalKeysExamined -= preExecutionStats.totalKeysExamined; postExecutionStats.totalDocsExamined -= preExecutionStats.totalDocsExamined; curOp.debug().setPlanSummaryMetrics(postExecutionStats); // We do not report 'execStats' for aggregation or other globally managed cursors, both in // the original request and subsequent getMore. It would be useful to have this information // for an aggregation, but the source PlanExecutor could be destroyed before we know whether // we need execStats and we do not want to generate for all operations due to cost. if (!CursorManager::isGloballyManagedCursor(cursorid) && curOp.shouldDBProfile()) { BSONObjBuilder execStatsBob; Explain::getWinningPlanStats(exec, &execStatsBob); curOp.debug().execStats = execStatsBob.obj(); } // Our two possible ClientCursorPin cleanup paths are: // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin. // 2) If the cursor is going to be saved, we simply let the pin go out of scope. In this // case, the pin's destructor will be invoked, which will call release() on the pin. // Because our ClientCursorPin is declared after our lock is declared, this will happen // under the lock if any locking was necessary. if (!shouldSaveCursorGetMore(state, exec, cc->isTailable())) { ccPin.getValue().deleteUnderlying(); // cc is now invalid, as is the executor cursorid = 0; cc = nullptr; curOp.debug().cursorExhausted = true; LOG(5) << "getMore NOT saving client cursor, ended with state " << PlanExecutor::statestr(state); } else { // Continue caching the ClientCursor. cc->incPos(numResults); exec->saveState(); exec->detachFromOperationContext(); LOG(5) << "getMore saving client cursor ended with state " << PlanExecutor::statestr(state); *exhaust = cc->queryOptions() & QueryOption_Exhaust; // We assume that cursors created through a DBDirectClient are always used from their // original OperationContext, so we do not need to move time to and from the cursor. if (!opCtx->getClient()->isInDirectClient()) { // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros(opCtx->getRemainingMaxTimeMicros()); } } } QueryResult::View qr = bb.buf(); qr.msgdata().setLen(bb.len()); qr.msgdata().setOperation(opReply); qr.setResultFlags(resultFlags); qr.setCursorId(cursorid); qr.setStartingFrom(startingResult); qr.setNReturned(numResults); LOG(5) << "getMore returned " << numResults << " results\n"; return Message(bb.release()); }