Esempio n. 1
0
/**
 * Called by db/instance.cpp.  This is the getMore entry point.
 */
Message getMore(OperationContext* opCtx,
                const char* ns,
                int ntoreturn,
                long long cursorid,
                bool* exhaust,
                bool* isCursorAuthorized) {
    invariant(ntoreturn >= 0);

    CurOp& curOp = *CurOp::get(opCtx);
    curOp.ensureStarted();

    // For testing, we may want to fail if we receive a getmore.
    if (MONGO_FAIL_POINT(failReceivedGetmore)) {
        MONGO_UNREACHABLE;
    }

    *exhaust = false;

    const NamespaceString nss(ns);

    // Cursors come in one of two flavors:
    // - Cursors owned by the collection cursor manager, such as those generated via the find
    //   command. For these cursors, we hold the appropriate collection lock for the duration of the
    //   getMore using AutoGetCollectionForRead.
    // - Cursors owned by the global cursor manager, such as those generated via the aggregate
    //   command. These cursors either hold no collection state or manage their collection state
    //   internally, so we acquire no locks.
    //
    // While we only need to acquire locks in the case of a cursor which is *not* globally owned, we
    // need to create an AutoStatsTracker in either case. This is responsible for updating
    // statistics in CurOp and Top. We avoid using AutoGetCollectionForReadCommand because we may
    // need to drop and reacquire locks when the cursor is awaitData, but we don't want to update
    // the stats twice.
    //
    // Note that we acquire our locks before our ClientCursorPin, in order to ensure that the pin's
    // destructor is called before the lock's destructor (if there is one) so that the cursor
    // cleanup can occur under the lock.
    UninterruptibleLockGuard noInterrupt(opCtx->lockState());
    boost::optional<AutoGetCollectionForRead> readLock;
    boost::optional<AutoStatsTracker> statsTracker;
    CursorManager* cursorManager;

    if (CursorManager::isGloballyManagedCursor(cursorid)) {
        cursorManager = CursorManager::getGlobalCursorManager();

        if (boost::optional<NamespaceString> nssForCurOp = nss.isGloballyManagedNamespace()
                ? nss.getTargetNSForGloballyManagedNamespace()
                : nss) {
            AutoGetDb autoDb(opCtx, nssForCurOp->db(), MODE_IS);
            const auto profilingLevel = autoDb.getDb()
                ? boost::optional<int>{autoDb.getDb()->getProfilingLevel()}
                : boost::none;
            statsTracker.emplace(opCtx, *nssForCurOp, Top::LockType::NotLocked, profilingLevel);
            auto view = autoDb.getDb()
                ? autoDb.getDb()->getViewCatalog()->lookup(opCtx, nssForCurOp->ns())
                : nullptr;
            uassert(
                ErrorCodes::CommandNotSupportedOnView,
                str::stream() << "Namespace " << nssForCurOp->ns()
                              << " is a view. OP_GET_MORE operations are not supported on views. "
                              << "Only clients which support the getMore command can be used to "
                                 "query views.",
                !view);
        }
    } else {
        readLock.emplace(opCtx, nss);
        const int doNotChangeProfilingLevel = 0;
        statsTracker.emplace(opCtx,
                             nss,
                             Top::LockType::ReadLocked,
                             readLock->getDb() ? readLock->getDb()->getProfilingLevel()
                                               : doNotChangeProfilingLevel);
        Collection* collection = readLock->getCollection();
        uassert(
            ErrorCodes::OperationFailed, "collection dropped between getMore calls", collection);
        cursorManager = collection->getCursorManager();

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), we allow reads
        // whether we are PRIMARY or SECONDARY.
        uassertStatusOK(
            repl::ReplicationCoordinator::get(opCtx)->checkCanServeReadsFor(opCtx, nss, true));
    }

    LOG(5) << "Running getMore, cursorid: " << cursorid;

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    auto ccPin = cursorManager->pinCursor(opCtx, cursorid);

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;

    int numResults = 0;
    int startingResult = 0;

    const int InitialBufSize =
        512 + sizeof(QueryResult::Value) + FindCommon::kMaxBytesToReturnToClientAtOnce;

    BufBuilder bb(InitialBufSize);
    bb.skip(sizeof(QueryResult::Value));

    if (!ccPin.isOK()) {
        if (ccPin == ErrorCodes::CursorNotFound) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        } else {
            uassertStatusOK(ccPin.getStatus());
        }
    } else {
        ClientCursor* cc = ccPin.getValue().getCursor();

        // Check for spoofing of the ns such that it does not match the one originally
        // there for the cursor.
        uassert(ErrorCodes::Unauthorized,
                str::stream() << "Requested getMore on namespace " << ns << ", but cursor "
                              << cursorid
                              << " belongs to namespace "
                              << cc->nss().ns(),
                nss == cc->nss());

        // A user can only call getMore on their own cursor. If there were multiple users
        // authenticated when the cursor was created, then at least one of them must be
        // authenticated in order to run getMore on the cursor.
        uassert(ErrorCodes::Unauthorized,
                str::stream() << "cursor id " << cursorid
                              << " was not created by the authenticated user",
                AuthorizationSession::get(opCtx->getClient())
                    ->isCoauthorizedWith(cc->getAuthenticatedUsers()));

        *isCursorAuthorized = true;

        const auto replicationMode = repl::ReplicationCoordinator::get(opCtx)->getReplicationMode();
        opCtx->recoveryUnit()->setReadConcernLevelAndReplicationMode(cc->getReadConcernLevel(),
                                                                     replicationMode);

        // TODO SERVER-33698: Remove kSnapshotReadConcern clause once we can guarantee that a
        // readConcern level snapshot getMore will have an established point-in-time WiredTiger
        // snapshot.
        if (replicationMode == repl::ReplicationCoordinator::modeReplSet &&
            (cc->getReadConcernLevel() == repl::ReadConcernLevel::kMajorityReadConcern ||
             cc->getReadConcernLevel() == repl::ReadConcernLevel::kSnapshotReadConcern)) {
            uassertStatusOK(opCtx->recoveryUnit()->obtainMajorityCommittedSnapshot());
        }

        uassert(40548,
                "OP_GET_MORE operations are not supported on tailable aggregations. Only clients "
                "which support the getMore command can be used on tailable aggregations.",
                readLock || !cc->isAwaitData());

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        if (cc->getLeftoverMaxTimeMicros() < Microseconds::max()) {
            uassert(40136,
                    "Illegal attempt to set operation deadline within DBDirectClient",
                    !opCtx->getClient()->isInDirectClient());
            opCtx->setDeadlineAfterNowBy(cc->getLeftoverMaxTimeMicros());
        }
        opCtx->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        uint64_t notifierVersion = 0;
        std::shared_ptr<CappedInsertNotifier> notifier;
        if (cc->isAwaitData()) {
            invariant(readLock->getCollection()->isCapped());
            // Retrieve the notifier which we will wait on until new data arrives. We make sure
            // to do this in the lock because once we drop the lock it is possible for the
            // collection to become invalid. The notifier itself will outlive the collection if
            // the collection is dropped, as we keep a shared_ptr to it.
            notifier = readLock->getCollection()->getCappedInsertNotifier();

            // Must get the version before we call generateBatch in case a write comes in after
            // that call and before we call wait on the notifier.
            notifierVersion = notifier->getVersion();
        }

        PlanExecutor* exec = cc->getExecutor();
        exec->reattachToOperationContext(opCtx);
        uassertStatusOK(exec->restoreState());

        auto planSummary = Explain::getPlanSummary(exec);
        {
            stdx::lock_guard<Client> lk(*opCtx->getClient());
            curOp.setPlanSummary_inlock(planSummary);

            // Ensure that the original query object is available in the slow query log, profiler
            // and currentOp. Upconvert _query to resemble a getMore command, and set the original
            // command or upconverted legacy query in the originatingCommand field.
            curOp.setOpDescription_inlock(upconvertGetMoreEntry(nss, cursorid, ntoreturn));
            curOp.setOriginatingCommand_inlock(cc->getOriginatingCommandObj());
        }

        PlanExecutor::ExecState state;

        // We report keysExamined and docsExamined to OpDebug for a given getMore operation. To
        // obtain these values we need to take a diff of the pre-execution and post-execution
        // metrics, as they accumulate over the course of a cursor's lifetime.
        PlanSummaryStats preExecutionStats;
        Explain::getSummaryStats(*exec, &preExecutionStats);

        generateBatch(ntoreturn, cc, &bb, &numResults, &state);

        // If this is an await data cursor, and we hit EOF without generating any results, then
        // we block waiting for new data to arrive.
        if (cc->isAwaitData() && state == PlanExecutor::IS_EOF && numResults == 0) {
            // Save the PlanExecutor and drop our locks.
            exec->saveState();
            readLock.reset();

            // Block waiting for data for up to 1 second. Time spent blocking is not counted towards
            // the total operation latency.
            curOp.pauseTimer();
            Seconds timeout(1);
            notifier->waitUntil(notifierVersion,
                                opCtx->getServiceContext()->getPreciseClockSource()->now() +
                                    timeout);
            notifier.reset();
            curOp.resumeTimer();

            // Reacquiring locks.
            readLock.emplace(opCtx, nss);
            uassertStatusOK(exec->restoreState());

            // We woke up because either the timed_wait expired, or there was more data. Either
            // way, attempt to generate another batch of results.
            generateBatch(ntoreturn, cc, &bb, &numResults, &state);
        }

        PlanSummaryStats postExecutionStats;
        Explain::getSummaryStats(*exec, &postExecutionStats);
        postExecutionStats.totalKeysExamined -= preExecutionStats.totalKeysExamined;
        postExecutionStats.totalDocsExamined -= preExecutionStats.totalDocsExamined;
        curOp.debug().setPlanSummaryMetrics(postExecutionStats);

        // We do not report 'execStats' for aggregation or other globally managed cursors, both in
        // the original request and subsequent getMore. It would be useful to have this information
        // for an aggregation, but the source PlanExecutor could be destroyed before we know whether
        // we need execStats and we do not want to generate for all operations due to cost.
        if (!CursorManager::isGloballyManagedCursor(cursorid) && curOp.shouldDBProfile()) {
            BSONObjBuilder execStatsBob;
            Explain::getWinningPlanStats(exec, &execStatsBob);
            curOp.debug().execStats = execStatsBob.obj();
        }

        // Our two possible ClientCursorPin cleanup paths are:
        // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin.
        // 2) If the cursor is going to be saved, we simply let the pin go out of scope. In this
        //    case, the pin's destructor will be invoked, which will call release() on the pin.
        //    Because our ClientCursorPin is declared after our lock is declared, this will happen
        //    under the lock if any locking was necessary.
        if (!shouldSaveCursorGetMore(state, exec, cc->isTailable())) {
            ccPin.getValue().deleteUnderlying();

            // cc is now invalid, as is the executor
            cursorid = 0;
            cc = nullptr;
            curOp.debug().cursorExhausted = true;

            LOG(5) << "getMore NOT saving client cursor, ended with state "
                   << PlanExecutor::statestr(state);
        } else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            exec->saveState();
            exec->detachFromOperationContext();
            LOG(5) << "getMore saving client cursor ended with state "
                   << PlanExecutor::statestr(state);

            *exhaust = cc->queryOptions() & QueryOption_Exhaust;

            // We assume that cursors created through a DBDirectClient are always used from their
            // original OperationContext, so we do not need to move time to and from the cursor.
            if (!opCtx->getClient()->isInDirectClient()) {
                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros(opCtx->getRemainingMaxTimeMicros());
            }
        }
    }

    QueryResult::View qr = bb.buf();
    qr.msgdata().setLen(bb.len());
    qr.msgdata().setOperation(opReply);
    qr.setResultFlags(resultFlags);
    qr.setCursorId(cursorid);
    qr.setStartingFrom(startingResult);
    qr.setNReturned(numResults);
    LOG(5) << "getMore returned " << numResults << " results\n";
    return Message(bb.release());
}