Beispiel #1
0
/**
 * Called by db/instance.cpp.  This is the getMore entry point.
 */
QueryResult::View getMore(OperationContext* txn,
                          const char* ns,
                          int ntoreturn,
                          long long cursorid,
                          bool* exhaust,
                          bool* isCursorAuthorized) {
    invariant(ntoreturn >= 0);

    CurOp& curop = *CurOp::get(txn);

    // For testing, we may want to fail if we receive a getmore.
    if (MONGO_FAIL_POINT(failReceivedGetmore)) {
        invariant(0);
    }

    *exhaust = false;

    const NamespaceString nss(ns);

    // Depending on the type of cursor being operated on, we hold locks for the whole getMore,
    // or none of the getMore, or part of the getMore.  The three cases in detail:
    //
    // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
    // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors don't own
    //    any collection state. These cursors are generated either by the listCollections or
    //    listIndexes commands, as these special cursor-generating commands operate over catalog
    //    data rather than targeting the data within a collection.
    // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
    //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
    //    release), but the pin and unpin of the cursor must occur under the collection lock.
    //    We don't use our AutoGetCollectionForRead "ctx" to relock, because
    //    AutoGetCollectionForRead checks the sharding version (and we want the relock for the
    //    unpin to succeed even if the sharding version has changed).
    //
    // Note that we declare our locks before our ClientCursorPin, in order to ensure that the
    // pin's destructor is called before the lock destructors (so that the unpin occurs under
    // the lock).
    unique_ptr<AutoGetCollectionForRead> ctx;
    unique_ptr<Lock::DBLock> unpinDBLock;
    unique_ptr<Lock::CollectionLock> unpinCollLock;

    CursorManager* cursorManager;
    if (nss.isListIndexesCursorNS() || nss.isListCollectionsCursorNS()) {
        // List collections and list indexes are special cursor-generating commands whose
        // cursors are managed globally, as they operate over catalog data rather than targeting
        // the data within a collection.
        cursorManager = CursorManager::getGlobalCursorManager();
    } else {
        ctx = stdx::make_unique<AutoGetCollectionForRead>(txn, nss);
        Collection* collection = ctx->getCollection();
        uassert(17356, "collection dropped between getMore calls", collection);
        cursorManager = collection->getCursorManager();
    }

    LOG(5) << "Running getMore, cursorid: " << cursorid << endl;

    // This checks to make sure the operation is allowed on a replicated node.  Since we are not
    // passing in a query object (necessary to check SlaveOK query option), the only state where
    // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
    // reads are not okay.
    Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, true);
    uassertStatusOK(status);

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    ClientCursorPin ccPin(cursorManager, cursorid);
    ClientCursor* cc = ccPin.c();
    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;

    int numResults = 0;
    int startingResult = 0;

    const int InitialBufSize =
        512 + sizeof(QueryResult::Value) + FindCommon::kMaxBytesToReturnToClientAtOnce;

    BufBuilder bb(InitialBufSize);
    bb.skip(sizeof(QueryResult::Value));

    if (NULL == cc) {
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    } else {
        // Check for spoofing of the ns such that it does not match the one originally
        // there for the cursor.
        uassert(ErrorCodes::Unauthorized,
                str::stream() << "Requested getMore on namespace " << ns << ", but cursor "
                              << cursorid << " belongs to namespace " << cc->ns(),
                ns == cc->ns());
        *isCursorAuthorized = true;

        if (cc->isReadCommitted())
            uassertStatusOK(txn->recoveryUnit()->setReadFromMajorityCommittedSnapshot());

        // Reset timeout timer on the cursor since the cursor is still in use.
        cc->setIdleTime(0);

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        txn->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

        // Ensure that the original query or command object is available in the slow query log,
        // profiler, and currentOp.
        curop.debug().query = cc->getQuery();
        {
            stdx::lock_guard<Client> lk(*txn->getClient());
            curop.setQuery_inlock(cc->getQuery());
        }

        cc->updateSlaveLocation(txn);

        if (cc->isAggCursor()) {
            // Agg cursors handle their own locking internally.
            ctx.reset();  // unlocks
        }

        // If we're replaying the oplog, we save the last time that we read.
        Timestamp slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        uint64_t notifierVersion = 0;
        std::shared_ptr<CappedInsertNotifier> notifier;
        if (isCursorAwaitData(cc)) {
            invariant(ctx->getCollection()->isCapped());
            // Retrieve the notifier which we will wait on until new data arrives. We make sure
            // to do this in the lock because once we drop the lock it is possible for the
            // collection to become invalid. The notifier itself will outlive the collection if
            // the collection is dropped, as we keep a shared_ptr to it.
            notifier = ctx->getCollection()->getCappedInsertNotifier();

            // Must get the version before we call generateBatch in case a write comes in after
            // that call and before we call wait on the notifier.
            notifierVersion = notifier->getVersion();
        }

        PlanExecutor* exec = cc->getExecutor();
        exec->reattachToOperationContext(txn);
        exec->restoreState();
        PlanExecutor::ExecState state;

        generateBatch(ntoreturn, cc, &bb, &numResults, &slaveReadTill, &state);

        // If this is an await data cursor, and we hit EOF without generating any results, then
        // we block waiting for new data to arrive.
        if (isCursorAwaitData(cc) && state == PlanExecutor::IS_EOF && numResults == 0) {
            // Save the PlanExecutor and drop our locks.
            exec->saveState();
            ctx.reset();

            // Block waiting for data for up to 1 second.
            Seconds timeout(1);
            notifier->wait(notifierVersion, timeout);
            notifier.reset();

            // Set expected latency to match wait time. This makes sure the logs aren't spammed
            // by awaitData queries that exceed slowms due to blocking on the CappedInsertNotifier.
            curop.setExpectedLatencyMs(durationCount<Milliseconds>(timeout));

            // Reacquiring locks.
            ctx = make_unique<AutoGetCollectionForRead>(txn, nss);
            exec->restoreState();

            // We woke up because either the timed_wait expired, or there was more data. Either
            // way, attempt to generate another batch of results.
            generateBatch(ntoreturn, cc, &bb, &numResults, &slaveReadTill, &state);
        }

        // We have to do this before re-acquiring locks in the agg case because
        // shouldSaveCursorGetMore() can make a network call for agg cursors.
        //
        // TODO: Getting rid of PlanExecutor::isEOF() in favor of PlanExecutor::IS_EOF would mean
        // that this network operation is no longer necessary.
        const bool shouldSaveCursor = shouldSaveCursorGetMore(state, exec, isCursorTailable(cc));

        // In order to deregister a cursor, we need to be holding the DB + collection lock and
        // if the cursor is aggregation, we release these locks.
        if (cc->isAggCursor()) {
            invariant(NULL == ctx.get());
            unpinDBLock = make_unique<Lock::DBLock>(txn->lockState(), nss.db(), MODE_IS);
            unpinCollLock = make_unique<Lock::CollectionLock>(txn->lockState(), nss.ns(), MODE_IS);
        }

        // Our two possible ClientCursorPin cleanup paths are:
        // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin.
        // 2) If the cursor is going to be saved, we simply let the pin go out of scope.  In
        //    this case, the pin's destructor will be invoked, which will call release() on the
        //    pin.  Because our ClientCursorPin is declared after our lock is declared, this
        //    will happen under the lock.
        if (!shouldSaveCursor) {
            ccPin.deleteUnderlying();

            // cc is now invalid, as is the executor
            cursorid = 0;
            cc = NULL;
            curop.debug().cursorExhausted = true;

            LOG(5) << "getMore NOT saving client cursor, ended with state "
                   << PlanExecutor::statestr(state) << endl;
        } else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            exec->saveState();
            exec->detachFromOperationContext();
            LOG(5) << "getMore saving client cursor ended with state "
                   << PlanExecutor::statestr(state) << endl;

            // Possibly note slave's position in the oplog.
            if ((cc->queryOptions() & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            *exhaust = cc->queryOptions() & QueryOption_Exhaust;

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
    }

    QueryResult::View qr = bb.buf();
    qr.msgdata().setLen(bb.len());
    qr.msgdata().setOperation(opReply);
    qr.setResultFlags(resultFlags);
    qr.setCursorId(cursorid);
    qr.setStartingFrom(startingResult);
    qr.setNReturned(numResults);
    bb.decouple();
    LOG(5) << "getMore returned " << numResults << " results\n";
    return qr;
}
    bool run(OperationContext* txn,
             const std::string& dbname,
             BSONObj& cmdObj,
             int options,
             std::string& errmsg,
             BSONObjBuilder& result) override {
        // Counted as a getMore, not as a command.
        globalOpCounters.gotGetMore();

        if (txn->getClient()->isInDirectClient()) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::IllegalOperation, "Cannot run getMore command from eval()"));
        }

        StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj);
        if (!parseStatus.isOK()) {
            return appendCommandStatus(result, parseStatus.getStatus());
        }
        const GetMoreRequest& request = parseStatus.getValue();

        // Depending on the type of cursor being operated on, we hold locks for the whole
        // getMore, or none of the getMore, or part of the getMore.  The three cases in detail:
        //
        // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
        // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors
        //    don't own any collection state. These cursors are generated either by the
        //    listCollections or listIndexes commands, as these special cursor-generating commands
        //    operate over catalog data rather than targeting the data within a collection.
        // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
        //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
        //    release), but the pin and unpin of the cursor must occur under the collection
        //    lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because
        //    AutoGetCollectionForRead checks the sharding version (and we want the relock for
        //    the unpin to succeed even if the sharding version has changed).
        //
        // Note that we declare our locks before our ClientCursorPin, in order to ensure that
        // the pin's destructor is called before the lock destructors (so that the unpin occurs
        // under the lock).
        std::unique_ptr<AutoGetCollectionForRead> ctx;
        std::unique_ptr<Lock::DBLock> unpinDBLock;
        std::unique_ptr<Lock::CollectionLock> unpinCollLock;

        CursorManager* cursorManager;
        if (request.nss.isListIndexesCursorNS() || request.nss.isListCollectionsCursorNS()) {
            cursorManager = CursorManager::getGlobalCursorManager();
        } else {
            ctx = stdx::make_unique<AutoGetCollectionForRead>(txn, request.nss);
            Collection* collection = ctx->getCollection();
            if (!collection) {
                return appendCommandStatus(result,
                                           Status(ErrorCodes::OperationFailed,
                                                  "collection dropped between getMore calls"));
            }
            cursorManager = collection->getCursorManager();
        }

        ClientCursorPin ccPin(cursorManager, request.cursorid);
        ClientCursor* cursor = ccPin.c();
        if (!cursor) {
            // We didn't find the cursor.
            return appendCommandStatus(
                result,
                Status(ErrorCodes::CursorNotFound,
                       str::stream() << "Cursor not found, cursor id: " << request.cursorid));
        }

        if (request.nss.ns() != cursor->ns()) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::Unauthorized,
                       str::stream() << "Requested getMore on namespace '" << request.nss.ns()
                                     << "', but cursor belongs to a different namespace"));
        }

        if (request.nss.isOplog() && MONGO_FAIL_POINT(rsStopGetMoreCmd)) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::CommandFailed,
                       str::stream() << "getMore on " << request.nss.ns()
                                     << " rejected due to active fail point rsStopGetMoreCmd"));
        }

        // Validation related to awaitData.
        if (isCursorAwaitData(cursor)) {
            invariant(isCursorTailable(cursor));

            if (cursor->isAggCursor()) {
                Status status(ErrorCodes::BadValue,
                              "awaitData cannot be set on an aggregation cursor");
                return appendCommandStatus(result, status);
            }
        }

        // Validate term, if provided.
        if (request.term) {
            auto replCoord = repl::ReplicationCoordinator::get(txn);
            Status status = replCoord->updateTerm(*request.term);
            // Note: updateTerm returns ok if term stayed the same.
            if (!status.isOK()) {
                return appendCommandStatus(result, status);
            }
        }

        // On early return, get rid of the cursor.
        ScopeGuard cursorFreer = MakeGuard(&GetMoreCmd::cleanupCursor, txn, &ccPin, request);

        if (cursor->isReadCommitted())
            uassertStatusOK(txn->recoveryUnit()->setReadFromMajorityCommittedSnapshot());

        // Reset timeout timer on the cursor since the cursor is still in use.
        cursor->setIdleTime(0);

        const bool hasOwnMaxTime = CurOp::get(txn)->isMaxTimeSet();

        if (!hasOwnMaxTime) {
            // There is no time limit set directly on this getMore command. If the cursor is
            // awaitData, then we supply a default time of one second. Otherwise we roll over
            // any leftover time from the maxTimeMS of the operation that spawned this cursor,
            // applying it to this getMore.
            if (isCursorAwaitData(cursor)) {
                Seconds awaitDataTimeout(1);
                CurOp::get(txn)->setMaxTimeMicros(durationCount<Microseconds>(awaitDataTimeout));
            } else {
                CurOp::get(txn)->setMaxTimeMicros(cursor->getLeftoverMaxTimeMicros());
            }
        }
        txn->checkForInterrupt();  // May trigger maxTimeAlwaysTimeOut fail point.

        if (cursor->isAggCursor()) {
            // Agg cursors handle their own locking internally.
            ctx.reset();  // unlocks
        }

        PlanExecutor* exec = cursor->getExecutor();
        exec->reattachToOperationContext(txn);
        exec->restoreState();

        // If we're tailing a capped collection, retrieve a monotonically increasing insert
        // counter.
        uint64_t lastInsertCount = 0;
        if (isCursorAwaitData(cursor)) {
            invariant(ctx->getCollection()->isCapped());
            lastInsertCount = ctx->getCollection()->getCappedInsertNotifier()->getCount();
        }

        CursorId respondWithId = 0;
        BSONArrayBuilder nextBatch;
        BSONObj obj;
        PlanExecutor::ExecState state;
        long long numResults = 0;
        Status batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults);
        if (!batchStatus.isOK()) {
            return appendCommandStatus(result, batchStatus);
        }

        // If this is an await data cursor, and we hit EOF without generating any results, then
        // we block waiting for new data to arrive.
        if (isCursorAwaitData(cursor) && state == PlanExecutor::IS_EOF && numResults == 0) {
            // Retrieve the notifier which we will wait on until new data arrives. We make sure
            // to do this in the lock because once we drop the lock it is possible for the
            // collection to become invalid. The notifier itself will outlive the collection if
            // the collection is dropped, as we keep a shared_ptr to it.
            auto notifier = ctx->getCollection()->getCappedInsertNotifier();

            // Save the PlanExecutor and drop our locks.
            exec->saveState();
            ctx.reset();

            // Block waiting for data.
            Microseconds timeout(CurOp::get(txn)->getRemainingMaxTimeMicros());
            notifier->waitForInsert(lastInsertCount, timeout);
            notifier.reset();

            ctx.reset(new AutoGetCollectionForRead(txn, request.nss));
            exec->restoreState();

            // We woke up because either the timed_wait expired, or there was more data. Either
            // way, attempt to generate another batch of results.
            batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults);
            if (!batchStatus.isOK()) {
                return appendCommandStatus(result, batchStatus);
            }
        }

        if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) {
            respondWithId = request.cursorid;

            exec->saveState();
            exec->detachFromOperationContext();

            // If maxTimeMS was set directly on the getMore rather than being rolled over
            // from a previous find, then don't roll remaining micros over to the next
            // getMore.
            if (!hasOwnMaxTime) {
                cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());
            }

            cursor->incPos(numResults);
        } else {
            CurOp::get(txn)->debug().cursorExhausted = true;
        }

        appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result);

        if (respondWithId) {
            cursorFreer.Dismiss();

            // If we are operating on an aggregation cursor, then we dropped our collection lock
            // earlier and need to reacquire it in order to clean up our ClientCursorPin.
            if (cursor->isAggCursor()) {
                invariant(NULL == ctx.get());
                unpinDBLock.reset(new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS));
                unpinCollLock.reset(
                    new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS));
            }
        }

        return true;
    }