Beispiel #1
0
    std::string runQuery(OperationContext* txn,
                         QueryMessage& q,
                         const NamespaceString& nss,
                         CurOp& curop,
                         Message &result) {
        // Validate the namespace.
        uassert(16256, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid());
        invariant(!nss.isCommand());

        // Set curop information.
        beginQueryOp(nss, q.query, q.ntoreturn, q.ntoskip, &curop);

        // Parse the qm into a CanonicalQuery.
        std::auto_ptr<CanonicalQuery> cq;
        {
            CanonicalQuery* cqRaw;
            Status canonStatus = CanonicalQuery::canonicalize(q,
                                                              &cqRaw,
                                                              WhereCallbackReal(txn, nss.db()));
            if (!canonStatus.isOK()) {
                uasserted(17287, str::stream() << "Can't canonicalize query: "
                                               << canonStatus.toString());
            }
            cq.reset(cqRaw);
        }
        invariant(cq.get());

        LOG(5) << "Running query:\n" << cq->toString();
        LOG(2) << "Running query: " << cq->toStringShort();

        // Parse, canonicalize, plan, transcribe, and get a plan executor.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() :
                                                   serverGlobalParams.defaultProfile;

        // We have a parsed query. Time to get the execution plan for it.
        std::unique_ptr<PlanExecutor> exec;
        {
            PlanExecutor* rawExec;
            Status execStatus = getExecutorFind(txn,
                                                collection,
                                                nss,
                                                cq.release(),
                                                PlanExecutor::YIELD_AUTO,
                                                &rawExec);
            uassertStatusOK(execStatus);
            exec.reset(rawExec);
        }
        const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

        // If it's actually an explain, do the explain and return rather than falling through
        // to the normal query execution loop.
        if (pq.isExplain()) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder explainBob;
            Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

            // Add the resulting object to the return buffer.
            BSONObj explainObj = explainBob.obj();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            // Set query result fields.
            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(nss.ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        bool slaveOK = pq.isSlaveOk() || pq.hasReadPref();
        Status serveReadsStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                slaveOK);
        uassertStatusOK(serveReadsStatus);

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult::Value));

        // How many results have we obtained from the executor?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        Timestamp slaveReadTill;

        BSONObj obj;
        PlanExecutor::ExecState state;
        // uint64_t numMisplacedDocs = 0;

        // Get summary info about which plan the executor is using.
        curop.debug().planSummary = Explain::getPlanSummary(exec.get());

        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.isOplogReplay()) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || bsonTimestamp == e.type()) {
                    slaveReadTill = e.timestamp();
                }
            }

            if (enoughForFirstBatch(pq, numResults, bb.len())) {
                LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                break;
            }
        }

        // If we cache the executor later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the executor later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the executor.
        exec->deregisterExec();

        // Caller expects exceptions thrown in certain cases.
        if (PlanExecutor::FAILURE == state) {
            scoped_ptr<PlanStageStats> stats(exec->getStats());
            error() << "Plan executor error, stats: "
                    << Explain::statsToBSON(*stats);
            uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
        }

        // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the
        // chunk belonged on this node are gone. Separating chunk lifetime management from
        // ClientCursor should allow this check to go away.
        if (!shardingState.getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data and its safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(nss.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(nss.ns()));
        }

        // Fill out curop based on query results. If we have a cursorid, we will fill out curop with
        // this cursorid later.
        long long ccId = 0;

        if (shouldSaveCursor(txn, collection, state, exec.get())) {
            // We won't use the executor until it's getMore'd.
            exec->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(collection->getCursorManager(),
                                                exec.release(),
                                                nss.ns(),
                                                pq.getOptions(),
                                                pq.getFilter());
            ccId = cc->cursorid();

            if (txn->getClient()->isInDirectClient()) {
                cc->setUnownedRecoveryUnit(txn->recoveryUnit());
            }
            else if (state == PlanExecutor::IS_EOF && pq.isTailable()) {
                // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their
                // next getMore.
            }
            else {
                // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
                // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
                txn->recoveryUnit()->abandonSnapshot();
                cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
                StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
                invariant(txn->setRecoveryUnit(storageEngine->newRecoveryUnit(),
                                               OperationContext::kNotInUnitOfWork)
                          == OperationContext::kNotInUnitOfWork);
            }

            LOG(5) << "caching executor with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // TODO document
            if (pq.isOplogReplay() && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.isExhaust()) {
                curop.debug().exhaust = true;
            }

            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());

            endQueryOp(cc->getExecutor(), dbProfilingLevel, numResults, ccId, &curop);
        }
        else {
            LOG(5) << "Not caching executor but returning " << numResults << " results.\n";
            endQueryOp(exec.get(), dbProfilingLevel, numResults, ccId, &curop);
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult::View qr = result.header().view2ptr();
        qr.setCursorId(ccId);
        qr.setResultFlagsToOk();
        qr.msgdata().setOperation(opReply);
        qr.setStartingFrom(0);
        qr.setNReturned(numResults);

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? nss.ns() : "";
    }
Beispiel #2
0
    /**
     * Runs a query using the following steps:
     *   --Parsing.
     *   --Acquire locks.
     *   --Plan query, obtaining an executor that can run it.
     *   --Generate the first batch.
     *   --Save state for getMore, transferring ownership of the executor to a ClientCursor.
     *   --Generate response to send to the client.
     */
    bool run(OperationContext* txn,
             const std::string& dbname,
             BSONObj& cmdObj,
             int options,
             std::string& errmsg,
             BSONObjBuilder& result) override {
        const std::string fullns = parseNs(dbname, cmdObj);
        const NamespaceString nss(fullns);
        if (!nss.isValid() || nss.isCommand() || nss.isSpecialCommand()) {
            return appendCommandStatus(result,
                                       {ErrorCodes::InvalidNamespace,
                                        str::stream() << "Invalid collection name: " << nss.ns()});
        }

        // Although it is a command, a find command gets counted as a query.
        globalOpCounters.gotQuery();

        if (txn->getClient()->isInDirectClient()) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()"));
        }

        // Parse the command BSON to a LiteParsedQuery.
        const bool isExplain = false;
        auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain);
        if (!lpqStatus.isOK()) {
            return appendCommandStatus(result, lpqStatus.getStatus());
        }

        auto& lpq = lpqStatus.getValue();

        // Validate term before acquiring locks, if provided.
        if (auto term = lpq->getReplicationTerm()) {
            auto replCoord = repl::ReplicationCoordinator::get(txn);
            Status status = replCoord->updateTerm(txn, *term);
            // Note: updateTerm returns ok if term stayed the same.
            if (!status.isOK()) {
                return appendCommandStatus(result, status);
            }
        }

        // Fill out curop information.
        //
        // We pass negative values for 'ntoreturn' and 'ntoskip' to indicate that these values
        // should be omitted from the log line. Limit and skip information is already present in the
        // find command parameters, so these fields are redundant.
        const int ntoreturn = -1;
        const int ntoskip = -1;
        beginQueryOp(txn, nss, cmdObj, ntoreturn, ntoskip);

        // Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery.
        ExtensionsCallbackReal extensionsCallback(txn, &nss);
        auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), extensionsCallback);
        if (!statusWithCQ.isOK()) {
            return appendCommandStatus(result, statusWithCQ.getStatus());
        }
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        ShardingState* const shardingState = ShardingState::get(txn);

        if (OperationShardVersion::get(txn).hasShardVersion() && shardingState->enabled()) {
            ChunkVersion receivedVersion = OperationShardVersion::get(txn).getShardVersion(nss);
            ChunkVersion latestVersion;
            // Wait for migration completion to get the correct chunk version.
            const int maxTimeoutSec = 30;
            int timeoutSec = cq->getParsed().getMaxTimeMS() / 1000;
            if (!timeoutSec || timeoutSec > maxTimeoutSec) {
                timeoutSec = maxTimeoutSec;
            }

            if (!shardingState->waitTillNotInCriticalSection(timeoutSec)) {
                uasserted(ErrorCodes::LockTimeout, "Timeout while waiting for migration commit");
            }

            // If the received version is newer than the version cached in 'shardingState', then we
            // have to refresh 'shardingState' from the config servers. We do this before acquiring
            // locks so that we don't hold locks while waiting on the network.
            uassertStatusOK(shardingState->refreshMetadataIfNeeded(
                txn, nss.ns(), receivedVersion, &latestVersion));
        }

        // Acquire locks.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        const int dbProfilingLevel =
            ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

        // It is possible that the sharding version will change during yield while we are
        // retrieving a plan executor. If this happens we will throw an error and mongos will
        // retry.
        const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns());

        // Get the execution plan for the query.
        auto statusWithPlanExecutor =
            getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }

        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        if (!collection) {
            // No collection. Just fill out curop indicating that there were zero results and
            // there is no ClientCursor id, and then return.
            const long long numResults = 0;
            const CursorId cursorId = 0;
            endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, cursorId);
            appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result);
            return true;
        }

        const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

        // Stream query results, adding them to a BSONArray as we go.
        BSONArrayBuilder firstBatch;
        BSONObj obj;
        PlanExecutor::ExecState state = PlanExecutor::ADVANCED;
        long long numResults = 0;
        while (!FindCommon::enoughForFirstBatch(pq, numResults, firstBatch.len()) &&
               PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // If adding this object will cause us to exceed the BSON size limit, then we stash
            // it for later.
            if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) {
                exec->enqueue(obj);
                break;
            }

            // Add result to output buffer.
            firstBatch.append(obj);
            numResults++;
        }

        // Throw an assertion if query execution fails for any reason.
        if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
            const std::unique_ptr<PlanStageStats> stats(exec->getStats());
            error() << "Plan executor error during find command: " << PlanExecutor::statestr(state)
                    << ", stats: " << Explain::statsToBSON(*stats);

            return appendCommandStatus(result,
                                       Status(ErrorCodes::OperationFailed,
                                              str::stream()
                                                  << "Executor error during find command: "
                                                  << WorkingSetCommon::toStatusString(obj)));
        }

        // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the
        // chunk belonged on this node are gone. Separating chunk lifetime management from
        // ClientCursor should allow this check to go away.
        if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // Version changed while retrieving a PlanExecutor. Terminate the operation,
            // signaling that mongos should retry.
            throw SendStaleConfigException(nss.ns(),
                                           "version changed during find command",
                                           shardingVersionAtStart,
                                           shardingState->getVersion(nss.ns()));
        }

        // Set up the cursor for getMore.
        CursorId cursorId = 0;
        if (shouldSaveCursor(txn, collection, state, exec.get())) {
            // Register the execution plan inside a ClientCursor. Ownership of the PlanExecutor is
            // transferred to the ClientCursor.
            //
            // First unregister the PlanExecutor so it can be re-registered with ClientCursor.
            exec->deregisterExec();

            // Create a ClientCursor containing this plan executor. We don't have to worry about
            // leaking it as it's inserted into a global map by its ctor.
            ClientCursor* cursor =
                new ClientCursor(collection->getCursorManager(),
                                 exec.release(),
                                 nss.ns(),
                                 txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                                 pq.getOptions(),
                                 pq.getFilter());
            cursorId = cursor->cursorid();

            invariant(!exec);
            PlanExecutor* cursorExec = cursor->getExecutor();

            // State will be restored on getMore.
            cursorExec->saveState();
            cursorExec->detachFromOperationContext();

            cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());
            cursor->setPos(numResults);

            // Fill out curop based on the results.
            endQueryOp(txn, collection, *cursorExec, dbProfilingLevel, numResults, cursorId);
        } else {
            endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, cursorId);
        }

        // Generate the response object to send to the client.
        appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result);
        return true;
    }
Beispiel #3
0
    /**
     * Runs a query using the following steps:
     *   1) Parsing.
     *   2) Acquire locks.
     *   3) Plan query, obtaining an executor that can run it.
     *   4) Setup a cursor for the query, which may be used on subsequent getMores.
     *   5) Generate the first batch.
     *   6) Save state for getMore.
     *   7) Generate response to send to the client.
     *
     * TODO: Rather than using the sharding version available in thread-local storage (i.e. the
     *       call to ShardingState::needCollectionMetadata() below), shard version information
     *       should be passed as part of the command parameter.
     */
    bool run(OperationContext* txn,
             const std::string& dbname,
             BSONObj& cmdObj,
             int options,
             std::string& errmsg,
             BSONObjBuilder& result) override {
        const std::string fullns = parseNs(dbname, cmdObj);
        const NamespaceString nss(fullns);
        if (!nss.isValid()) {
            return appendCommandStatus(result,
                                       {ErrorCodes::InvalidNamespace,
                                        str::stream() << "Invalid collection name: " << nss.ns()});
        }

        // Although it is a command, a find command gets counted as a query.
        globalOpCounters.gotQuery();

        if (txn->getClient()->isInDirectClient()) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()"));
        }

        // 1a) Parse the command BSON to a LiteParsedQuery.
        const bool isExplain = false;
        auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain);
        if (!lpqStatus.isOK()) {
            return appendCommandStatus(result, lpqStatus.getStatus());
        }

        auto& lpq = lpqStatus.getValue();

        // Validate term, if provided.
        if (auto term = lpq->getReplicationTerm()) {
            auto replCoord = repl::ReplicationCoordinator::get(txn);
            Status status = replCoord->updateTerm(*term);
            // Note: updateTerm returns ok if term stayed the same.
            if (!status.isOK()) {
                return appendCommandStatus(result, status);
            }
        }

        // Fill out curop information.
        long long ntoreturn = lpq->getBatchSize().value_or(0);
        beginQueryOp(txn, nss, cmdObj, ntoreturn, lpq->getSkip());

        // 1b) Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery.
        WhereCallbackReal whereCallback(txn, nss.db());
        auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), whereCallback);
        if (!statusWithCQ.isOK()) {
            return appendCommandStatus(result, statusWithCQ.getStatus());
        }
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        // 2) Acquire locks.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        const int dbProfilingLevel =
            ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

        ShardingState* const shardingState = ShardingState::get(txn);

        // It is possible that the sharding version will change during yield while we are
        // retrieving a plan executor. If this happens we will throw an error and mongos will
        // retry.
        const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns());

        // 3) Get the execution plan for the query.
        auto statusWithPlanExecutor =
            getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }

        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        // TODO: Currently, chunk ranges are kept around until all ClientCursors created while
        // the chunk belonged on this node are gone. Separating chunk lifetime management from
        // ClientCursor should allow this check to go away.
        if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // Version changed while retrieving a PlanExecutor. Terminate the operation,
            // signaling that mongos should retry.
            throw SendStaleConfigException(nss.ns(),
                                           "version changed during find command",
                                           shardingVersionAtStart,
                                           shardingState->getVersion(nss.ns()));
        }

        if (!collection) {
            // No collection. Just fill out curop indicating that there were zero results and
            // there is no ClientCursor id, and then return.
            const long long numResults = 0;
            const CursorId cursorId = 0;
            endQueryOp(txn, *exec, dbProfilingLevel, numResults, cursorId);
            appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result);
            return true;
        }

        const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

        // 4) If possible, register the execution plan inside a ClientCursor, and pin that
        // cursor. In this case, ownership of the PlanExecutor is transferred to the
        // ClientCursor, and 'exec' becomes null.
        //
        // First unregister the PlanExecutor so it can be re-registered with ClientCursor.
        exec->deregisterExec();

        // Create a ClientCursor containing this plan executor. We don't have to worry
        // about leaking it as it's inserted into a global map by its ctor.
        ClientCursor* cursor =
            new ClientCursor(collection->getCursorManager(),
                             exec.release(),
                             nss.ns(),
                             txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                             pq.getOptions(),
                             pq.getFilter());
        CursorId cursorId = cursor->cursorid();
        ClientCursorPin ccPin(collection->getCursorManager(), cursorId);

        // On early return, get rid of the the cursor.
        ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin);

        invariant(!exec);
        PlanExecutor* cursorExec = cursor->getExecutor();

        // 5) Stream query results, adding them to a BSONArray as we go.
        BSONArrayBuilder firstBatch;
        BSONObj obj;
        PlanExecutor::ExecState state;
        long long numResults = 0;
        while (!enoughForFirstBatch(pq, numResults, firstBatch.len()) &&
               PlanExecutor::ADVANCED == (state = cursorExec->getNext(&obj, NULL))) {
            // If adding this object will cause us to exceed the BSON size limit, then we stash
            // it for later.
            if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) {
                cursorExec->enqueue(obj);
                break;
            }

            // Add result to output buffer.
            firstBatch.append(obj);
            numResults++;
        }

        // Throw an assertion if query execution fails for any reason.
        if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
            const std::unique_ptr<PlanStageStats> stats(cursorExec->getStats());
            error() << "Plan executor error during find command: " << PlanExecutor::statestr(state)
                    << ", stats: " << Explain::statsToBSON(*stats);

            return appendCommandStatus(result,
                                       Status(ErrorCodes::OperationFailed,
                                              str::stream()
                                                  << "Executor error during find command: "
                                                  << WorkingSetCommon::toStatusString(obj)));
        }

        // 6) Set up the cursor for getMore.
        if (shouldSaveCursor(txn, collection, state, cursorExec)) {
            // State will be restored on getMore.
            cursorExec->saveState();
            cursorExec->detachFromOperationContext();

            cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());
            cursor->setPos(numResults);
        } else {
            cursorId = 0;
        }

        // Fill out curop based on the results.
        endQueryOp(txn, *cursorExec, dbProfilingLevel, numResults, cursorId);

        // 7) Generate the response object to send to the client.
        appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result);
        if (cursorId) {
            cursorFreer.Dismiss();
        }
        return true;
    }
Beispiel #4
0
    /**
     * Runs a query using the following steps:
     *   --Parsing.
     *   --Acquire locks.
     *   --Plan query, obtaining an executor that can run it.
     *   --Generate the first batch.
     *   --Save state for getMore, transferring ownership of the executor to a ClientCursor.
     *   --Generate response to send to the client.
     */
    bool run(OperationContext* txn,
             const std::string& dbname,
             BSONObj& cmdObj,
             int options,
             std::string& errmsg,
             BSONObjBuilder& result) override {
        const NamespaceString nss(parseNs(dbname, cmdObj));
        if (!nss.isValid() || nss.isCommand() || nss.isSpecialCommand()) {
            return appendCommandStatus(result,
                                       {ErrorCodes::InvalidNamespace,
                                        str::stream() << "Invalid collection name: " << nss.ns()});
        }

        // Although it is a command, a find command gets counted as a query.
        globalOpCounters.gotQuery();

        if (txn->getClient()->isInDirectClient()) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()"));
        }

        // Parse the command BSON to a QueryRequest.
        const bool isExplain = false;
        auto qrStatus = QueryRequest::makeFromFindCommand(nss, cmdObj, isExplain);
        if (!qrStatus.isOK()) {
            return appendCommandStatus(result, qrStatus.getStatus());
        }

        auto& qr = qrStatus.getValue();

        // Validate term before acquiring locks, if provided.
        if (auto term = qr->getReplicationTerm()) {
            auto replCoord = repl::ReplicationCoordinator::get(txn);
            Status status = replCoord->updateTerm(txn, *term);
            // Note: updateTerm returns ok if term stayed the same.
            if (!status.isOK()) {
                return appendCommandStatus(result, status);
            }
        }

        // Fill out curop information.
        //
        // We pass negative values for 'ntoreturn' and 'ntoskip' to indicate that these values
        // should be omitted from the log line. Limit and skip information is already present in the
        // find command parameters, so these fields are redundant.
        const int ntoreturn = -1;
        const int ntoskip = -1;
        beginQueryOp(txn, nss, cmdObj, ntoreturn, ntoskip);

        // Finish the parsing step by using the QueryRequest to create a CanonicalQuery.
        ExtensionsCallbackReal extensionsCallback(txn, &nss);
        auto statusWithCQ = CanonicalQuery::canonicalize(txn, std::move(qr), extensionsCallback);
        if (!statusWithCQ.isOK()) {
            return appendCommandStatus(result, statusWithCQ.getStatus());
        }
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        // Acquire locks.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        // Get the execution plan for the query.
        auto statusWithPlanExecutor =
            getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }

        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        {
            stdx::lock_guard<Client>(*txn->getClient());
            CurOp::get(txn)->setPlanSummary_inlock(Explain::getPlanSummary(exec.get()));
        }

        if (!collection) {
            // No collection. Just fill out curop indicating that there were zero results and
            // there is no ClientCursor id, and then return.
            const long long numResults = 0;
            const CursorId cursorId = 0;
            endQueryOp(txn, collection, *exec, numResults, cursorId);
            appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result);
            return true;
        }

        const QueryRequest& originalQR = exec->getCanonicalQuery()->getQueryRequest();

        // Stream query results, adding them to a BSONArray as we go.
        CursorResponseBuilder firstBatch(/*isInitialResponse*/ true, &result);
        BSONObj obj;
        PlanExecutor::ExecState state = PlanExecutor::ADVANCED;
        long long numResults = 0;
        while (!FindCommon::enoughForFirstBatch(originalQR, numResults) &&
               PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // If we can't fit this result inside the current batch, then we stash it for later.
            if (!FindCommon::haveSpaceForNext(obj, numResults, firstBatch.bytesUsed())) {
                exec->enqueue(obj);
                break;
            }

            // Add result to output buffer.
            firstBatch.append(obj);
            numResults++;
        }

        // Throw an assertion if query execution fails for any reason.
        if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
            firstBatch.abandon();
            error() << "Plan executor error during find command: " << PlanExecutor::statestr(state)
                    << ", stats: " << Explain::getWinningPlanStats(exec.get());

            return appendCommandStatus(result,
                                       Status(ErrorCodes::OperationFailed,
                                              str::stream()
                                                  << "Executor error during find command: "
                                                  << WorkingSetCommon::toStatusString(obj)));
        }

        // Before saving the cursor, ensure that whatever plan we established happened with the
        // expected collection version
        auto css = CollectionShardingState::get(txn, nss);
        css->checkShardVersionOrThrow(txn);

        // Set up the cursor for getMore.
        CursorId cursorId = 0;
        if (shouldSaveCursor(txn, collection, state, exec.get())) {
            // Register the execution plan inside a ClientCursor. Ownership of the PlanExecutor is
            // transferred to the ClientCursor.
            //
            // First unregister the PlanExecutor so it can be re-registered with ClientCursor.
            exec->deregisterExec();

            // Create a ClientCursor containing this plan executor. We don't have to worry about
            // leaking it as it's inserted into a global map by its ctor.
            ClientCursor* cursor =
                new ClientCursor(collection->getCursorManager(),
                                 exec.release(),
                                 nss.ns(),
                                 txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                                 originalQR.getOptions(),
                                 cmdObj.getOwned());
            cursorId = cursor->cursorid();

            invariant(!exec);
            PlanExecutor* cursorExec = cursor->getExecutor();

            // State will be restored on getMore.
            cursorExec->saveState();
            cursorExec->detachFromOperationContext();

            cursor->setLeftoverMaxTimeMicros(txn->getRemainingMaxTimeMicros());
            cursor->setPos(numResults);

            // Fill out curop based on the results.
            endQueryOp(txn, collection, *cursorExec, numResults, cursorId);
        } else {
            endQueryOp(txn, collection, *exec, numResults, cursorId);
        }

        // Generate the response object to send to the client.
        firstBatch.done(cursorId, nss.ns());
        return true;
    }