Example #1
    static void handleCursorCommand(CursorId id, BSONObj& cmdObj, BSONObjBuilder& result) {
        BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
        const long long batchSize = batchSizeElem.isNumber()
                                    ? batchSizeElem.numberLong()
                                    : 101; // same as query

        ClientCursorPin pin(id);
        ClientCursor* cursor = pin.c();

        massert(16958, "Cursor shouldn't have been deleted",
                cursor);

        verify(cursor->isAggCursor);
        PipelineRunner* runner = dynamic_cast<PipelineRunner*>(cursor->getRunner());
        verify(runner);
        try {
            const string cursorNs = cursor->ns(); // saved now, since the cursor may be deleted before this is used

            // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
            BSONArrayBuilder resultsArray;
            const int byteLimit = MaxBytesToReturnToClientAtOnce;
            BSONObj next;
            for (int objCount = 0; objCount < batchSize; objCount++) {
                // The initial getNext() on a PipelineRunner may be very expensive so we don't do it
                // when batchSize is 0 since that indicates a desire for a fast return.
                if (runner->getNext(&next, NULL) != Runner::RUNNER_ADVANCED) {
                    pin.deleteUnderlying();
                    id = 0;
                    cursor = NULL; // make it an obvious error to use cursor after this point
                    break;
                }

                if (resultsArray.len() + next.objsize() > byteLimit) {
                    // too big. next will be the first doc in the second batch
                    runner->pushBack(next);
                    break;
                }

                resultsArray.append(next);
            }

            if (cursor) {
                // If a time limit was set on the pipeline, remaining time is "rolled over" to the
                // cursor (for use by future getmore ops).
                cursor->setLeftoverMaxTimeMicros( cc().curop()->getRemainingMaxTimeMicros() );
            }

            BSONObjBuilder cursorObj(result.subobjStart("cursor"));
            cursorObj.append("id", id);
            cursorObj.append("ns", cursorNs);
            cursorObj.append("firstBatch", resultsArray.arr());
            cursorObj.done();
        }
        catch (...) {
            // Clean up cursor on way out of scope.
            pin.deleteUnderlying();
            throw;
        }
    }
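The batching pattern above generalizes: fill the batch until either the document count or the byte budget is reached, and push the first overflowing document back into the source so it leads the next batch. A minimal standalone sketch of that loop, with std::string standing in for BSONObj and a hypothetical Runner standing in for PipelineRunner:

    #include <deque>
    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    // Hypothetical stand-in for a PipelineRunner: yields documents and
    // accepts push-back of a document that didn't fit in the batch.
    struct Runner {
        std::deque<std::string> docs;
        std::optional<std::string> getNext() {
            if (docs.empty())
                return std::nullopt;
            std::string d = docs.front();
            docs.pop_front();
            return d;
        }
        void pushBack(std::string d) { docs.push_front(std::move(d)); }
    };

    // Fill one batch: at most batchSize docs and roughly byteLimit bytes.
    // The first doc that would overflow the byte budget is pushed back so
    // it becomes the first doc of the next batch.
    std::vector<std::string> fillBatch(Runner& r, int batchSize, size_t byteLimit) {
        std::vector<std::string> batch;
        size_t bytes = 0;
        for (int i = 0; i < batchSize; i++) {
            auto next = r.getNext();
            if (!next)
                break;  // exhausted: the caller would kill the cursor here
            if (bytes + next->size() > byteLimit) {
                r.pushBack(std::move(*next));  // too big: save for next batch
                break;
            }
            bytes += next->size();
            batch.push_back(std::move(*next));
        }
        return batch;
    }

    int main() {
        Runner r{{"aa", "bbbb", "cccccc"}};
        for (auto& d : fillBatch(r, 101, 6))
            std::cout << d << '\n';  // prints "aa" and "bbbb"
    }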
Example #2
    bool Pipeline::run(BSONObjBuilder &result, string &errmsg) {
        massert(16600, "should not have an empty pipeline",
                !sources.empty());

        /* chain together the sources we found */
        DocumentSource* prevSource = sources.front().get();
        for(SourceContainer::iterator iter(sources.begin() + 1),
                                      listEnd(sources.end());
                                    iter != listEnd;
                                    ++iter) {
            intrusive_ptr<DocumentSource> pTemp(*iter);
            pTemp->setSource(prevSource);
            prevSource = pTemp.get();
        }

        /*
          Iterate through the resulting documents, and add them to the result.
          We do this even if we're doing an explain, in order to capture
          the document counts and other stats.  However, we don't capture
          the result documents for explain.
        */
        if (explain) {
            if (!pCtx->getInRouter())
                writeExplainShard(result);
            else {
                writeExplainMongos(result);
            }
        }
        else {
            // the array in which the aggregation results reside
            // can't use subArrayStart() due to error handling
            BSONArrayBuilder resultArray;
            DocumentSource* finalSource = sources.back().get();
            for(bool hasDoc = !finalSource->eof(); hasDoc; hasDoc = finalSource->advance()) {
                Document pDocument(finalSource->getCurrent());

                /* add the document to the result set */
                BSONObjBuilder documentBuilder(resultArray.subobjStart());
                pDocument->toBson(&documentBuilder);
                documentBuilder.doneFast();
                // Assert if the result would be too large; the extra 1KB is headroom for headers.
                uassert(16389,
                        str::stream() << "aggregation result exceeds maximum document size ("
                                      << BSONObjMaxUserSize / (1024 * 1024) << "MB)",
                        resultArray.len() < BSONObjMaxUserSize - 1024);
            }

            resultArray.done();
            result.appendArray("result", resultArray.arr());
        }

        return true;
    }
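The chaining loop at the top of Pipeline::run wires each stage to its predecessor, so pulling on the back of the chain drains the whole pipeline. A sketch of that linked-stage idea under a hypothetical Stage interface (not MongoDB's DocumentSource, which also carries eof()/advance()/getCurrent()):

    #include <iostream>
    #include <memory>
    #include <optional>
    #include <vector>

    // Hypothetical minimal pull-based stage interface.
    struct Stage {
        Stage* source = nullptr;  // previous stage in the chain
        void setSource(Stage* s) { source = s; }
        virtual std::optional<int> getNext() = 0;
        virtual ~Stage() = default;
    };

    struct Values : Stage {  // head stage: emits fixed values
        std::vector<int> v{1, 2, 3, 4};
        size_t i = 0;
        std::optional<int> getNext() override {
            return i < v.size() ? std::optional<int>(v[i++]) : std::nullopt;
        }
    };

    struct Double : Stage {  // transform stage: pulls from its source
        std::optional<int> getNext() override {
            auto n = source->getNext();
            return n ? std::optional<int>(*n * 2) : std::nullopt;
        }
    };

    int main() {
        std::vector<std::unique_ptr<Stage>> stages;
        stages.push_back(std::make_unique<Values>());
        stages.push_back(std::make_unique<Double>());
        stages.push_back(std::make_unique<Double>());

        // Same chaining loop as Pipeline::run: stage i reads stage i-1.
        for (size_t i = 1; i < stages.size(); i++)
            stages[i]->setSource(stages[i - 1].get());

        // Pulling on the back stage drains the whole chain: 4 8 12 16
        while (auto n = stages.back()->getNext())
            std::cout << *n << ' ';
    }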
Example #3
    static void handleCursorCommand(CursorId id, BSONObj& cmdObj, BSONObjBuilder& result) {
        BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
        const long long batchSize = batchSizeElem.isNumber()
                                    ? batchSizeElem.numberLong()
                                    : 101; // same as query

        // Using limited cursor API that ignores many edge cases. Should be sufficient for commands.
        ClientCursor::Pin pin(id);
        ClientCursor* cursor = pin.c();

        massert(16958, "Cursor shouldn't have been deleted",
                cursor);

        // Make sure this cursor won't disappear on us
        fassert(16959, !cursor->c()->shouldDestroyOnNSDeletion());
        fassert(16960, !cursor->c()->requiresLock());

        try {
            // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
            BSONArrayBuilder resultsArray;
            const int byteLimit = MaxBytesToReturnToClientAtOnce;
            for (int objs = 0;
                    objs < batchSize && cursor->ok() && resultsArray.len() <= byteLimit;
                    objs++) {
                // TODO may need special logic if cursor->current() would cause results to be > 16MB
                resultsArray.append(cursor->current());
                cursor->advance();
            }

            // The initial ok() on a cursor may be very expensive so we don't do it when batchSize
            // is 0 since that indicates a desire for a fast return.
            if (batchSize != 0 && !cursor->ok()) {
                // There is no more data. Kill the cursor.
                pin.release();
                ClientCursor::erase(id);
                id = 0;
            }

            BSONObjBuilder cursorObj(result.subobjStart("cursor"));
            cursorObj.append("id", id);
            cursorObj.append("ns", cursor->ns());
            cursorObj.append("firstBatch", resultsArray.arr());
            cursorObj.done();
        }
        catch (...) {
            // Clean up cursor on way out of scope.
            pin.release();
            ClientCursor::erase(id);
            throw;
        }
    }
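Both early variants of handleCursorCommand rely on the catch-all / clean-up / rethrow idiom to guarantee the pinned cursor is erased if building the batch throws. Stripped of the MongoDB types, the idiom is just this (releasePin and eraseCursor are hypothetical stand-ins for pin.release() and ClientCursor::erase()):

    #include <iostream>
    #include <stdexcept>

    // Hypothetical stand-ins for pin.release() and ClientCursor::erase(id).
    void releasePin() { std::cout << "pin released\n"; }
    void eraseCursor() { std::cout << "cursor erased\n"; }

    void buildBatchOrThrow() { throw std::runtime_error("batch failed"); }

    void handle() {
        try {
            buildBatchOrThrow();
        } catch (...) {
            // Clean up the cursor on the way out of scope, then let the
            // original exception keep propagating.
            releasePin();
            eraseCursor();
            throw;
        }
    }

    int main() {
        try {
            handle();
        } catch (const std::exception& e) {
            std::cout << "caught: " << e.what() << '\n';
        }
    }

A RAII guard, such as the ScopeGuard used by the find command in Example #10, achieves the same cleanup without an explicit catch block.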
Example #4
void Pipeline::run(BSONObjBuilder& result) {
    // We should not get here in the explain case.
    verify(!pCtx->isExplain);

    // the array in which the aggregation results reside
    // can't use subArrayStart() due to error handling
    BSONArrayBuilder resultArray;
    while (auto next = getNext()) {
        // Add the document to the result set.
        BSONObjBuilder documentBuilder(resultArray.subobjStart());
        next->toBson(&documentBuilder);
        documentBuilder.doneFast();
        // Assert if the result would be too large; the extra 1KB is headroom for headers.
        uassert(16389,
                str::stream() << "aggregation result exceeds maximum document size ("
                              << BSONObjMaxUserSize / (1024 * 1024)
                              << "MB)",
                resultArray.len() < BSONObjMaxUserSize - 1024);
    }

    resultArray.done();
    result.appendArray("result", resultArray.arr());
}
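The loop condition here leans on getNext() returning an optional document that converts to false at end-of-stream, which collapses the drain loop to a single line. The same shape with std::optional and a hypothetical integer source:

    #include <iostream>
    #include <optional>
    #include <vector>

    // Hypothetical stand-in for Pipeline::getNext(): an empty optional
    // signals end-of-stream.
    std::optional<int> getNext() {
        static std::vector<int> docs{10, 20, 30};
        static size_t i = 0;
        if (i == docs.size())
            return std::nullopt;
        return docs[i++];
    }

    int main() {
        // The optional is both the loop condition and the current document.
        while (auto next = getNext())
            std::cout << *next << '\n';
    }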
Example #5
    void Pipeline::run(BSONObjBuilder& result) {
        /*
          Iterate through the resulting documents, and add them to the result.
          We do this even if we're doing an explain, in order to capture
          the document counts and other stats.  However, we don't capture
          the result documents for explain.
        */
        if (explain) {
            if (!pCtx->getInRouter())
                writeExplainShard(result);
            else {
                writeExplainMongos(result);
            }
        }
        else {
            // the array in which the aggregation results reside
            // can't use subArrayStart() due to error handling
            BSONArrayBuilder resultArray;
            DocumentSource* finalSource = sources.back().get();
            for (bool hasDoc = !finalSource->eof(); hasDoc; hasDoc = finalSource->advance()) {
                Document pDocument(finalSource->getCurrent());

                /* add the document to the result set */
                BSONObjBuilder documentBuilder(resultArray.subobjStart());
                pDocument->toBson(&documentBuilder);
                documentBuilder.doneFast();
                // Assert if the result would be too large; the extra 1KB is headroom for headers.
                uassert(16389,
                        str::stream() << "aggregation result exceeds maximum document size ("
                                      << BSONObjMaxUserSize / (1024 * 1024) << "MB)",
                        resultArray.len() < BSONObjMaxUserSize - 1024);
            }

            resultArray.done();
            result.appendArray("result", resultArray.arr());
        }
    }
Example #6
/**
 * Returns true if we need to keep a ClientCursor saved for this pipeline (for future getMore
 * requests).  Otherwise, returns false.
 */
static bool handleCursorCommand(OperationContext* txn,
                                const string& ns,
                                ClientCursorPin* pin,
                                PlanExecutor* exec,
                                const BSONObj& cmdObj,
                                BSONObjBuilder& result) {
    ClientCursor* cursor = pin ? pin->c() : NULL;
    if (pin) {
        invariant(cursor);
        invariant(cursor->getExecutor() == exec);
        invariant(cursor->isAggCursor());
    }

    const long long defaultBatchSize = 101;  // Same as query.
    long long batchSize;
    uassertStatusOK(Command::parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize));

    // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
    BSONArrayBuilder resultsArray;
    BSONObj next;
    for (int objCount = 0; objCount < batchSize; objCount++) {
        // The initial getNext() on a PipelineProxyStage may be very expensive so we don't
        // do it when batchSize is 0 since that indicates a desire for a fast return.
        PlanExecutor::ExecState state;
        if ((state = exec->getNext(&next, NULL)) == PlanExecutor::IS_EOF) {
            // make it an obvious error to use cursor or executor after this point
            cursor = NULL;
            exec = NULL;
            break;
        }

        uassert(34426,
                "Plan executor error during aggregation: " + WorkingSetCommon::toStatusString(next),
                PlanExecutor::ADVANCED == state);

        // If adding this object will cause us to exceed the message size limit, then we stash it
        // for later.
        if (!FindCommon::haveSpaceForNext(next, objCount, resultsArray.len())) {
            exec->enqueue(next);
            break;
        }

        resultsArray.append(next);
    }

    // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should
    // be relatively quick since if there was no pin then the input is empty. Also, this
    // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that
    // case. This is ok for now however, since you can't have a sharded collection that doesn't
    // exist.
    const bool canReturnMoreBatches = pin;
    if (!canReturnMoreBatches && exec && !exec->isEOF()) {
        // msgasserting since this shouldn't be possible to trigger from today's aggregation
        // language. The wording assumes that the only reason pin would be null is if the
        // collection doesn't exist.
        msgasserted(
            17391,
            str::stream() << "Aggregation has more results than fit in initial batch, but can't "
                          << "create cursor since collection " << ns << " doesn't exist");
    }

    if (cursor) {
        // If a time limit was set on the pipeline, remaining time is "rolled over" to the
        // cursor (for use by future getmore ops).
        cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());

        CurOp::get(txn)->debug().cursorid = cursor->cursorid();

        // Cursor needs to be in a saved state while we yield locks for getmore. State
        // will be restored in getMore().
        exec->saveState();
        exec->detachFromOperationContext();
    }

    const long long cursorId = cursor ? cursor->cursorid() : 0LL;
    appendCursorResponseObject(cursorId, ns, resultsArray.arr(), &result);

    return static_cast<bool>(cursor);
}
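Every variant of this handler ends by emitting the same response shape: a cursor subdocument carrying id, ns, and firstBatch, where an id of 0 tells the client the cursor is exhausted. A rough sketch that renders that shape as JSON text (a stand-in for appendCursorResponseObject, with documents as pre-serialized strings):

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    // Render the {cursor: {id, ns, firstBatch}} shape used by cursor
    // commands; id == 0 tells the client there are no more batches.
    std::string cursorResponse(long long id, const std::string& ns,
                               const std::vector<std::string>& firstBatch) {
        std::ostringstream out;
        out << "{ \"cursor\": { \"id\": " << id << ", \"ns\": \"" << ns
            << "\", \"firstBatch\": [";
        for (size_t i = 0; i < firstBatch.size(); i++)
            out << (i ? ", " : "") << firstBatch[i];
        out << "] } }";
        return out.str();
    }

    int main() {
        std::cout << cursorResponse(0, "test.coll", {"{\"a\": 1}", "{\"a\": 2}"})
                  << '\n';
    }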
Example #7
    bool run(OperationContext* txn,
             const string& dbname,
             BSONObj& cmdObj,
             int,
             string& errmsg,
             BSONObjBuilder& result) {
        BSONElement first = cmdObj.firstElement();
        uassert(28528,
                str::stream() << "Argument to listIndexes must be of type String, not "
                              << typeName(first.type()),
                first.type() == String);
        StringData collectionName = first.valueStringData();
        uassert(28529,
                str::stream() << "Argument to listIndexes must be a collection name, "
                              << "not the empty string",
                !collectionName.empty());
        const NamespaceString ns(dbname, collectionName);

        const long long defaultBatchSize = std::numeric_limits<long long>::max();
        long long batchSize;
        Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize);
        if (!parseCursorStatus.isOK()) {
            return appendCommandStatus(result, parseCursorStatus);
        }

        AutoGetCollectionForRead autoColl(txn, ns);
        if (!autoColl.getDb()) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no database"));
        }

        const Collection* collection = autoColl.getCollection();
        if (!collection) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no collection"));
        }

        const CollectionCatalogEntry* cce = collection->getCatalogEntry();
        invariant(cce);

        vector<string> indexNames;
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            indexNames.clear();
            cce->getAllIndexes(txn, &indexNames);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

        std::unique_ptr<WorkingSet> ws(new WorkingSet());
        std::unique_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get()));

        for (size_t i = 0; i < indexNames.size(); i++) {
            BSONObj indexSpec;
            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                indexSpec = cce->getIndexSpec(txn, indexNames[i]);
            }
            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

            WorkingSetID id = ws->allocate();
            WorkingSetMember* member = ws->get(id);
            member->keyData.clear();
            member->loc = RecordId();
            member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned());
            member->transitionToOwnedObj();
            root->pushBack(id);
        }

        std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "."
                                                    << ns.coll();
        dassert(NamespaceString(cursorNamespace).isValid());
        dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS());
        dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes());

        auto statusWithPlanExecutor = PlanExecutor::make(
            txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }
        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        BSONArrayBuilder firstBatch;

        const int byteLimit = MaxBytesToReturnToClientAtOnce;
        for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit;
             objCount++) {
            BSONObj next;
            PlanExecutor::ExecState state = exec->getNext(&next, NULL);
            if (state == PlanExecutor::IS_EOF) {
                break;
            }
            invariant(state == PlanExecutor::ADVANCED);
            firstBatch.append(next);
        }

        CursorId cursorId = 0LL;
        if (!exec->isEOF()) {
            exec->saveState();
            ClientCursor* cursor = new ClientCursor(
                CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace);
            cursorId = cursor->cursorid();
        }

        appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);

        return true;
    }
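listIndexes (and listCollections below) never runs a real query plan: every row is materialized up front into a QueuedDataStage, then drained through the ordinary executor and batching machinery. A standalone sketch of that pre-materialized queue, using hypothetical names and std::string rows:

    #include <iostream>
    #include <queue>
    #include <string>

    // Hypothetical stand-in for a QueuedDataStage: rows are pushed up
    // front, then drained through the same getNext() shape a real plan
    // stage would expose.
    class QueuedStage {
    public:
        void pushBack(std::string row) { rows_.push(std::move(row)); }
        bool getNext(std::string* out) {
            if (rows_.empty())
                return false;  // IS_EOF
            *out = std::move(rows_.front());
            rows_.pop();
            return true;  // ADVANCED
        }

    private:
        std::queue<std::string> rows_;
    };

    int main() {
        QueuedStage stage;
        for (auto& spec : {"{ name: \"_id_\" }", "{ name: \"a_1\" }"})
            stage.pushBack(spec);  // materialize all rows up front

        std::string row;
        while (stage.getNext(&row))  // drained like any other stage
            std::cout << row << '\n';
    }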
Example #8
    bool run(OperationContext* txn,
             const string& dbname,
             BSONObj& jsobj,
             int,
             string& errmsg,
             BSONObjBuilder& result) {
        unique_ptr<MatchExpression> matcher;
        BSONElement filterElt = jsobj["filter"];
        if (!filterElt.eoo()) {
            if (filterElt.type() != mongo::Object) {
                return appendCommandStatus(
                    result, Status(ErrorCodes::BadValue, "\"filter\" must be an object"));
            }
            StatusWithMatchExpression statusWithMatcher =
                MatchExpressionParser::parse(filterElt.Obj());
            if (!statusWithMatcher.isOK()) {
                return appendCommandStatus(result, statusWithMatcher.getStatus());
            }
            matcher = std::move(statusWithMatcher.getValue());
        }

        const long long defaultBatchSize = std::numeric_limits<long long>::max();
        long long batchSize;
        Status parseCursorStatus = parseCommandCursorOptions(jsobj, defaultBatchSize, &batchSize);
        if (!parseCursorStatus.isOK()) {
            return appendCommandStatus(result, parseCursorStatus);
        }

        ScopedTransaction scopedXact(txn, MODE_IS);
        AutoGetDb autoDb(txn, dbname, MODE_S);

        const Database* d = autoDb.getDb();
        const DatabaseCatalogEntry* dbEntry = NULL;

        list<string> names;
        if (d) {
            dbEntry = d->getDatabaseCatalogEntry();
            dbEntry->getCollectionNamespaces(&names);
            names.sort();
        }

        auto ws = make_unique<WorkingSet>();
        auto root = make_unique<QueuedDataStage>(txn, ws.get());

        for (std::list<std::string>::const_iterator i = names.begin(); i != names.end(); ++i) {
            const std::string& ns = *i;

            StringData collection = nsToCollectionSubstring(ns);
            if (collection == "system.namespaces") {
                continue;
            }

            BSONObjBuilder b;
            b.append("name", collection);

            CollectionOptions options =
                dbEntry->getCollectionCatalogEntry(ns)->getCollectionOptions(txn);
            b.append("options", options.toBSON());

            BSONObj maybe = b.obj();
            if (matcher && !matcher->matchesBSON(maybe)) {
                continue;
            }

            WorkingSetID id = ws->allocate();
            WorkingSetMember* member = ws->get(id);
            member->keyData.clear();
            member->loc = RecordId();
            member->obj = Snapshotted<BSONObj>(SnapshotId(), maybe);
            member->transitionToOwnedObj();
            root->pushBack(id);
        }

        std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name;
        dassert(NamespaceString(cursorNamespace).isValid());
        dassert(NamespaceString(cursorNamespace).isListCollectionsCursorNS());

        auto statusWithPlanExecutor = PlanExecutor::make(
            txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }
        unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        BSONArrayBuilder firstBatch;

        const int byteLimit = FindCommon::kMaxBytesToReturnToClientAtOnce;
        for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit;
             objCount++) {
            BSONObj next;
            PlanExecutor::ExecState state = exec->getNext(&next, NULL);
            if (state == PlanExecutor::IS_EOF) {
                break;
            }
            invariant(state == PlanExecutor::ADVANCED);
            firstBatch.append(next);
        }

        CursorId cursorId = 0LL;
        if (!exec->isEOF()) {
            exec->saveState();
            exec->detachFromOperationContext();
            ClientCursor* cursor =
                new ClientCursor(CursorManager::getGlobalCursorManager(),
                                 exec.release(),
                                 cursorNamespace,
                                 txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot());
            cursorId = cursor->cursorid();
        }

        appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);

        return true;
    }
Example #9
    bool run(OperationContext* txn,
             const string& dbname,
             BSONObj& jsobj,
             int,
             string& errmsg,
             BSONObjBuilder& result,
             bool /*fromRepl*/) {
        boost::scoped_ptr<MatchExpression> matcher;
        BSONElement filterElt = jsobj["filter"];
        if (!filterElt.eoo()) {
            if (filterElt.type() != mongo::Object) {
                return appendCommandStatus(result,
                                           Status(ErrorCodes::TypeMismatch,
                                                  str::stream()
                                                      << "\"filter\" must be of type Object, not "
                                                      << typeName(filterElt.type())));
            }
            StatusWithMatchExpression statusWithMatcher =
                MatchExpressionParser::parse(filterElt.Obj());
            if (!statusWithMatcher.isOK()) {
                return appendCommandStatus(result, statusWithMatcher.getStatus());
            }
            matcher.reset(statusWithMatcher.getValue());
        }

        const long long defaultBatchSize = std::numeric_limits<long long>::max();
        long long batchSize;
        Status parseCursorStatus = parseCommandCursorOptions(jsobj, defaultBatchSize, &batchSize);
        if (!parseCursorStatus.isOK()) {
            return appendCommandStatus(result, parseCursorStatus);
        }

        ScopedTransaction scopedXact(txn, MODE_IS);
        AutoGetDb autoDb(txn, dbname, MODE_S);

        const Database* d = autoDb.getDb();
        const DatabaseCatalogEntry* dbEntry = NULL;

        list<string> names;
        if (d) {
            dbEntry = d->getDatabaseCatalogEntry();
            dbEntry->getCollectionNamespaces(&names);
            names.sort();
        }

        std::auto_ptr<WorkingSet> ws(new WorkingSet());
        std::auto_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get()));

        for (std::list<std::string>::const_iterator i = names.begin(); i != names.end(); ++i) {
            const std::string& ns = *i;

            StringData collection = nsToCollectionSubstring(ns);
            if (collection == "system.namespaces") {
                continue;
            }

            BSONObjBuilder b;
            b.append("name", collection);

            CollectionOptions options =
                dbEntry->getCollectionCatalogEntry(ns)->getCollectionOptions(txn);
            b.append("options", options.toBSON());

            BSONObj maybe = b.obj();
            if (matcher && !matcher->matchesBSON(maybe)) {
                continue;
            }

            WorkingSetMember member;
            member.state = WorkingSetMember::OWNED_OBJ;
            member.keyData.clear();
            member.loc = RecordId();
            member.obj = Snapshotted<BSONObj>(SnapshotId(), maybe);
            root->pushBack(member);
        }

        std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name;
        dassert(NamespaceString(cursorNamespace).isValid());
        dassert(NamespaceString(cursorNamespace).isListCollectionsGetMore());

        PlanExecutor* rawExec;
        Status makeStatus = PlanExecutor::make(txn,
                                               ws.release(),
                                               root.release(),
                                               cursorNamespace,
                                               PlanExecutor::YIELD_MANUAL,
                                               &rawExec);
        std::auto_ptr<PlanExecutor> exec(rawExec);
        if (!makeStatus.isOK()) {
            return appendCommandStatus(result, makeStatus);
        }

        BSONArrayBuilder firstBatch;

        const int byteLimit = MaxBytesToReturnToClientAtOnce;
        for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit;
             objCount++) {
            BSONObj next;
            PlanExecutor::ExecState state = exec->getNext(&next, NULL);
            if (state == PlanExecutor::IS_EOF) {
                break;
            }
            invariant(state == PlanExecutor::ADVANCED);
            firstBatch.append(next);
        }

        CursorId cursorId = 0LL;
        if (!exec->isEOF()) {
            exec->saveState();
            ClientCursor* cursor = new ClientCursor(
                CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace);
            cursorId = cursor->cursorid();
        }

        Command::appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);

        return true;
    }
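All three list commands parse cursor options identically: honor cursor.batchSize when present and valid, otherwise fall back to a default (numeric_limits max here, 101 for find and aggregate). A hedged sketch of that parse step, flattening the nested option into a map key for brevity (parseBatchSize is a hypothetical stand-in for parseCommandCursorOptions):

    #include <iostream>
    #include <limits>
    #include <map>
    #include <string>

    // Hypothetical stand-in for parseCommandCursorOptions: use the
    // supplied batchSize if present, reject negatives, otherwise fall
    // back to the command's default.
    bool parseBatchSize(const std::map<std::string, long long>& cmd,
                        long long defaultBatchSize, long long* batchSize,
                        std::string* errmsg) {
        auto it = cmd.find("cursor.batchSize");
        if (it == cmd.end()) {
            *batchSize = defaultBatchSize;
            return true;
        }
        if (it->second < 0) {
            *errmsg = "batchSize must be non-negative";
            return false;
        }
        *batchSize = it->second;
        return true;
    }

    int main() {
        long long batchSize;
        std::string errmsg;
        std::map<std::string, long long> cmd{{"cursor.batchSize", 5}};
        if (parseBatchSize(cmd, std::numeric_limits<long long>::max(),
                           &batchSize, &errmsg))
            std::cout << "batchSize = " << batchSize << '\n';  // 5
    }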
Example #10
    /**
     * Runs a query using the following steps:
     *   1) Parsing.
     *   2) Acquire locks.
     *   3) Plan query, obtaining an executor that can run it.
     *   4) Setup a cursor for the query, which may be used on subsequent getMores.
     *   5) Generate the first batch.
     *   6) Save state for getMore.
     *   7) Generate response to send to the client.
     *
     * TODO: Rather than using the sharding version available in thread-local storage (i.e. the
     *       call to ShardingState::needCollectionMetadata() below), shard version information
     *       should be passed as part of the command parameter.
     */
    bool run(OperationContext* txn,
             const std::string& dbname,
             BSONObj& cmdObj,
             int options,
             std::string& errmsg,
             BSONObjBuilder& result) override {
        const std::string fullns = parseNs(dbname, cmdObj);
        const NamespaceString nss(fullns);
        if (!nss.isValid()) {
            return appendCommandStatus(result,
                                       {ErrorCodes::InvalidNamespace,
                                        str::stream() << "Invalid collection name: " << nss.ns()});
        }

        // Although it is a command, a find command gets counted as a query.
        globalOpCounters.gotQuery();

        if (txn->getClient()->isInDirectClient()) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()"));
        }

        // 1a) Parse the command BSON to a LiteParsedQuery.
        const bool isExplain = false;
        auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain);
        if (!lpqStatus.isOK()) {
            return appendCommandStatus(result, lpqStatus.getStatus());
        }

        auto& lpq = lpqStatus.getValue();

        // Validate term, if provided.
        if (auto term = lpq->getReplicationTerm()) {
            auto replCoord = repl::ReplicationCoordinator::get(txn);
            Status status = replCoord->updateTerm(*term);
            // Note: updateTerm returns ok if term stayed the same.
            if (!status.isOK()) {
                return appendCommandStatus(result, status);
            }
        }

        // Fill out curop information.
        long long ntoreturn = lpq->getBatchSize().value_or(0);
        beginQueryOp(txn, nss, cmdObj, ntoreturn, lpq->getSkip());

        // 1b) Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery.
        WhereCallbackReal whereCallback(txn, nss.db());
        auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), whereCallback);
        if (!statusWithCQ.isOK()) {
            return appendCommandStatus(result, statusWithCQ.getStatus());
        }
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        // 2) Acquire locks.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        const int dbProfilingLevel =
            ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

        ShardingState* const shardingState = ShardingState::get(txn);

        // It is possible that the sharding version will change during yield while we are
        // retrieving a plan executor. If this happens we will throw an error and mongos will
        // retry.
        const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns());

        // 3) Get the execution plan for the query.
        auto statusWithPlanExecutor =
            getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }

        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        // TODO: Currently, chunk ranges are kept around until all ClientCursors created while
        // the chunk belonged on this node are gone. Separating chunk lifetime management from
        // ClientCursor should allow this check to go away.
        if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // Version changed while retrieving a PlanExecutor. Terminate the operation,
            // signaling that mongos should retry.
            throw SendStaleConfigException(nss.ns(),
                                           "version changed during find command",
                                           shardingVersionAtStart,
                                           shardingState->getVersion(nss.ns()));
        }

        if (!collection) {
            // No collection. Just fill out curop indicating that there were zero results and
            // there is no ClientCursor id, and then return.
            const long long numResults = 0;
            const CursorId cursorId = 0;
            endQueryOp(txn, *exec, dbProfilingLevel, numResults, cursorId);
            appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result);
            return true;
        }

        const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

        // 4) If possible, register the execution plan inside a ClientCursor, and pin that
        // cursor. In this case, ownership of the PlanExecutor is transferred to the
        // ClientCursor, and 'exec' becomes null.
        //
        // First unregister the PlanExecutor so it can be re-registered with ClientCursor.
        exec->deregisterExec();

        // Create a ClientCursor containing this plan executor. We don't have to worry
        // about leaking it as it's inserted into a global map by its ctor.
        ClientCursor* cursor =
            new ClientCursor(collection->getCursorManager(),
                             exec.release(),
                             nss.ns(),
                             txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                             pq.getOptions(),
                             pq.getFilter());
        CursorId cursorId = cursor->cursorid();
        ClientCursorPin ccPin(collection->getCursorManager(), cursorId);

        // On early return, get rid of the cursor.
        ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin);

        invariant(!exec);
        PlanExecutor* cursorExec = cursor->getExecutor();

        // 5) Stream query results, adding them to a BSONArray as we go.
        BSONArrayBuilder firstBatch;
        BSONObj obj;
        PlanExecutor::ExecState state;
        long long numResults = 0;
        while (!enoughForFirstBatch(pq, numResults, firstBatch.len()) &&
               PlanExecutor::ADVANCED == (state = cursorExec->getNext(&obj, NULL))) {
            // If adding this object will cause us to exceed the BSON size limit, then we stash
            // it for later.
            if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) {
                cursorExec->enqueue(obj);
                break;
            }

            // Add result to output buffer.
            firstBatch.append(obj);
            numResults++;
        }

        // Throw an assertion if query execution fails for any reason.
        if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
            const std::unique_ptr<PlanStageStats> stats(cursorExec->getStats());
            error() << "Plan executor error during find command: " << PlanExecutor::statestr(state)
                    << ", stats: " << Explain::statsToBSON(*stats);

            return appendCommandStatus(result,
                                       Status(ErrorCodes::OperationFailed,
                                              str::stream()
                                                  << "Executor error during find command: "
                                                  << WorkingSetCommon::toStatusString(obj)));
        }

        // 6) Set up the cursor for getMore.
        if (shouldSaveCursor(txn, collection, state, cursorExec)) {
            // State will be restored on getMore.
            cursorExec->saveState();
            cursorExec->detachFromOperationContext();

            cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());
            cursor->setPos(numResults);
        } else {
            cursorId = 0;
        }

        // Fill out curop based on the results.
        endQueryOp(txn, *cursorExec, dbProfilingLevel, numResults, cursorId);

        // 7) Generate the response object to send to the client.
        appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result);
        if (cursorId) {
            cursorFreer.Dismiss();
        }
        return true;
    }
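The find command protects the freshly created ClientCursor with a scope guard: MakeGuard arranges for deleteUnderlying on any early return or exception, and Dismiss is called only once the cursor id has been handed to the client. A minimal sketch of that guard idiom:

    #include <functional>
    #include <iostream>

    // Minimal scope guard: runs the cleanup unless dismissed, mirroring
    // the MakeGuard(...) / Dismiss() pair above.
    class ScopeGuard {
    public:
        explicit ScopeGuard(std::function<void()> f) : f_(std::move(f)) {}
        ~ScopeGuard() {
            if (f_)
                f_();
        }
        void dismiss() { f_ = nullptr; }
        ScopeGuard(const ScopeGuard&) = delete;
        ScopeGuard& operator=(const ScopeGuard&) = delete;

    private:
        std::function<void()> f_;
    };

    int main() {
        {
            ScopeGuard freeCursor([] { std::cout << "cursor deleted\n"; });
            // ... an early return or exception here would delete the cursor ...
            freeCursor.dismiss();  // cursor id handed to the client: keep it
        }
        {
            ScopeGuard freeCursor([] { std::cout << "cursor deleted\n"; });
            // no dismiss: cleanup fires at scope exit
        }
    }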
Example #11
    static void handleCursorCommand(OperationContext* txn,
                                    const string& ns,
                                    ClientCursorPin* pin,
                                    PlanExecutor* exec,
                                    const BSONObj& cmdObj,
                                    BSONObjBuilder& result) {

        ClientCursor* cursor = pin ? pin->c() : NULL;
        if (pin) {
            invariant(cursor);
            invariant(cursor->getExecutor() == exec);
            invariant(cursor->isAggCursor());
        }

        BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
        const long long batchSize = batchSizeElem.isNumber()
                                    ? batchSizeElem.numberLong()
                                    : 101; // same as query

        // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
        BSONArrayBuilder resultsArray;
        const int byteLimit = MaxBytesToReturnToClientAtOnce;
        BSONObj next;
        for (int objCount = 0; objCount < batchSize; objCount++) {
            // The initial getNext() on a PipelineProxyStage may be very expensive so we don't
            // do it when batchSize is 0 since that indicates a desire for a fast return.
            if (exec->getNext(&next, NULL) != PlanExecutor::ADVANCED) {
                if (pin) pin->deleteUnderlying();
                // make it an obvious error to use cursor or executor after this point
                cursor = NULL;
                exec = NULL;
                break;
            }

            if (resultsArray.len() + next.objsize() > byteLimit) {
                // Get the pipeline proxy stage wrapped by this PlanExecutor.
                PipelineProxyStage* proxy = static_cast<PipelineProxyStage*>(exec->getRootStage());
                // too big. next will be the first doc in the second batch
                proxy->pushBack(next);
                break;
            }

            resultsArray.append(next);
        }

        // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should
        // be relatively quick since if there was no pin then the input is empty. Also, this
        // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that
        // case. This is ok for now however, since you can't have a sharded collection that doesn't
        // exist.
        const bool canReturnMoreBatches = pin;
        if (!canReturnMoreBatches && exec && !exec->isEOF()) {
            // msgasserting since this shouldn't be possible to trigger from today's aggregation
            // language. The wording assumes that the only reason pin would be null is if the
            // collection doesn't exist.
            msgasserted(17391, str::stream()
                << "Aggregation has more results than fit in initial batch, but can't "
                << "create cursor since collection " << ns << " doesn't exist");
        }

        if (cursor) {
            // If a time limit was set on the pipeline, remaining time is "rolled over" to the
            // cursor (for use by future getmore ops).
            cursor->setLeftoverMaxTimeMicros( txn->getCurOp()->getRemainingMaxTimeMicros() );

            // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
            // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
            cursor->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
            StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
            txn->setRecoveryUnit(storageEngine->newRecoveryUnit());

            // Cursor needs to be in a saved state while we yield locks for getmore. State
            // will be restored in getMore().
            exec->saveState();
        }

        BSONObjBuilder cursorObj(result.subobjStart("cursor"));
        cursorObj.append("id", cursor ? cursor->cursorid() : 0LL);
        cursorObj.append("ns", ns);
        cursorObj.append("firstBatch", resultsArray.arr());
        cursorObj.done();
    }
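The "rolled over" time comment recurs across these variants: whatever remains of the operation's maxTimeMS budget when the first batch is returned is stored on the cursor, so later getMore ops inherit it rather than starting fresh. A sketch of that accounting with std::chrono, using hypothetical OpBudget and Cursor types:

    #include <chrono>
    #include <iostream>
    #include <thread>

    using Clock = std::chrono::steady_clock;

    // Hypothetical per-operation time budget, in the spirit of maxTimeMS.
    struct OpBudget {
        Clock::time_point start = Clock::now();
        std::chrono::microseconds maxTime{0};
        std::chrono::microseconds remaining() const {
            auto used = std::chrono::duration_cast<std::chrono::microseconds>(
                Clock::now() - start);
            return used < maxTime ? maxTime - used : std::chrono::microseconds(0);
        }
    };

    struct Cursor {
        std::chrono::microseconds leftoverMaxTimeMicros{0};
    };

    int main() {
        OpBudget op{Clock::now(), std::chrono::microseconds(50000)};  // 50ms
        std::this_thread::sleep_for(std::chrono::milliseconds(10));   // work

        Cursor cursor;
        // Roll the unused budget over to the cursor for future getMore ops.
        cursor.leftoverMaxTimeMicros = op.remaining();
        std::cout << "leftover us: " << cursor.leftoverMaxTimeMicros.count()
                  << '\n';
    }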
Example #12
    static void handleCursorCommand(CursorId id, BSONObj& cmdObj, BSONObjBuilder& result) {
        BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
        const long long batchSize = batchSizeElem.isNumber()
                                    ? batchSizeElem.numberLong()
                                    : 101; // same as query

        // Using limited cursor API that ignores many edge cases. Should be sufficient for commands.
        ClientCursorPin pin(id);
        ClientCursor* cursor = pin.c();

        massert(16958, "Cursor shouldn't have been deleted",
                cursor);

        // Make sure this cursor won't disappear on us
        fassert(16959, !cursor->c()->shouldDestroyOnNSDeletion());
        fassert(16960, !cursor->c()->requiresLock());

        try {
            const string cursorNs = cursor->ns(); // saved now, since the cursor may be deleted before this is used

            // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
            BSONArrayBuilder resultsArray;
            const int byteLimit = MaxBytesToReturnToClientAtOnce;
            for (int objCount = 0; objCount < batchSize && cursor->ok(); objCount++) {
                BSONObj current = cursor->current();
                if (resultsArray.len() + current.objsize() > byteLimit)
                    break; // too big. current will be the first doc in the second batch

                resultsArray.append(current);
                cursor->advance();
            }

            // The initial ok() on a cursor may be very expensive so we don't do it when batchSize
            // is 0 since that indicates a desire for a fast return.
            if (batchSize != 0 && !cursor->ok()) {
                // There is no more data. Kill the cursor.
                pin.release();
                ClientCursor::erase(id);
                id = 0;
                cursor = NULL; // make it an obvious error to use cursor after this point
            }

            if (cursor) {
                // If a time limit was set on the pipeline, remaining time is "rolled over" to the
                // cursor (for use by future getmore ops).
                cursor->setLeftoverMaxTimeMicros( cc().curop()->getRemainingMaxTimeMicros() );
            }

            BSONObjBuilder cursorObj(result.subobjStart("cursor"));
            cursorObj.append("id", id);
            cursorObj.append("ns", cursorNs);
            cursorObj.append("firstBatch", resultsArray.arr());
            cursorObj.done();
        }
        catch (...) {
            // Clean up cursor on way out of scope.
            pin.release();
            ClientCursor::erase(id);
            throw;
        }
    }
Example #13
    bool run(OperationContext* txn,
             const string& dbname,
             BSONObj& jsobj,
             int,
             string& errmsg,
             BSONObjBuilder& result) {
        unique_ptr<MatchExpression> matcher;
        BSONElement filterElt = jsobj["filter"];
        if (!filterElt.eoo()) {
            if (filterElt.type() != mongo::Object) {
                return appendCommandStatus(
                    result, Status(ErrorCodes::BadValue, "\"filter\" must be an object"));
            }
            StatusWithMatchExpression statusWithMatcher = MatchExpressionParser::parse(
                filterElt.Obj(), ExtensionsCallbackDisallowExtensions());
            if (!statusWithMatcher.isOK()) {
                return appendCommandStatus(result, statusWithMatcher.getStatus());
            }
            matcher = std::move(statusWithMatcher.getValue());
        }

        const long long defaultBatchSize = std::numeric_limits<long long>::max();
        long long batchSize;
        Status parseCursorStatus = parseCommandCursorOptions(jsobj, defaultBatchSize, &batchSize);
        if (!parseCursorStatus.isOK()) {
            return appendCommandStatus(result, parseCursorStatus);
        }

        ScopedTransaction scopedXact(txn, MODE_IS);
        AutoGetDb autoDb(txn, dbname, MODE_S);

        const Database* db = autoDb.getDb();

        auto ws = make_unique<WorkingSet>();
        auto root = make_unique<QueuedDataStage>(txn, ws.get());

        if (db) {
            if (auto collNames = _getExactNameMatches(matcher.get())) {
                for (auto&& collName : *collNames) {
                    auto nss = NamespaceString(db->name(), collName);
                    _addWorkingSetMember(
                        txn, db->getCollection(nss), matcher.get(), ws.get(), root.get());
                }
            } else {
                for (auto&& collection : *db) {
                    _addWorkingSetMember(txn, collection, matcher.get(), ws.get(), root.get());
                }
            }
        }

        std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name;
        dassert(NamespaceString(cursorNamespace).isValid());
        dassert(NamespaceString(cursorNamespace).isListCollectionsCursorNS());

        auto statusWithPlanExecutor = PlanExecutor::make(
            txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }
        unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        BSONArrayBuilder firstBatch;

        for (long long objCount = 0; objCount < batchSize; objCount++) {
            BSONObj next;
            PlanExecutor::ExecState state = exec->getNext(&next, NULL);
            if (state == PlanExecutor::IS_EOF) {
                break;
            }
            invariant(state == PlanExecutor::ADVANCED);

            // If we can't fit this result inside the current batch, then we stash it for later.
            if (!FindCommon::haveSpaceForNext(next, objCount, firstBatch.len())) {
                exec->enqueue(next);
                break;
            }

            firstBatch.append(next);
        }

        CursorId cursorId = 0LL;
        if (!exec->isEOF()) {
            exec->saveState();
            exec->detachFromOperationContext();
            ClientCursor* cursor =
                new ClientCursor(CursorManager::getGlobalCursorManager(),
                                 exec.release(),
                                 cursorNamespace,
                                 txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot());
            cursorId = cursor->cursorid();
        }

        appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);

        return true;
    }
Example #14
    /**
     * Runs a query using the following steps:
     *   --Parsing.
     *   --Acquire locks.
     *   --Plan query, obtaining an executor that can run it.
     *   --Generate the first batch.
     *   --Save state for getMore, transferring ownership of the executor to a ClientCursor.
     *   --Generate response to send to the client.
     */
    bool run(OperationContext* txn,
             const std::string& dbname,
             BSONObj& cmdObj,
             int options,
             std::string& errmsg,
             BSONObjBuilder& result) override {
        const std::string fullns = parseNs(dbname, cmdObj);
        const NamespaceString nss(fullns);
        if (!nss.isValid() || nss.isCommand() || nss.isSpecialCommand()) {
            return appendCommandStatus(result,
                                       {ErrorCodes::InvalidNamespace,
                                        str::stream() << "Invalid collection name: " << nss.ns()});
        }

        // Although it is a command, a find command gets counted as a query.
        globalOpCounters.gotQuery();

        if (txn->getClient()->isInDirectClient()) {
            return appendCommandStatus(
                result,
                Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()"));
        }

        // Parse the command BSON to a LiteParsedQuery.
        const bool isExplain = false;
        auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain);
        if (!lpqStatus.isOK()) {
            return appendCommandStatus(result, lpqStatus.getStatus());
        }

        auto& lpq = lpqStatus.getValue();

        // Validate term before acquiring locks, if provided.
        if (auto term = lpq->getReplicationTerm()) {
            auto replCoord = repl::ReplicationCoordinator::get(txn);
            Status status = replCoord->updateTerm(txn, *term);
            // Note: updateTerm returns ok if term stayed the same.
            if (!status.isOK()) {
                return appendCommandStatus(result, status);
            }
        }

        // Fill out curop information.
        //
        // We pass negative values for 'ntoreturn' and 'ntoskip' to indicate that these values
        // should be omitted from the log line. Limit and skip information is already present in the
        // find command parameters, so these fields are redundant.
        const int ntoreturn = -1;
        const int ntoskip = -1;
        beginQueryOp(txn, nss, cmdObj, ntoreturn, ntoskip);

        // Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery.
        ExtensionsCallbackReal extensionsCallback(txn, &nss);
        auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), extensionsCallback);
        if (!statusWithCQ.isOK()) {
            return appendCommandStatus(result, statusWithCQ.getStatus());
        }
        std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue());

        ShardingState* const shardingState = ShardingState::get(txn);

        if (OperationShardVersion::get(txn).hasShardVersion() && shardingState->enabled()) {
            ChunkVersion receivedVersion = OperationShardVersion::get(txn).getShardVersion(nss);
            ChunkVersion latestVersion;
            // Wait for migration completion to get the correct chunk version.
            const int maxTimeoutSec = 30;
            int timeoutSec = cq->getParsed().getMaxTimeMS() / 1000;
            if (!timeoutSec || timeoutSec > maxTimeoutSec) {
                timeoutSec = maxTimeoutSec;
            }

            if (!shardingState->waitTillNotInCriticalSection(timeoutSec)) {
                uasserted(ErrorCodes::LockTimeout, "Timeout while waiting for migration commit");
            }

            // If the received version is newer than the version cached in 'shardingState', then we
            // have to refresh 'shardingState' from the config servers. We do this before acquiring
            // locks so that we don't hold locks while waiting on the network.
            uassertStatusOK(shardingState->refreshMetadataIfNeeded(
                txn, nss.ns(), receivedVersion, &latestVersion));
        }

        // Acquire locks.
        AutoGetCollectionForRead ctx(txn, nss);
        Collection* collection = ctx.getCollection();

        const int dbProfilingLevel =
            ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile;

        // It is possible that the sharding version will change during yield while we are
        // retrieving a plan executor. If this happens we will throw an error and mongos will
        // retry.
        const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns());

        // Get the execution plan for the query.
        auto statusWithPlanExecutor =
            getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }

        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        if (!collection) {
            // No collection. Just fill out curop indicating that there were zero results and
            // there is no ClientCursor id, and then return.
            const long long numResults = 0;
            const CursorId cursorId = 0;
            endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, cursorId);
            appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result);
            return true;
        }

        const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed();

        // Stream query results, adding them to a BSONArray as we go.
        BSONArrayBuilder firstBatch;
        BSONObj obj;
        PlanExecutor::ExecState state = PlanExecutor::ADVANCED;
        long long numResults = 0;
        while (!FindCommon::enoughForFirstBatch(pq, numResults, firstBatch.len()) &&
               PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // If adding this object will cause us to exceed the BSON size limit, then we stash
            // it for later.
            if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) {
                exec->enqueue(obj);
                break;
            }

            // Add result to output buffer.
            firstBatch.append(obj);
            numResults++;
        }

        // Throw an assertion if query execution fails for any reason.
        if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
            const std::unique_ptr<PlanStageStats> stats(exec->getStats());
            error() << "Plan executor error during find command: " << PlanExecutor::statestr(state)
                    << ", stats: " << Explain::statsToBSON(*stats);

            return appendCommandStatus(result,
                                       Status(ErrorCodes::OperationFailed,
                                              str::stream()
                                                  << "Executor error during find command: "
                                                  << WorkingSetCommon::toStatusString(obj)));
        }

        // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the
        // chunk belonged on this node are gone. Separating chunk lifetime management from
        // ClientCursor should allow this check to go away.
        if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // Version changed while retrieving a PlanExecutor. Terminate the operation,
            // signaling that mongos should retry.
            throw SendStaleConfigException(nss.ns(),
                                           "version changed during find command",
                                           shardingVersionAtStart,
                                           shardingState->getVersion(nss.ns()));
        }

        // Set up the cursor for getMore.
        CursorId cursorId = 0;
        if (shouldSaveCursor(txn, collection, state, exec.get())) {
            // Register the execution plan inside a ClientCursor. Ownership of the PlanExecutor is
            // transferred to the ClientCursor.
            //
            // First unregister the PlanExecutor so it can be re-registered with ClientCursor.
            exec->deregisterExec();

            // Create a ClientCursor containing this plan executor. We don't have to worry about
            // leaking it as it's inserted into a global map by its ctor.
            ClientCursor* cursor =
                new ClientCursor(collection->getCursorManager(),
                                 exec.release(),
                                 nss.ns(),
                                 txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                                 pq.getOptions(),
                                 pq.getFilter());
            cursorId = cursor->cursorid();

            invariant(!exec);
            PlanExecutor* cursorExec = cursor->getExecutor();

            // State will be restored on getMore.
            cursorExec->saveState();
            cursorExec->detachFromOperationContext();

            cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());
            cursor->setPos(numResults);

            // Fill out curop based on the results.
            endQueryOp(txn, collection, *cursorExec, dbProfilingLevel, numResults, cursorId);
        } else {
            endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, cursorId);
        }

        // Generate the response object to send to the client.
        appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result);
        return true;
    }
        /**
         * Generates the next batch of results for a ClientCursor.
         *
         * TODO: Do we need to support some equivalent of OP_REPLY responseFlags?
         *
         * TODO: Is it possible to support awaitData?
         */
        bool run(OperationContext* txn,
                 const std::string& dbname,
                 BSONObj& cmdObj,
                 int options,
                 std::string& errmsg,
                 BSONObjBuilder& result) override {
            // Counted as a getMore, not as a command.
            globalOpCounters.gotGetMore();

            if (txn->getClient()->isInDirectClient()) {
                return appendCommandStatus(result,
                                           Status(ErrorCodes::IllegalOperation,
                                                  "Cannot run getMore command from eval()"));
            }

            StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj);
            if (!parseStatus.isOK()) {
                return appendCommandStatus(result, parseStatus.getStatus());
            }
            const GetMoreRequest& request = parseStatus.getValue();

            // Depending on the type of cursor being operated on, we hold locks for the whole
            // getMore, or none of the getMore, or part of the getMore.  The three cases in detail:
            //
            // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
            // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors
            //    don't own any collection state.
            // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
            //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
            //    release), but the pin and unpin of the cursor must occur under the collection
            //    lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because
            //    AutoGetCollectionForRead checks the sharding version (and we want the relock for
            //    the unpin to succeed even if the sharding version has changed).
            //
            // Note that we declare our locks before our ClientCursorPin, in order to ensure that
            // the pin's destructor is called before the lock destructors (so that the unpin occurs
            // under the lock).
            std::unique_ptr<AutoGetCollectionForRead> ctx;
            std::unique_ptr<Lock::DBLock> unpinDBLock;
            std::unique_ptr<Lock::CollectionLock> unpinCollLock;

            CursorManager* cursorManager;
            CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
            if (globalCursorManager->ownsCursorId(request.cursorid)) {
                cursorManager = globalCursorManager;
            } else {
                ctx.reset(new AutoGetCollectionForRead(txn, request.nss));
                Collection* collection = ctx->getCollection();
                if (!collection) {
                    return appendCommandStatus(result,
                                               Status(ErrorCodes::OperationFailed,
                                                      "collection dropped between getMore calls"));
                }
                cursorManager = collection->getCursorManager();
            }

            ClientCursorPin ccPin(cursorManager, request.cursorid);
            ClientCursor* cursor = ccPin.c();
            if (!cursor) {
                // We didn't find the cursor.
                return appendCommandStatus(result, Status(ErrorCodes::CursorNotFound, str::stream()
                    << "Cursor not found, cursor id: " << request.cursorid));
            }

            if (request.nss.ns() != cursor->ns()) {
                return appendCommandStatus(result, Status(ErrorCodes::Unauthorized, str::stream()
                    << "Requested getMore on namespace '" << request.nss.ns()
                    << "', but cursor belongs to a different namespace"));
            }

            // On early return, get rid of the cursor.
            ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin);

            if (!cursor->hasRecoveryUnit()) {
                // Start using a new RecoveryUnit.
                cursor->setOwnedRecoveryUnit(
                    getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit());
            }

            // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
            ScopedRecoveryUnitSwapper ruSwapper(cursor, txn);

            // Reset timeout timer on the cursor since the cursor is still in use.
            cursor->setIdleTime(0);

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            txn->getCurOp()->setMaxTimeMicros(cursor->getLeftoverMaxTimeMicros());
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (cursor->isAggCursor()) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            PlanExecutor* exec = cursor->getExecutor();
            exec->restoreState(txn);

            // TODO: Handle result sets larger than 16MB.
            BSONArrayBuilder nextBatch;
            BSONObj obj;
            PlanExecutor::ExecState state;
            int numResults = 0;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                // Add result to output buffer.
                nextBatch.append(obj);
                numResults++;

                if (enoughForGetMore(request.batchSize, numResults, nextBatch.len())) {
                    break;
                }
            }

            // If we are operating on an aggregation cursor, then we dropped our collection lock
            // earlier and need to reacquire it in order to clean up our ClientCursorPin.
            //
            // TODO: We need to ensure that this relock happens if we release the pin above in
            // response to PlanExecutor::getNext() throwing an exception.
            if (cursor->isAggCursor()) {
                invariant(NULL == ctx.get());
                unpinDBLock.reset(new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS));
                unpinCollLock.reset(
                    new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS));
            }

            // Fail the command if the PlanExecutor reports execution failure.
            if (PlanExecutor::FAILURE == state) {
                const std::unique_ptr<PlanStageStats> stats(exec->getStats());
                error() << "GetMore executor error, stats: " << Explain::statsToBSON(*stats);
                return appendCommandStatus(result,
                                           Status(ErrorCodes::OperationFailed,
                                                  str::stream() << "GetMore executor error: "
                                                  << WorkingSetCommon::toStatusString(obj)));
            }

            CursorId respondWithId = 0;
            if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) {
                respondWithId = request.cursorid;

                exec->saveState();

                cursor->setLeftoverMaxTimeMicros(txn->getCurOp()->getRemainingMaxTimeMicros());
                cursor->incPos(numResults);

                if (isCursorTailable(cursor) && state == PlanExecutor::IS_EOF) {
                    // Rather than swapping their existing RU into the client cursor, tailable
                    // cursors should get a new recovery unit.
                    ruSwapper.dismiss();
                }
            } else {
                txn->getCurOp()->debug().cursorExhausted = true;
            }

            appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result);
            if (respondWithId) {
                cursorFreer.Dismiss();
            }
            return true;
        }
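The batch-termination helper enoughForGetMore is not shown in this excerpt. A plausible reading of its contract, given how it is called above, is sketched below; the body and the byte cap are assumptions, not the real implementation:

    // Sketch, not actual source: a getMore batch is complete once an explicit
    // batchSize has been satisfied, or once the buffered BSON nears the
    // single-reply size limit (the cap value below is assumed).
    static bool enoughForGetMoreSketch(long long batchSize,  // 0 == unset
                                       int numResults,
                                       int bytesBuffered) {
        const int kAssumedByteLimit = 4 * 1024 * 1024;  // assumed 4MB cap
        if (batchSize && numResults >= batchSize) {
            return true;  // requested batch size reached
        }
        return bytesBuffered > kAssumedByteLimit;  // reply getting too big
    }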
Example #16
0
    static void handleCursorCommand(const string& ns,
                                    ClientCursorPin* pin,
                                    PipelineRunner* runner,
                                    const BSONObj& cmdObj,
                                    BSONObjBuilder& result) {

        ClientCursor* cursor = pin ? pin->c() : NULL;
        if (pin) {
            invariant(cursor);
            invariant(cursor->getRunner() == runner);
            invariant(cursor->isAggCursor);
        }

        BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
        const long long batchSize = batchSizeElem.isNumber()
                                    ? batchSizeElem.numberLong()
                                    : 101; // same as query

        // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
        BSONArrayBuilder resultsArray;
        const int byteLimit = MaxBytesToReturnToClientAtOnce;
        BSONObj next;
        for (int objCount = 0; objCount < batchSize; objCount++) {
            // The initial getNext() on a PipelineRunner may be very expensive so we don't
            // do it when batchSize is 0 since that indicates a desire for a fast return.
            if (runner->getNext(&next, NULL) != Runner::RUNNER_ADVANCED) {
                if (pin) pin->deleteUnderlying();
                // make it an obvious error to use cursor or runner after this point
                cursor = NULL;
                runner = NULL;
                break;
            }

            if (resultsArray.len() + next.objsize() > byteLimit) {
                // too big. next will be the first doc in the second batch
                runner->pushBack(next);
                break;
            }

            resultsArray.append(next);
        }

        // NOTE: runner->isEOF() can have side effects, such as writes performed by $out.
        // However, it should be relatively quick since, if there is no pin, the input is
        // empty. Also, this violates the contract for batchSize==0: sharding requires a
        // cursor to be returned in that case. That is OK for now, since you can't have a
        // sharded collection that doesn't exist.
        const bool canReturnMoreBatches = pin;
        if (!canReturnMoreBatches && runner && !runner->isEOF()) {
            // msgasserting since this shouldn't be possible to trigger from today's aggregation
            // language. The wording assumes that the only reason pin would be null is if the
            // collection doesn't exist.
            msgasserted(17391, str::stream()
                << "Aggregation has more results than fit in initial batch, but can't "
                << "create cursor since collection " << ns << " doesn't exist");
        }

        if (cursor) {
            // If a time limit was set on the pipeline, remaining time is "rolled over" to the
            // cursor (for use by future getmore ops).
            cursor->setLeftoverMaxTimeMicros( cc().curop()->getRemainingMaxTimeMicros() );
        }

        BSONObjBuilder cursorObj(result.subobjStart("cursor"));
        cursorObj.append("id", cursor ? cursor->cursorid() : 0LL);
        cursorObj.append("ns", ns);
        cursorObj.append("firstBatch", resultsArray.arr());
        cursorObj.done();
    }
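For context, the "cursor.batchSize" field read at the top of this function arrives on the aggregate command document itself. A minimal sketch of such a command follows; the collection name and pipeline contents are purely illustrative:

    // Sketch: an aggregate command that requests cursor semantics with an
    // empty first batch (batchSize 0), deferring real work to the first
    // getmore. BSON and BSON_ARRAY are the standard builder macros.
    BSONObj cmdObj = BSON("aggregate" << "coll"
                          << "pipeline" << BSON_ARRAY(BSON("$match" << BSONObj()))
                          << "cursor" << BSON("batchSize" << 0));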