Esempio n. 1
0
        virtual bool run(const string &db, BSONObj &cmdObj, int options, string &errmsg,
                         BSONObjBuilder &result, bool fromRepl) {

            string ns = parseNs(db, cmdObj);

            intrusive_ptr<ExpressionContext> pCtx =
                new ExpressionContext(InterruptStatusMongod::status, NamespaceString(ns));

            /* try to parse the command; if this fails, then we didn't run */
            intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx);
            if (!pPipeline.get())
                return false;

            if (pPipeline->getSplitMongodPipeline()) {
                // This is only used in testing
                return executeSplitPipeline(result, errmsg, ns, db, pPipeline, pCtx);
            }

#if _DEBUG
            // This is outside of the if block to keep the object alive until the pipeline is finished.
            BSONObj parsed;
            if (!pPipeline->isExplain() && !pCtx->inShard) {
                // Make sure all operations round-trip through Pipeline::toBson()
                // correctly by reparsing every command on DEBUG builds. This is
                // important because sharded aggregations rely on this ability.
                // Skipping when inShard because this has already been through the
                // transformation (and this unsets pCtx->inShard).
                parsed = pPipeline->serialize().toBson();
                pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx);
                verify(pPipeline);
            }
#endif

            // This does the mongod-specific stuff like creating a cursor
            PipelineD::prepareCursorSource(pPipeline, nsToDatabase(ns), pCtx);
            pPipeline->stitch();

            if (isCursorCommand(cmdObj)) {
                CursorId id;
                {
                    // Set up cursor
                    Client::ReadContext ctx(ns);
                    shared_ptr<Cursor> cursor(new PipelineCursor(pPipeline));
                    // cc will be owned by cursor manager
                    ClientCursor* cc = new ClientCursor(0, cursor, ns, cmdObj.getOwned());
                    id = cc->cursorid();
                }

                handleCursorCommand(id, cmdObj, result);
            }
            else {
                pPipeline->run(result);
            }

            if (DocumentSourceOut* out = dynamic_cast<DocumentSourceOut*>(pPipeline->output())) {
                result.append("outputNs", out->getOutputNs());
            }

            return true;
        }
Esempio n. 2
0
    void run() {
        OldClientWriteContext ctx(&_txn, nss.ns());
        insert(BSON("a" << 1 << "b" << 1));

        Collection* collection = ctx.getCollection();

        BSONObj filterObj = fromjson("{_id: {$gt: 0}, b: {$gt: 0}}");
        PlanExecutor* exec = makeCollScanExec(collection, filterObj);

        // Make a client cursor from the runner.
        ClientCursor* cc =
            new ClientCursor(collection->getCursorManager(), exec, nss.ns(), false, 0, BSONObj());
        ClientCursorPin ccPin(collection->getCursorManager(), cc->cursorid());

        // If the cursor is pinned, it sticks around,
        // even after invalidation.
        ASSERT_EQUALS(1U, numCursors());
        const std::string invalidateReason("InvalidatePinned Test");
        collection->getCursorManager()->invalidateAll(false, invalidateReason);
        ASSERT_EQUALS(1U, numCursors());

        // The invalidation should have killed the runner.
        BSONObj objOut;
        ASSERT_EQUALS(PlanExecutor::DEAD, exec->getNext(&objOut, NULL));
        ASSERT(WorkingSetCommon::isValidStatusMemberObject(objOut));
        const Status status = WorkingSetCommon::getMemberObjectStatus(objOut);
        ASSERT(status.reason().find(invalidateReason) != string::npos);

        // Deleting the underlying cursor should cause the
        // number of cursors to return to 0.
        ccPin.deleteUnderlying();
        ASSERT_EQUALS(0U, numCursors());
    }
            void run() {
                Client::WriteContext ctx(&_txn, ns());
                insert(BSON("a" << 1 << "b" << 1));

                Collection* collection = ctx.ctx().db()->getCollection(&_txn, ns());

                BSONObj filterObj = fromjson("{_id: {$gt: 0}, b: {$gt: 0}}");
                SingleSolutionRunner* ssr = makeCollScanRunner(ctx.ctx(),filterObj);

                // Make a client cursor from the runner.
                ClientCursor* cc = new ClientCursor(collection,
                                                    ssr, 0, BSONObj());
                ClientCursorPin ccPin(collection,cc->cursorid());

                // If the cursor is pinned, it sticks around,
                // even after invalidation.
                ASSERT_EQUALS(1U, numCursors());
                collection->cursorCache()->invalidateAll(false);
                ASSERT_EQUALS(1U, numCursors());

                // The invalidation should have killed the runner.
                BSONObj objOut;
                ASSERT_EQUALS(Runner::RUNNER_DEAD, ssr->getNext(&objOut, NULL));

                // Deleting the underlying cursor should cause the
                // number of cursors to return to 0.
                ccPin.deleteUnderlying();
                ASSERT_EQUALS(0U, numCursors());
            }
Esempio n. 4
0
    Value DocumentSourceCursor::serialize(bool explain) const {
        // we never parse a documentSourceCursor, so we only serialize for explain
        if (!explain)
            return Value();

        Lock::DBRead lk(_ns);
        Client::Context ctx(_ns, storageGlobalParams.dbpath, /*doVersion=*/false);

        ClientCursorPin pin(_cursorId);
        ClientCursor* cursor = pin.c();

        uassert(17135, "Cursor deleted. Was the collection or database dropped?",
                cursor);

        Runner* runner = cursor->getRunner();
        runner->restoreState();

        return Value(DOC(getSourceName() <<
            DOC("query" << Value(_query)
             << "sort" << (!_sort.isEmpty() ? Value(_sort) : Value())
             << "limit" << (_limit ? Value(_limit->getLimit()) : Value())
             << "fields" << (!_projection.isEmpty() ? Value(_projection) : Value())
             // << "indexOnly" << canUseCoveredIndex(cursor)
             // << "cursorType" << cursor->c()->toString()
        ))); // TODO get more plan information
    }
Esempio n. 5
0
    bool ClientCursor::eraseIfAuthorized(CursorId id) {
        std::string ns;
        {
            recursive_scoped_lock lock(ccmutex);
            ClientCursor* cursor = find_inlock(id);
            if (!cursor) {
                return false;
            }
            ns = cursor->ns();
        }

        // Can't be in a lock when checking authorization
        if (!cc().getAuthorizationManager()->checkAuthorization(ns, ActionType::killCursors)) {
            return false;
        }

        // It is safe to lookup the cursor again after temporarily releasing the mutex because
        // of 2 invariants: that the cursor ID won't be re-used in a short period of time, and that
        // the namespace associated with a cursor cannot change.
        recursive_scoped_lock lock(ccmutex);
        ClientCursor* cursor = find_inlock(id);
        if (!cursor) {
            // Cursor was deleted in another thread since we found it earlier in this function.
            return false;
        }
        if (cursor->ns() != ns) {
            warning() << "Cursor namespace changed. Previous ns: " << ns << ", current ns: "
                    << cursor->ns() << endl;
            return false;
        }

        return _erase_inlock(cursor);
    }
Esempio n. 6
0
    static void handleCursorCommand(CursorId id, BSONObj& cmdObj, BSONObjBuilder& result) {
        BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
        const long long batchSize = batchSizeElem.isNumber()
                                    ? batchSizeElem.numberLong()
                                    : 101; // same as query

        ClientCursorPin pin(id);
        ClientCursor* cursor = pin.c();

        massert(16958, "Cursor shouldn't have been deleted",
                cursor);

        verify(cursor->isAggCursor);
        PipelineRunner* runner = dynamic_cast<PipelineRunner*>(cursor->getRunner());
        verify(runner);
        try {
            const string cursorNs = cursor->ns(); // we need this after cursor may have been deleted

            // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
            BSONArrayBuilder resultsArray;
            const int byteLimit = MaxBytesToReturnToClientAtOnce;
            BSONObj next;
            for (int objCount = 0; objCount < batchSize; objCount++) {
                // The initial getNext() on a PipelineRunner may be very expensive so we don't do it
                // when batchSize is 0 since that indicates a desire for a fast return.
                if (runner->getNext(&next, NULL) != Runner::RUNNER_ADVANCED) {
                    pin.deleteUnderlying();
                    id = 0;
                    cursor = NULL; // make it an obvious error to use cursor after this point
                    break;
                }

                if (resultsArray.len() + next.objsize() > byteLimit) {
                    // too big. next will be the first doc in the second batch
                    runner->pushBack(next);
                    break;
                }

                resultsArray.append(next);
            }

            if (cursor) {
                // If a time limit was set on the pipeline, remaining time is "rolled over" to the
                // cursor (for use by future getmore ops).
                cursor->setLeftoverMaxTimeMicros( cc().curop()->getRemainingMaxTimeMicros() );
            }

            BSONObjBuilder cursorObj(result.subobjStart("cursor"));
            cursorObj.append("id", id);
            cursorObj.append("ns", cursorNs);
            cursorObj.append("firstBatch", resultsArray.arr());
            cursorObj.done();
        }
        catch (...) {
            // Clean up cursor on way out of scope.
            pin.deleteUnderlying();
            throw;
        }
    }
Esempio n. 7
0
    void DocumentSourceCursor::loadBatch() {
        if (!_cursorId) {
            dispose();
            return;
        }

        // We have already validated the sharding version when we constructed the cursor
        // so we shouldn't check it again.
        Lock::DBRead lk(_ns);
        Client::Context ctx(_ns, storageGlobalParams.dbpath, /*doVersion=*/false);

        ClientCursorPin pin(_cursorId);
        ClientCursor* cursor = pin.c();

        uassert(16950, "Cursor deleted. Was the collection or database dropped?",
                cursor);

        Runner* runner = cursor->getRunner();
        runner->restoreState();

        int memUsageBytes = 0;
        BSONObj obj;
        Runner::RunnerState state;
        while ((state = runner->getNext(&obj, NULL)) == Runner::RUNNER_ADVANCED) {
            // TODO SERVER-11831: consider using documentFromBsonWithDeps(obj, _dependencies)

            _currentBatch.push_back(Document(obj));

            if (_limit) {
                if (++_docsAddedToBatches == _limit->getLimit()) {
                    break;
                }
                verify(_docsAddedToBatches < _limit->getLimit());
            }

            memUsageBytes += _currentBatch.back().getApproximateSize();

            if (memUsageBytes > MaxBytesToReturnToClientAtOnce) {
                // End this batch and prepare cursor for yielding.
                runner->saveState();
                cc().curop()->yielded();
                return;
            }
        }

        // If we got here, there won't be any more documents, so destroy the cursor and runner.
        _cursorId = 0;
        pin.deleteUnderlying();

        uassert(16028, "collection or index disappeared when cursor yielded",
                state != Runner::RUNNER_DEAD);

        uassert(17285, "cursor encountered an error",
                state != Runner::RUNNER_ERROR);

        massert(17286, str::stream() << "Unexpected return from Runner::getNext: " << state,
                state == Runner::RUNNER_EOF || state == Runner::RUNNER_ADVANCED);
    }
Esempio n. 8
0
    /* must call this on a delete so we clean up the cursors. */
    void ClientCursor::aboutToDelete(const DiskLoc& dl) {
        recursive_scoped_lock lock(ccmutex);

        Database *db = cc().database();
        assert(db);

        aboutToDeleteForSharding( db , dl );

        CCByLoc& bl = db->ccByLoc;
        CCByLoc::iterator j = bl.lower_bound(dl);
        CCByLoc::iterator stop = bl.upper_bound(dl);
        if ( j == stop )
            return;

        vector<ClientCursor*> toAdvance;

        while ( 1 ) {
            toAdvance.push_back(j->second);
            DEV assert( j->first == dl );
            ++j;
            if ( j == stop )
                break;
        }

        wassert( toAdvance.size() < 5000 );
        
        for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ){
            ClientCursor* cc = *i;
            wassert(cc->_db == db);
            
            if ( cc->_doingDeletes ) continue;

            Cursor *c = cc->c.get();
            if ( c->capped() ){
                delete cc;
                continue;
            }
            
            c->checkLocation();
            DiskLoc tmp1 = c->refLoc();
            if ( tmp1 != dl ) {
                /* this might indicate a failure to call ClientCursor::updateLocation() */
                problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl;
            }
            c->advance();
            if ( c->eof() ) {
                // advanced to end
                // leave ClientCursor in place so next getMore doesn't fail
                // still need to mark new location though
                cc->updateLocation();
            }
            else {
                wassert( c->refLoc() != dl );
                cc->updateLocation();
            }
        }
    }
Esempio n. 9
0
        virtual bool run(const string &db, BSONObj &cmdObj, int options, string &errmsg,
                         BSONObjBuilder &result, bool fromRepl) {

            string ns = parseNs(db, cmdObj);

            intrusive_ptr<ExpressionContext> pCtx =
                new ExpressionContext(InterruptStatusMongod::status, NamespaceString(ns));
            pCtx->tempDir = storageGlobalParams.dbpath + "/_tmp";

            /* try to parse the command; if this fails, then we didn't run */
            intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx);
            if (!pPipeline.get())
                return false;

#if _DEBUG
            // This is outside of the if block to keep the object alive until the pipeline is finished.
            BSONObj parsed;
            if (!pPipeline->isExplain() && !pCtx->inShard) {
                // Make sure all operations round-trip through Pipeline::toBson()
                // correctly by reparsing every command on DEBUG builds. This is
                // important because sharded aggregations rely on this ability.
                // Skipping when inShard because this has already been through the
                // transformation (and this unsets pCtx->inShard).
                parsed = pPipeline->serialize().toBson();
                pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx);
                verify(pPipeline);
            }
#endif

            // This does the mongod-specific stuff like creating a cursor
            PipelineD::prepareCursorSource(pPipeline, pCtx);
            pPipeline->stitch();

            if (pPipeline->isExplain()) {
                result << "stages" << Value(pPipeline->writeExplainOps());
                return true; // don't do any actual execution
            }

            if (isCursorCommand(cmdObj)) {
                CursorId id;
                {
                    // Set up cursor
                    Client::ReadContext ctx(ns);
                    ClientCursor* cc = new ClientCursor(new PipelineRunner(pPipeline));
                    cc->isAggCursor = true; // enable special locking and ns deletion behavior
                    id = cc->cursorid();
                }

                handleCursorCommand(id, cmdObj, result);
            }
            else {
                pPipeline->run(result);
            }

            return true;
        }
Esempio n. 10
0
    bool ClientCursor::erase( CursorId id ) {
        recursive_scoped_lock lock( ccmutex );
        ClientCursor *cursor = find_inlock( id );
        if ( ! cursor )
            return false;

        if ( ! cc().getAuthenticationInfo()->isAuthorizedReads( nsToDatabase( cursor->ns() ) ) )
            return false;

        assert( cursor->_pinValue < 100 ); // mustn't have an active ClientCursor::Pointer
        delete cursor;
        return true;
    }
Esempio n. 11
0
    void ClientCursor::idleTimeReport(unsigned millis) {
        bool foundSomeToTimeout = false;

        // two passes so that we don't need to readlock unless we really do some timeouts
        // we assume here that incrementing _idleAgeMillis outside readlock is ok.
        {
            recursive_scoped_lock lock(ccmutex);
            {
                unsigned sz = clientCursorsById.size();
                static time_t last;
                if( sz >= 100000 ) { 
                    if( time(0) - last > 300 ) {
                        last = time(0);
                        log() << "warning number of open cursors is very large: " << sz << endl;
                    }
                }
            }
            for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end();  ) {
                CCById::iterator j = i;
                i++;
                if( j->second->shouldTimeout( millis ) ) {
                    foundSomeToTimeout = true;
                }
            }
        }

        if( foundSomeToTimeout ) {
            Lock::GlobalRead lk;

            recursive_scoped_lock cclock(ccmutex);
            CCById::const_iterator it = clientCursorsById.begin();
            while (it != clientCursorsById.end()) {
                ClientCursor* cc = it->second;
                if( cc->shouldTimeout(0) ) {
                    numberTimedOut++;
                    LOG(1) << "killing old cursor " << cc->_cursorid << ' ' << cc->_ns
                           << " idle:" << cc->idleTime() << "ms\n";
                    ClientCursor* toDelete = it->second;
                    CursorId id = toDelete->cursorid();
                    // This is what winds up removing it from the map.
                    delete toDelete;
                    it = clientCursorsById.upper_bound(id);
                }
                else {
                    ++it;
                }
            }
        }
    }
Esempio n. 12
0
    Value DocumentSourceCursor::serialize(bool explain) const {
        // we never parse a documentSourceCursor, so we only serialize for explain
        if (!explain)
            return Value();

        Status explainStatus(ErrorCodes::InternalError, "");
        scoped_ptr<TypeExplain> plan;
        {
            Lock::DBRead lk(_ns);
            Client::Context ctx(_ns, storageGlobalParams.dbpath, /*doVersion=*/false);

            ClientCursorPin pin(_cursorId);
            ClientCursor* cursor = pin.c();

            uassert(17135, "Cursor deleted. Was the collection or database dropped?",
                    cursor);

            Runner* runner = cursor->getRunner();
            runner->restoreState();

            TypeExplain* explainRaw;
            explainStatus = runner->getExplainPlan(&explainRaw);
            if (explainStatus.isOK())
                plan.reset(explainRaw);

            runner->saveState();
        }

        MutableDocument out;
        out["query"] = Value(_query);

        if (!_sort.isEmpty())
            out["sort"] = Value(_sort);

        if (_limit)
            out["limit"] = Value(_limit->getLimit());

        if (!_projection.isEmpty())
            out["fields"] = Value(_projection);

        if (explainStatus.isOK()) {
            out["plan"] = Value(extractInfo(plan));
        } else {
            out["planError"] = Value(explainStatus.toString());
        }


        return out.freezeToValue();
    }
Esempio n. 13
0
    bool ClientCursor::eraseIfAuthorized(CursorId id) {
        recursive_scoped_lock lock(ccmutex);
        ClientCursor* cursor = find_inlock(id);
        if (!cursor) {
            return false;
        }

        if (!cc().getAuthorizationManager()->checkAuthorization(cursor->ns(),
                                                                ActionType::find)
                || !cc().getAuthenticationInfo()->isAuthorizedReads(nsToDatabase(cursor->ns()))) {
            return false;
        }

        return _erase_inlock(cursor);
    }
Esempio n. 14
0
    /* must call this on a delete so we clean up the cursors. */
    void ClientCursor::aboutToDelete(const DiskLoc& dl) {
        recursive_scoped_lock lock(ccmutex);

        CCByLoc::iterator j = byLoc.lower_bound(dl);
        CCByLoc::iterator stop = byLoc.upper_bound(dl);
        if ( j == stop )
            return;

        vector<ClientCursor*> toAdvance;

        while ( 1 ) {
            toAdvance.push_back(j->second);
            DEV assert( j->first == dl );
            ++j;
            if ( j == stop )
                break;
        }

        wassert( toAdvance.size() < 5000 );
        
        for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ){
            ClientCursor* cc = *i;
            
            if ( cc->_doingDeletes ) continue;

            Cursor *c = cc->c.get();
            if ( c->capped() ){
                delete cc;
                continue;
            }
            
            c->checkLocation();
            DiskLoc tmp1 = c->refLoc();
            if ( tmp1 != dl ) {
                /* this might indicate a failure to call ClientCursor::updateLocation() */
                problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl;
            }
            c->advance();
            if ( c->eof() ) {
                // advanced to end -- delete cursor
                delete cc;
            }
            else {
                wassert( c->refLoc() != dl );
                cc->updateLocation();
            }
        }
    }
Esempio n. 15
0
        /**
         * Called via a ScopeGuard on early return in order to ensure that the ClientCursor gets
         * cleaned up properly.
         */
        static void cleanupCursor(OperationContext* txn,
                                  ClientCursorPin* ccPin,
                                  const GetMoreRequest& request) {
            ClientCursor* cursor = ccPin->c();

            std::unique_ptr<Lock::DBLock> unpinDBLock;
            std::unique_ptr<Lock::CollectionLock> unpinCollLock;

            if (cursor->isAggCursor()) {
                unpinDBLock.reset(new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS));
                unpinCollLock.reset(
                    new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS));
            }

            ccPin->deleteUnderlying();
        }
Esempio n. 16
0
    bool ClientCursor::eraseIfAuthorized(CursorId id) {
        NamespaceString ns;
        {
            recursive_scoped_lock lock(ccmutex);
            ClientCursor* cursor = find_inlock(id);
            if (!cursor) {
                audit::logKillCursorsAuthzCheck(
                        &cc(),
                        NamespaceString(),
                        id,
                        ErrorCodes::CursorNotFound);
                return false;
            }
            ns = NamespaceString(cursor->ns());
        }

        // Can't be in a lock when checking authorization
        const bool isAuthorized = cc().getAuthorizationSession()->isAuthorizedForActionsOnNamespace(
                ns, ActionType::killCursors);
        audit::logKillCursorsAuthzCheck(
                &cc(),
                ns,
                id,
                isAuthorized ? ErrorCodes::OK : ErrorCodes::Unauthorized);
        if (!isAuthorized) {
            return false;
        }

        // It is safe to lookup the cursor again after temporarily releasing the mutex because
        // of 2 invariants: that the cursor ID won't be re-used in a short period of time, and that
        // the namespace associated with a cursor cannot change.
        recursive_scoped_lock lock(ccmutex);
        ClientCursor* cursor = find_inlock(id);
        if (!cursor) {
            // Cursor was deleted in another thread since we found it earlier in this function.
            return false;
        }
        if (ns != cursor->ns()) {
            warning() << "Cursor namespace changed. Previous ns: " << ns << ", current ns: "
                    << cursor->ns() << endl;
            return false;
        }

        _erase_inlock(cursor);
        return true;
    }
Esempio n. 17
0
    bool ClientCursor::erase( CursorId id ) {
        recursive_scoped_lock lock( ccmutex );
        ClientCursor *cursor = find_inlock( id );
        if ( ! cursor )
            return false;

        if ( ! cc().getAuthenticationInfo()->isAuthorizedReads( nsToDatabase( cursor->ns() ) ) )
            return false;

        // Must not have an active ClientCursor::Pin.
        massert( 16089,
                str::stream() << "Cannot kill active cursor " << id,
                cursor->_pinValue < 100 );
        
        delete cursor;
        return true;
    }
Esempio n. 18
0
 /* called every 4 seconds.  millis is amount of idle time passed since the last call -- could be zero */
 void ClientCursor::idleTimeReport(unsigned millis) {
     LockedIterator i;
     unsigned sz = clientCursorsById.size();
     if (sz >= 100000) { 
         RATELIMITED(300000) log() << "warning number of open cursors is very large: " << sz << endl;
     }
     while (i.ok()) {
         ClientCursor *cc = i.current();
         if (cc->shouldTimeout(millis)) {
             LOG(1) << "killing old cursor " << cc->_cursorid << ' ' << cc->_ns
                    << " idle:" << cc->idleTime() << "ms" << endl;
             i.deleteAndAdvance();
         } else {
             i.advance();
         }
     }
 }
Esempio n. 19
0
    /* called every 4 seconds.  millis is amount of idle time passed since the last call -- could be zero */
    void ClientCursor::idleTimeReport(unsigned millis) {
        bool foundSomeToTimeout = false;

        // two passes so that we don't need to readlock unless we really do some timeouts
        // we assume here that incrementing _idleAgeMillis outside readlock is ok.
        {
            recursive_scoped_lock lock(ccmutex);
            {
                unsigned sz = clientCursorsById.size();
                static time_t last;
                if( sz >= 100000 ) { 
                    if( time(0) - last > 300 ) {
                        last = time(0);
                        log() << "warning number of open cursors is very large: " << sz << endl;
                    }
                }
            }
            for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end();  ) {
                CCById::iterator j = i;
                i++;
                if( j->second->shouldTimeout( millis ) ) {
                    foundSomeToTimeout = true;
                }
            }
        }

        if( foundSomeToTimeout ) {
            // todo: ideally all readlocks automatically note what we are locking for so this 
            // can be reported in currentop command. e.g. something like:
            //   readlock lk("", "timeout cursors");
            readlock lk("");
            for( LockedIterator i; i.ok(); ) {
                ClientCursor *cc = i.current();
                if( cc->shouldTimeout(0) ) {
                    numberTimedOut++;
                    LOG(1) << "killing old cursor " << cc->_cursorid << ' ' << cc->_ns
                           << " idle:" << cc->idleTime() << "ms\n";
                    i.deleteAndAdvance();
                }
                else {
                    i.advance();
                }
            }
        }
    }
Esempio n. 20
0
    static void handleCursorCommand(CursorId id, BSONObj& cmdObj, BSONObjBuilder& result) {
        BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize");
        const long long batchSize = batchSizeElem.isNumber()
                                    ? batchSizeElem.numberLong()
                                    : 101; // same as query

        // Using limited cursor API that ignores many edge cases. Should be sufficient for commands.
        ClientCursor::Pin pin(id);
        ClientCursor* cursor = pin.c();

        massert(16958, "Cursor shouldn't have been deleted",
                cursor);

        // Make sure this cursor won't disappear on us
        fassert(16959, !cursor->c()->shouldDestroyOnNSDeletion());
        fassert(16960, !cursor->c()->requiresLock());

        try {
            // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
            BSONArrayBuilder resultsArray;
            const int byteLimit = MaxBytesToReturnToClientAtOnce;
            for (int objs = 0;
                    objs < batchSize && cursor->ok() && resultsArray.len() <= byteLimit;
                    objs++) {
                // TODO may need special logic if cursor->current() would cause results to be > 16MB
                resultsArray.append(cursor->current());
                cursor->advance();
            }

            // The initial ok() on a cursor may be very expensive so we don't do it when batchSize
            // is 0 since that indicates a desire for a fast return.
            if (batchSize != 0 && !cursor->ok()) {
                // There is no more data. Kill the cursor.
                pin.release();
                ClientCursor::erase(id);
                id = 0;
            }

            BSONObjBuilder cursorObj(result.subobjStart("cursor"));
            cursorObj.append("id", id);
            cursorObj.append("ns", cursor->ns());
            cursorObj.append("firstBatch", resultsArray.arr());
            cursorObj.done();
        }
        catch (...) {
            // Clean up cursor on way out of scope.
            pin.release();
            ClientCursor::erase(id);
            throw;
        }
    }
Esempio n. 21
0
    // ns is either a full namespace or "dbname." when invalidating for a whole db
    void ClientCursor::invalidate(const StringData &ns) {
        Lock::assertWriteLocked(ns);
        size_t dotpos = ns.find('.');
        verify(dotpos != string::npos);
        bool isDB = (dotpos + 1) == ns.size(); // first (and only) dot is the last char

        {
            //cout << "\nTEMP invalidate " << ns << endl;
            Database *db = cc().database();
            verify(db);
            verify( ns.startsWith(db->name()) );

            for( LockedIterator i; i.ok(); ) {
                ClientCursor *cc = i.current();

                bool shouldDelete = false;
                if (cc->c()->shouldDestroyOnNSDeletion() && cc->_db == db) {
                    if (isDB) {
                        // already checked that db matched above
                        dassert( StringData(cc->_ns).startsWith(ns) );
                        shouldDelete = true;
                    }
                    else {
                        if ( ns == cc->_ns )
                            shouldDelete = true;
                    }
                }

                if ( shouldDelete ) {
                    i.deleteAndAdvance();
                }
                else {
                    i.advance();
                }
            }
        }
    }
Esempio n. 22
0
/**
 * Returns true if we need to keep a ClientCursor saved for this pipeline (for future getMore
 * requests).  Otherwise, returns false.
 */
static bool handleCursorCommand(OperationContext* txn,
                                const string& ns,
                                ClientCursorPin* pin,
                                PlanExecutor* exec,
                                const BSONObj& cmdObj,
                                BSONObjBuilder& result) {
    ClientCursor* cursor = pin ? pin->c() : NULL;
    if (pin) {
        invariant(cursor);
        invariant(cursor->getExecutor() == exec);
        invariant(cursor->isAggCursor());
    }

    const long long defaultBatchSize = 101;  // Same as query.
    long long batchSize;
    uassertStatusOK(Command::parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize));

    // can't use result BSONObjBuilder directly since it won't handle exceptions correctly.
    BSONArrayBuilder resultsArray;
    BSONObj next;
    for (int objCount = 0; objCount < batchSize; objCount++) {
        // The initial getNext() on a PipelineProxyStage may be very expensive so we don't
        // do it when batchSize is 0 since that indicates a desire for a fast return.
        PlanExecutor::ExecState state;
        if ((state = exec->getNext(&next, NULL)) == PlanExecutor::IS_EOF) {
            // make it an obvious error to use cursor or executor after this point
            cursor = NULL;
            exec = NULL;
            break;
        }

        uassert(34426,
                "Plan executor error during aggregation: " + WorkingSetCommon::toStatusString(next),
                PlanExecutor::ADVANCED == state);

        // If adding this object will cause us to exceed the message size limit, then we stash it
        // for later.
        if (!FindCommon::haveSpaceForNext(next, objCount, resultsArray.len())) {
            exec->enqueue(next);
            break;
        }

        resultsArray.append(next);
    }

    // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should
    // be relatively quick since if there was no pin then the input is empty. Also, this
    // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that
    // case. This is ok for now however, since you can't have a sharded collection that doesn't
    // exist.
    const bool canReturnMoreBatches = pin;
    if (!canReturnMoreBatches && exec && !exec->isEOF()) {
        // msgasserting since this shouldn't be possible to trigger from today's aggregation
        // language. The wording assumes that the only reason pin would be null is if the
        // collection doesn't exist.
        msgasserted(
            17391,
            str::stream() << "Aggregation has more results than fit in initial batch, but can't "
                          << "create cursor since collection " << ns << " doesn't exist");
    }

    if (cursor) {
        // If a time limit was set on the pipeline, remaining time is "rolled over" to the
        // cursor (for use by future getmore ops).
        cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros());

        CurOp::get(txn)->debug().cursorid = cursor->cursorid();

        // Cursor needs to be in a saved state while we yield locks for getmore. State
        // will be restored in getMore().
        exec->saveState();
        exec->detachFromOperationContext();
    }

    const long long cursorId = cursor ? cursor->cursorid() : 0LL;
    appendCursorResponseObject(cursorId, ns, resultsArray.arr(), &result);

    return static_cast<bool>(cursor);
}
Esempio n. 23
0
    virtual bool run(OperationContext* txn,
                     const string& db,
                     BSONObj& cmdObj,
                     int options,
                     string& errmsg,
                     BSONObjBuilder& result) {
        const std::string ns = parseNs(db, cmdObj);
        if (nsToCollectionSubstring(ns).empty()) {
            errmsg = "missing collection name";
            return false;
        }
        NamespaceString nss(ns);

        intrusive_ptr<ExpressionContext> pCtx = new ExpressionContext(txn, nss);
        pCtx->tempDir = storageGlobalParams.dbpath + "/_tmp";

        /* try to parse the command; if this fails, then we didn't run */
        intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx);
        if (!pPipeline.get())
            return false;

        // Save and reset the write concern so that it doesn't get changed accidentally by
        // DBDirectClient.
        auto oldWC = txn->getWriteConcern();
        ON_BLOCK_EXIT([txn, oldWC] { txn->setWriteConcern(oldWC); });

        // This is outside of the if block to keep the object alive until the pipeline is finished.
        BSONObj parsed;
        if (kDebugBuild && !pPipeline->isExplain() && !pCtx->inShard) {
            // Make sure all operations round-trip through Pipeline::toBson() correctly by
            // reparsing every command in debug builds. This is important because sharded
            // aggregations rely on this ability.  Skipping when inShard because this has
            // already been through the transformation (and this unsets pCtx->inShard).
            parsed = pPipeline->serialize().toBson();
            pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx);
            verify(pPipeline);
        }

        unique_ptr<ClientCursorPin> pin;  // either this OR the exec will be non-null
        unique_ptr<PlanExecutor> exec;
        {
            // This will throw if the sharding version for this connection is out of date. The
            // lock must be held continuously from now until we have we created both the output
            // ClientCursor and the input executor. This ensures that both are using the same
            // sharding version that we synchronize on here. This is also why we always need to
            // create a ClientCursor even when we aren't outputting to a cursor. See the comment
            // on ShardFilterStage for more details.
            AutoGetCollectionForRead ctx(txn, nss.ns());

            Collection* collection = ctx.getCollection();

            // This does mongod-specific stuff like creating the input PlanExecutor and adding
            // it to the front of the pipeline if needed.
            std::shared_ptr<PlanExecutor> input =
                PipelineD::prepareCursorSource(txn, collection, nss, pPipeline, pCtx);
            pPipeline->stitch();

            if (collection && input) {
                // Record the indexes used by the input executor. Retrieval of summary stats for a
                // PlanExecutor is normally done post execution. DocumentSourceCursor however will
                // destroy the input PlanExecutor once the result set has been exhausted. For
                // that reason we need to collect the indexes used prior to plan execution.
                PlanSummaryStats stats;
                Explain::getSummaryStats(*input, &stats);
                collection->infoCache()->notifyOfQuery(txn, stats.indexesUsed);

                // TODO SERVER-23265: Confirm whether this is the correct place to gather all
                // metrics. There is no harm adding here for the time being.
                CurOp::get(txn)->debug().setPlanSummaryMetrics(stats);
            }

            // Create the PlanExecutor which returns results from the pipeline. The WorkingSet
            // ('ws') and the PipelineProxyStage ('proxy') will be owned by the created
            // PlanExecutor.
            auto ws = make_unique<WorkingSet>();
            auto proxy = make_unique<PipelineProxyStage>(txn, pPipeline, input, ws.get());

            auto statusWithPlanExecutor = (NULL == collection)
                ? PlanExecutor::make(
                      txn, std::move(ws), std::move(proxy), nss.ns(), PlanExecutor::YIELD_MANUAL)
                : PlanExecutor::make(
                      txn, std::move(ws), std::move(proxy), collection, PlanExecutor::YIELD_MANUAL);
            invariant(statusWithPlanExecutor.isOK());
            exec = std::move(statusWithPlanExecutor.getValue());

            if (!collection && input) {
                // If we don't have a collection, we won't be able to register any executors, so
                // make sure that the input PlanExecutor (likely wrapping an EOFStage) doesn't
                // need to be registered.
                invariant(!input->collection());
            }

            if (collection) {
                const bool isAggCursor = true;  // enable special locking behavior
                ClientCursor* cursor =
                    new ClientCursor(collection->getCursorManager(),
                                     exec.release(),
                                     nss.ns(),
                                     txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(),
                                     0,
                                     cmdObj.getOwned(),
                                     isAggCursor);
                pin.reset(new ClientCursorPin(collection->getCursorManager(), cursor->cursorid()));
                // Don't add any code between here and the start of the try block.
            }

            // At this point, it is safe to release the collection lock.
            // - In the case where we have a collection: we will need to reacquire the
            //   collection lock later when cleaning up our ClientCursorPin.
            // - In the case where we don't have a collection: our PlanExecutor won't be
            //   registered, so it will be safe to clean it up outside the lock.
            invariant(NULL == exec.get() || NULL == exec->collection());
        }

        try {
            // Unless set to true, the ClientCursor created above will be deleted on block exit.
            bool keepCursor = false;

            const bool isCursorCommand = !cmdObj["cursor"].eoo();

            // If both explain and cursor are specified, explain wins.
            if (pPipeline->isExplain()) {
                result << "stages" << Value(pPipeline->writeExplainOps());
            } else if (isCursorCommand) {
                keepCursor = handleCursorCommand(txn,
                                                 nss.ns(),
                                                 pin.get(),
                                                 pin ? pin->c()->getExecutor() : exec.get(),
                                                 cmdObj,
                                                 result);
            } else {
                pPipeline->run(result);
            }

            // Clean up our ClientCursorPin, if needed.  We must reacquire the collection lock
            // in order to do so.
            if (pin) {
                // We acquire locks here with DBLock and CollectionLock instead of using
                // AutoGetCollectionForRead.  AutoGetCollectionForRead will throw if the
                // sharding version is out of date, and we don't care if the sharding version
                // has changed.
                Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS);
                Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS);
                if (keepCursor) {
                    pin->release();
                } else {
                    pin->deleteUnderlying();
                }
            }
        } catch (...) {
            // On our way out of scope, we clean up our ClientCursorPin if needed.
            if (pin) {
                Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS);
                Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS);
                pin->deleteUnderlying();
            }
            throw;
        }
        // Any code that needs the cursor pinned must be inside the try block, above.

        return true;
    }
Esempio n. 24
0
    bool run(OperationContext* txn,
             const string& dbname,
             BSONObj& cmdObj,
             int,
             string& errmsg,
             BSONObjBuilder& result) {
        BSONElement first = cmdObj.firstElement();
        uassert(28528,
                str::stream() << "Argument to listIndexes must be of type String, not "
                              << typeName(first.type()),
                first.type() == String);
        StringData collectionName = first.valueStringData();
        uassert(28529,
                str::stream() << "Argument to listIndexes must be a collection name, "
                              << "not the empty string",
                !collectionName.empty());
        const NamespaceString ns(dbname, collectionName);

        const long long defaultBatchSize = std::numeric_limits<long long>::max();
        long long batchSize;
        Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize);
        if (!parseCursorStatus.isOK()) {
            return appendCommandStatus(result, parseCursorStatus);
        }

        AutoGetCollectionForRead autoColl(txn, ns);
        if (!autoColl.getDb()) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no database"));
        }

        const Collection* collection = autoColl.getCollection();
        if (!collection) {
            return appendCommandStatus(result,
                                       Status(ErrorCodes::NamespaceNotFound, "no collection"));
        }

        const CollectionCatalogEntry* cce = collection->getCatalogEntry();
        invariant(cce);

        vector<string> indexNames;
        MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
            indexNames.clear();
            cce->getAllIndexes(txn, &indexNames);
        }
        MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

        std::unique_ptr<WorkingSet> ws(new WorkingSet());
        std::unique_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get()));

        for (size_t i = 0; i < indexNames.size(); i++) {
            BSONObj indexSpec;
            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
                indexSpec = cce->getIndexSpec(txn, indexNames[i]);
            }
            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns());

            WorkingSetID id = ws->allocate();
            WorkingSetMember* member = ws->get(id);
            member->keyData.clear();
            member->loc = RecordId();
            member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned());
            member->transitionToOwnedObj();
            root->pushBack(id);
        }

        std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "."
                                                    << ns.coll();
        dassert(NamespaceString(cursorNamespace).isValid());
        dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS());
        dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes());

        auto statusWithPlanExecutor = PlanExecutor::make(
            txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL);
        if (!statusWithPlanExecutor.isOK()) {
            return appendCommandStatus(result, statusWithPlanExecutor.getStatus());
        }
        std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue());

        BSONArrayBuilder firstBatch;

        const int byteLimit = MaxBytesToReturnToClientAtOnce;
        for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit;
             objCount++) {
            BSONObj next;
            PlanExecutor::ExecState state = exec->getNext(&next, NULL);
            if (state == PlanExecutor::IS_EOF) {
                break;
            }
            invariant(state == PlanExecutor::ADVANCED);
            firstBatch.append(next);
        }

        CursorId cursorId = 0LL;
        if (!exec->isEOF()) {
            exec->saveState();
            ClientCursor* cursor = new ClientCursor(
                CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace);
            cursorId = cursor->cursorid();
        }

        appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result);

        return true;
    }
Esempio n. 25
0
 void ClientCursor::LockedIterator::deleteAndAdvance() {
     ClientCursor *cc = current();
     CursorId id = cc->cursorid();
     delete cc;
     _i = clientCursorsById.upper_bound( id );
 }
Esempio n. 26
0
    /* must call this on a delete so we clean up the cursors. */
    void ClientCursor::aboutToDelete(const DiskLoc& dl) {
        NoPageFaultsAllowed npfa;

        recursive_scoped_lock lock(ccmutex);

        Database *db = cc().database();
        verify(db);

        aboutToDeleteForSharding( db , dl );

        CCByLoc& bl = db->ccByLoc;
        CCByLoc::iterator j = bl.lower_bound(ByLocKey::min(dl));
        CCByLoc::iterator stop = bl.upper_bound(ByLocKey::max(dl));
        if ( j == stop )
            return;

        vector<ClientCursor*> toAdvance;

        while ( 1 ) {
            toAdvance.push_back(j->second);
            DEV verify( j->first.loc == dl );
            ++j;
            if ( j == stop )
                break;
        }

        if( toAdvance.size() >= 3000 ) {
            log() << "perf warning MPW101: " << toAdvance.size() << " cursors for one diskloc "
                  << dl.toString()
                  << ' ' << toAdvance[1000]->_ns
                  << ' ' << toAdvance[2000]->_ns
                  << ' ' << toAdvance[1000]->_pinValue
                  << ' ' << toAdvance[2000]->_pinValue
                  << ' ' << toAdvance[1000]->_pos
                  << ' ' << toAdvance[2000]->_pos
                  << ' ' << toAdvance[1000]->_idleAgeMillis
                  << ' ' << toAdvance[2000]->_idleAgeMillis
                  << ' ' << toAdvance[1000]->_doingDeletes
                  << ' ' << toAdvance[2000]->_doingDeletes
                  << endl;
            //wassert( toAdvance.size() < 5000 );
        }

        for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ) {
            ClientCursor* cc = *i;
            wassert(cc->_db == db);

            if ( cc->_doingDeletes ) continue;

            Cursor *c = cc->_c.get();
            if ( c->capped() ) {
                /* note we cannot advance here. if this condition occurs, writes to the oplog
                   have "caught" the reader.  skipping ahead, the reader would miss postentially
                   important data.
                   */
                delete cc;
                continue;
            }

            c->recoverFromYield();
            DiskLoc tmp1 = c->refLoc();
            if ( tmp1 != dl ) {
                // This might indicate a failure to call ClientCursor::prepareToYield() but it can
                // also happen during correct operation, see SERVER-2009.
                problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl;
            }
            else {
                c->advance();
            }
            while (!c->eof() && c->refLoc() == dl) {
                /* We don't delete at EOF because we want to return "no more results" rather than "no such cursor".
                 * The loop is to handle MultiKey indexes where the deleted record is pointed to by multiple adjacent keys.
                 * In that case we need to advance until we get to the next distinct record or EOF.
                 * SERVER-4154
                 * SERVER-5198
                 * But see SERVER-5725.
                 */
                c->advance();
            }
            cc->updateLocation();
        }
    }
Esempio n. 27
0
    std::string newRunQuery(OperationContext* txn,
                            Message& m,
                            QueryMessage& q,
                            CurOp& curop,
                            Message &result,
                            bool fromDBDirectClient) {
        // Validate the namespace.
        const char *ns = q.ns;
        uassert(16332, "can't have an empty ns", ns[0]);

        const NamespaceString nsString(ns);
        uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid());

        // Set curop information.
        curop.debug().ns = ns;
        curop.debug().ntoreturn = q.ntoreturn;
        curop.debug().query = q.query;
        curop.setQuery(q.query);

        // If the query is really a command, run it.
        if (nsString.isCommand()) {
            int nToReturn = q.ntoreturn;
            uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn
                                         << ") for $cmd type ns - can only be 1 or -1",
                    nToReturn == 1 || nToReturn == -1);

            curop.markCommand();

            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder cmdResBuf;
            if (!runCommands(txn, ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) {
                uasserted(13530, "bad or malformed command request?");
            }

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        const NamespaceString nss(q.ns);

        // Parse the qm into a CanonicalQuery.
        CanonicalQuery* cq;
        Status canonStatus = CanonicalQuery::canonicalize(
                                    q, &cq, WhereCallbackReal(txn, StringData(nss.db())));
        if (!canonStatus.isOK()) {
            uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString());
        }

        QLOG() << "Running query:\n" << cq->toString();
        LOG(2) << "Running query: " << cq->toStringShort();

        // Parse, canonicalize, plan, transcribe, and get a plan executor.
        PlanExecutor* rawExec = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        AutoGetCollectionForRead ctx(txn, nss);

        const int dbProfilingLevel = (ctx.getDb() != NULL) ? ctx.getDb()->getProfilingLevel() :
                                                             serverGlobalParams.defaultProfile;

        Collection* collection = ctx.getCollection();

        // We'll now try to get the query executor that will execute this query for us. There
        // are a few cases in which we know upfront which executor we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we use an EOFStage.
        //
        // (b) if the query is a replication's initial sync one, we use a specifically designed
        // stage that skips extents faster (see details in exec/oplogstart.h).
        //
        // Otherwise we go through the selection of which executor is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (NULL != collection && pq.getOptions().oplogReplay) {
            // Takes ownership of 'cq'.
            status = getOplogStartHack(txn, collection, cq, &rawExec);
        }
        else {
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            // Takes ownership of 'cq'.
            status = getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, options);
        }

        if (!status.isOK()) {
            // NOTE: Do not access cq as getExecutor has deleted it.
            uasserted(17007, "Unable to execute query: " + status.reason());
        }

        verify(NULL != rawExec);
        auto_ptr<PlanExecutor> exec(rawExec);

        // If it's actually an explain, do the explain and return rather than falling through
        // to the normal query execution loop.
        if (pq.isExplain()) {
            BufBuilder bb;
            bb.skip(sizeof(QueryResult::Value));

            BSONObjBuilder explainBob;
            Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob);

            // Add the resulting object to the return buffer.
            BSONObj explainObj = explainBob.obj();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            // Set query result fields.
            QueryResult::View qr = bb.buf();
            bb.decouple();
            qr.setResultFlagsToOk();
            qr.msgdata().setLen(bb.len());
            curop.debug().responseLength = bb.len();
            qr.msgdata().setOperation(opReply);
            qr.setCursorId(0);
            qr.setStartingFrom(0);
            qr.setNReturned(1);
            result.setData(qr.view2ptr(), true);
            return "";
        }

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        bool slaveOK = pq.getOptions().slaveOk || pq.hasReadPref();
        status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                NamespaceString(cq->ns()),
                slaveOK);
        uassertStatusOK(status);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult::Value));

        // How many results have we obtained from the executor?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the PlanExecutor in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        BSONObj obj;
        PlanExecutor::ExecState state;
        // uint64_t numMisplacedDocs = 0;

        // Get summary info about which plan the executor is using.
        curop.debug().planSummary = Explain::getPlanSummary(exec.get());

        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.getOptions().oplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (!supportsGetMore && (enough(pq, numResults)
                                     || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " executor EOF=" << exec->isEOF() << endl;
                    saveClientCursor = !exec->isEOF();
                }
                break;
            }
        }

        // If we cache the executor later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the executor later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the executor.
        exec->deregisterExec();

        // Caller expects exceptions thrown in certain cases.
        if (PlanExecutor::EXEC_ERROR == state) {
            scoped_ptr<PlanStageStats> stats(exec->getStats());
            error() << "Plan executor error, stats: "
                    << Explain::statsToBSON(*stats);
            uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj));
        }

        // Why save a dead executor?
        if (PlanExecutor::DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.getOptions().tailable) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of the tailable cursor is that the
            // client will keep trying to read from it. So we'll keep it around.
            if (collection && collection->numRecords(txn) != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data and its safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery;
        const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1);

        PlanSummaryStats summaryStats;
        Explain::getSummaryStats(exec.get(), &summaryStats);

        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;
        curop.debug().scanAndOrder = summaryStats.hasSortStage;
        curop.debug().nscanned = summaryStats.totalKeysExamined;
        curop.debug().nscannedObjects = summaryStats.totalDocsExamined;
        curop.debug().idhack = summaryStats.isIdhack;

        // Set debug information for consumption by the profiler.
        if (dbProfilingLevel > 0 ||
            curop.elapsedMillis() > serverGlobalParams.slowMS ||
            logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) {
            // Get BSON stats.
            scoped_ptr<PlanStageStats> execStats(exec->getStats());
            BSONObjBuilder statsBob;
            Explain::statsToBSON(*execStats, &statsBob);
            curop.debug().execStats.set(statsBob.obj());

            // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj.
            if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) {
                BSONObjBuilder bob;
                bob.append("summary", curop.debug().planSummary.toString());
                curop.debug().execStats.set(bob.done());
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the executor until it's getMore'd.
            exec->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(collection, exec.get(),
                                                cq->getParsed().getOptions().toInt(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            if (fromDBDirectClient) {
                cc->setUnownedRecoveryUnit(txn->recoveryUnit());
            }
            else if (state == PlanExecutor::IS_EOF && pq.getOptions().tailable) {
                // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their
                // next getMore.
            }
            else {
                // We stash away the RecoveryUnit in the ClientCursor.  It's used for subsequent
                // getMore requests.  The calling OpCtx gets a fresh RecoveryUnit.
                cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit());
                StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
                txn->setRecoveryUnit(storageEngine->newRecoveryUnit(txn));
            }

            QLOG() << "caching executor with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of executor.  Release to make sure it's not deleted.
            exec.release();

            // TODO document
            if (pq.getOptions().oplogReplay && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.getOptions().exhaust) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "Not caching executor but returning " << numResults << " results.\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult::View qr = result.header().view2ptr();
        qr.setCursorId(ccId);
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr.setResultFlagsToOk();
        qr.msgdata().setOperation(opReply);
        qr.setStartingFrom(0);
        qr.setNReturned(numResults);

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
Esempio n. 28
0
    /**
     * Called by db/instance.cpp.  This is the getMore entry point.
     *
     * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls 
     *        when this method returns an empty result, incrementing pass on each call.  
     *        Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'.
     */
    QueryResult::View newGetMore(OperationContext* txn,
                            const char* ns,
                            int ntoreturn,
                            long long cursorid,
                            CurOp& curop,
                            int pass,
                            bool& exhaust,
                            bool* isCursorAuthorized,
                            bool fromDBDirectClient) {

        // For testing, we may want to fail if we receive a getmore.
        if (MONGO_FAIL_POINT(failReceivedGetmore)) {
            invariant(0);
        }

        exhaust = false;

        // This is a read lock.
        const NamespaceString nss(ns);
        scoped_ptr<AutoGetCollectionForRead> ctx(new AutoGetCollectionForRead(txn, nss));
        Collection* collection = ctx->getCollection();
        uassert( 17356, "collection dropped between getMore calls", collection );

        QLOG() << "Running getMore, cursorid: " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
                txn,
                nss,
                true);
        uassertStatusOK(status);

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(collection, cursorid);
        ClientCursor* cc = ccPin.c();

        // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
        // used to create the execution machinery inside the cursor with our OperationContext.
        // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
        // that further calls to getMore won't fail, and that the provided OperationContext
        // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
        //
        // This must be destroyed before the ClientCursor is destroyed.
        std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        const int InitialBufSize =
            512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(InitialBufSize);
        bb.skip(sizeof(QueryResult::Value));

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // Restore the RecoveryUnit if we need to.
            if (fromDBDirectClient) {
                if (cc->hasRecoveryUnit())
                    invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
            }
            else {
                if (!cc->hasRecoveryUnit()) {
                    // Start using a new RecoveryUnit
                    cc->setOwnedRecoveryUnit(
                        getGlobalEnvironment()->getGlobalStorageEngine()->newRecoveryUnit(txn));

                }
                // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
                ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
            }

            // Reset timeout timer on the cursor since the cursor is still in use.
            cc->setIdleTime(0);

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if (0 == pass) { 
                cc->updateSlaveLocation(txn, curop); 
            }

            if (cc->isAggCursor) {
                // Agg cursors handle their own locking internally.
                ctx.reset(); // unlocks
            }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            PlanExecutor* exec = cc->getExecutor();
            const int queryOptions = cc->queryOptions();

            // Get results out of the executor.
            exec->restoreState(txn);

            BSONObj obj;
            PlanExecutor::ExecState state;
            while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            // We save the client cursor when there might be more results, and hence we may receive
            // another getmore. If we receive a EOF or an error, or 'exec' is dead, then we know
            // that we will not be producing more results. We indicate that the cursor is closed by
            // sending a cursorId of 0 back to the client.
            //
            // On the other hand, if we retrieve all results necessary for this batch, then
            // 'saveClientCursor' is true and we send a valid cursorId back to the client. In
            // this case, there may or may not actually be more results (for example, the next call
            // to getNext(...) might just return EOF).
            bool saveClientCursor = false;

            if (PlanExecutor::DEAD == state || PlanExecutor::EXEC_ERROR == state) {
                // Propagate this error to caller.
                if (PlanExecutor::EXEC_ERROR == state) {
                    scoped_ptr<PlanStageStats> stats(exec->getStats());
                    error() << "Plan executor error, stats: "
                            << Explain::statsToBSON(*stats);
                    uasserted(17406, "getMore executor error: " +
                              WorkingSetCommon::toStatusString(obj));
                }

                // If we're dead there's no way to get more results.
                saveClientCursor = false;

                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                //
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }
            else if (PlanExecutor::IS_EOF == state) {
                // EOF is also end of the line unless it's tailable.
                saveClientCursor = queryOptions & QueryOption_CursorTailable;
            }
            else {
                verify(PlanExecutor::ADVANCED == state);
                saveClientCursor = true;
            }

            if (!saveClientCursor) {
                ruSwapper.reset();
                ccPin.deleteUnderlying();
                // cc is now invalid, as is the executor
                cursorid = 0;
                cc = NULL;
                QLOG() << "getMore NOT saving client cursor, ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                exec->saveState();
                QLOG() << "getMore saving client cursor ended with state "
                       << PlanExecutor::statestr(state)
                       << endl;

                if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                    if (!fromDBDirectClient) {
                        // Don't stash the RU. Get a new one on the next getMore.
                        ruSwapper.reset();
                        delete cc->releaseOwnedRecoveryUnit();
                    }

                    if ((queryOptions & QueryOption_AwaitData)
                            && (numResults == 0)
                            && (pass < 1000)) {
                        // Bubble up to the AwaitData handling code in receivedGetMore which will
                        // try again.
                        return NULL;
                    }
                }

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult::View qr = bb.buf();
        qr.msgdata().setLen(bb.len());
        qr.msgdata().setOperation(opReply);
        qr.setResultFlags(resultFlags);
        qr.setCursorId(cursorid);
        qr.setStartingFrom(startingResult);
        qr.setNReturned(numResults);
        bb.decouple();
        QLOG() << "getMore returned " << numResults << " results\n";
        return qr;
    }
        virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options,
                          string& errmsg, BSONObjBuilder& result,
                          bool fromRepl = false ) {

            NamespaceString ns( dbname, cmdObj[name].String() );

            AutoGetCollectionForRead ctx(txn, ns.ns());

            Collection* collection = ctx.getCollection();
            if ( !collection )
                return appendCommandStatus( result,
                                            Status( ErrorCodes::NamespaceNotFound,
                                                    str::stream() <<
                                                    "ns does not exist: " << ns.ns() ) );

            size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() );

            if ( numCursors == 0 || numCursors > 10000 )
                return appendCommandStatus( result,
                                            Status( ErrorCodes::BadValue,
                                                    str::stream() <<
                                                    "numCursors has to be between 1 and 10000" <<
                                                    " was: " << numCursors ) );

            OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn));

            if (iterators.size() < numCursors) {
                numCursors = iterators.size();
            }

            OwnedPointerVector<PlanExecutor> execs;
            for ( size_t i = 0; i < numCursors; i++ ) {
                WorkingSet* ws = new WorkingSet();
                MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection);

                PlanExecutor* rawExec;
                // Takes ownership of 'ws' and 'mis'.
                Status execStatus = PlanExecutor::make(txn, ws, mis, collection,
                                                       PlanExecutor::YIELD_AUTO, &rawExec);
                invariant(execStatus.isOK());
                auto_ptr<PlanExecutor> curExec(rawExec);

                // The PlanExecutor was registered on construction due to the YIELD_AUTO policy.
                // We have to deregister it, as it will be registered with ClientCursor.
                curExec->deregisterExec();

                // Need to save state while yielding locks between now and getMore().
                curExec->saveState();

                execs.push_back(curExec.release());
            }

            // transfer iterators to executors using a round-robin distribution.
            // TODO consider using a common work queue once invalidation issues go away.
            for (size_t i = 0; i < iterators.size(); i++) {
                PlanExecutor* theExec = execs[i % execs.size()];
                MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage());

                // This wasn't called above as they weren't assigned yet
                iterators[i]->saveState();

                mis->addIterator(iterators.releaseAt(i));
            }

            {
                BSONArrayBuilder bucketsBuilder;
                for (size_t i = 0; i < execs.size(); i++) {
                    // transfer ownership of an executor to the ClientCursor (which manages its own
                    // lifetime).
                    ClientCursor* cc = new ClientCursor( collection->getCursorManager(),
                                                         execs.releaseAt(i),
                                                         ns.ns() );

                    BSONObjBuilder threadResult;
                    appendCursorResponseObject( cc->cursorid(),
                                                ns.ns(),
                                                BSONArray(),
                                                &threadResult );
                    threadResult.appendBool( "ok", 1 );

                    bucketsBuilder.append( threadResult.obj() );
                }
                result.appendArray( "cursors", bucketsBuilder.obj() );
            }

            return true;

        }
Esempio n. 30
0
    /**
     * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
     */
    std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) {
        QLOG() << "Running query on new system: " << cq->toString();

        // This is a read lock.
        Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath);

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // Need to call cq->toString() now, since upon error getRunner doesn't guarantee
        // cq is in a consistent state.
        string cqStr = cq->toString();

        // We'll now try to get the query runner that will execute this query for us. There
        // are a few cases in which we know upfront which runner we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we get a special runner
        // that's is so (a runner) which doesn't return results, the EOFRunner.
        //
        // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner
        // that uses a specifically designed stage that skips extents faster (see details in
        // exec/oplogstart.h)
        //
        // Otherwise we go through the selection of which runner is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) {
            rawRunner = new EOFRunner(cq, cq->ns());
        }
        else if (pq.hasOption(QueryOption_OplogReplay)) {
            status = getOplogStartHack(cq, &rawRunner);
        }
        else {
            // Takes ownership of cq.
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            status = getRunner(cq, &rawRunner, options);
        }

        if (!status.isOK()) {
            uasserted(17007, "Couldn't get runner for query because: " + status.reason() + " query is " + cqStr);
        }

        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here.  The runner registers itself with the
        // active runners list in ClientCursor.
        ClientCursor::registerRunner(runner.get());
        runner->setYieldPolicy(Runner::YIELD_AUTO);
        auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety(
            new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

        BSONObj obj;
        Runner::RunnerState state;
        // uint64_t numMisplacedDocs = 0;

        // set this outside loop. we will need to use this both within loop and when deciding
        // to fill in explain information
        const bool isExplain = pq.isExplain();

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer. This is unnecessary if explain info is requested
            if (!isExplain) {
                bb.appendBuf((void*)obj.objdata(), obj.objsize());
            }

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (isExplain) {
                if (enoughForExplain(pq, numResults)) {
                    break;
                }
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " runner EOF=" << runner->isEOF() << endl;
                    saveClientCursor = !runner->isEOF();
                }
                break;
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        safety.reset();

        // Caller expects exceptions thrown in certain cases:
        // * in-memory sort using too much RAM.
        if (Runner::RUNNER_ERROR == state) {
            uasserted(17144, "Runner error, memory limit for sort probably exceeded");
        }

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.hasOption(QueryOption_CursorTailable)) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of the tailable cursor is that the
            // client will keep trying to read from it. So we'll keep it around.
            Collection* collection = ctx.ctx().db()->getCollection(cq->ns());
            if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data and its safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        // Append explain information to query results by asking the runner to produce them.
        if (isExplain) {
            TypeExplain* bareExplain;
            Status res = runner->getExplainPlan(&bareExplain);

            if (!res.isOK()) {
                error() << "could not produce explain of query '" << pq.getFilter()
                        << "', error: " << res.reason();
                // If numResults and the data in bb don't correspond, we'll crash later when rooting
                // through the reply msg.
                BSONObj emptyObj;
                bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());
                // The explain output is actually a result.
                numResults = 1;
                // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
            }
            else {
                boost::scoped_ptr<TypeExplain> explain(bareExplain);

                // Fill in the missing run-time fields in explain, starting with propeties of
                // the process running the query.
                std::string server = mongoutils::str::stream()
                    << getHostNameCached() << ":" << serverGlobalParams.port;
                explain->setServer(server);

                // We might have skipped some results due to chunk migration etc. so our count is
                // correct.
                explain->setN(numResults);

                // Clock the whole operation.
                explain->setMillis(curop.elapsedMillis());

                BSONObj explainObj = explain->toBSON();
                bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

                // The explain output is actually a result.
                numResults = 1;
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            QLOG() << "caching runner with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "not caching runner but returning " << numResults << " results\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;

        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }