Ejemplo n.º 1
0
    bool handlePossibleShardedMessage( Message &m, DbResponse &dbresponse ){

        if ( shardConfigServer.empty() ){
            return false;
        }

        int op = m.data->operation();
        if ( op < 2000 || op >= 3000 )
            return false;

        
        const char *ns = m.data->_data + 4;
        string errmsg;
        if ( shardVersionOk( ns , errmsg ) ){
            return false;
        }

        log() << "shardVersionOk failed  ns:" << ns << " " << errmsg << endl;
        
        if ( doesOpGetAResponse( op ) ){
            BufBuilder b( 32768 );
            b.skip( sizeof( QueryResult ) );
            {
                BSONObj obj = BSON( "$err" << errmsg );
                b.append( obj.objdata() , obj.objsize() );
            }
            
            QueryResult *qr = (QueryResult*)b.buf();
            qr->_resultFlags() = QueryResult::ResultFlag_ErrSet | QueryResult::ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation( opReply );
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            b.decouple();

            Message * resp = new Message();
            resp->setData( qr , true );
            
            dbresponse.response = resp;
            dbresponse.responseTo = m.data->id;
            return true;
        }
        
        OID * clientID = clientServerIds.get();
        massert( 10422 ,  "write with bad shard config and no server id!" , clientID );
        
        log() << "got write with an old config - writing back" << endl;

        BSONObjBuilder b;
        b.appendBool( "writeBack" , true );
        b.append( "ns" , ns );
        b.appendBinData( "msg" , m.data->len , bdtCustom , (char*)(m.data) );
        log() << "writing back msg with len: " << m.data->len << " op: " << m.data->_operation << endl;
        clientQueues[clientID->str()]->push( b.obj() );

        return true;
    }
Ejemplo n.º 2
0
    static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ){
        bool ok = true;
        MSGID responseTo = m.header()->id;

        DbMessage d(m);
        QueryMessage q(d);
        auto_ptr< Message > resp( new Message() );

        CurOp& op = *(c.curop());
        
        try {
            dbresponse.exhaust = runQuery(m, q, op, *resp);
            assert( !resp->empty() );
        }
        catch ( AssertionException& e ) {
            ok = false;
            op.debug().str << " exception ";
            LOGSOME { 
                log() << "assertion " << e.toString() << " ns:" << q.ns << " query:" <<
                    (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
                if( q.ntoskip || q.ntoreturn )
                    log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
            }

            BSONObjBuilder err;
            e.getInfo().append( err );
            BSONObj errObj = err.done();

            BufBuilder b;
            b.skip(sizeof(QueryResult));
            b.appendBuf((void*) errObj.objdata(), errObj.objsize());

            // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
            QueryResult * msgdata = (QueryResult *) b.buf();
            b.decouple();
            QueryResult *qr = msgdata;
            qr->_resultFlags() = ResultFlag_ErrSet;
            if ( e.getCode() == StaleConfigInContextCode )
                qr->_resultFlags() |= ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            resp.reset( new Message() );
            resp->setData( msgdata, true );
        }

        if ( op.shouldDBProfile( 0 ) ){
            op.debug().str << " bytes:" << resp->header()->dataLen();
        }
        
        dbresponse.response = resp.release();
        dbresponse.responseTo = responseTo;
        
        return ok;
    }
Ejemplo n.º 3
0
 /* empty result for error conditions */
 QueryResult* emptyMoreResult(long long cursorid) {
     BufBuilder b(32768);
     b.skip(sizeof(QueryResult));
     QueryResult *qr = (QueryResult *) b.buf();
     qr->cursorId = 0; // 0 indicates no more data to retrieve.
     qr->startingFrom = 0;
     qr->len = b.len();
     qr->setOperation(opReply);
     qr->nReturned = 0;
     b.decouple();
     return qr;
 }
Ejemplo n.º 4
0
    void replyToQuery( int queryResultFlags, Message& response, const BSONObj& resultObj ) {
        BufBuilder bufBuilder;
        bufBuilder.skip( sizeof( QueryResult ));
        bufBuilder.appendBuf( reinterpret_cast< void *>(
                const_cast< char* >( resultObj.objdata() )), resultObj.objsize() );

        QueryResult* queryResult = reinterpret_cast< QueryResult* >( bufBuilder.buf() );
        bufBuilder.decouple();

        queryResult->_resultFlags() = queryResultFlags;
        queryResult->len = bufBuilder.len();
        queryResult->setOperation( opReply );
        queryResult->cursorId = 0;
        queryResult->startingFrom = 0;
        queryResult->nReturned = 1;

        response.setData( queryResult, true ); // transport will free
    }
Ejemplo n.º 5
0
 void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) {
     BufBuilder b;
     b.skip(sizeof(QueryResult));
     b.appendBuf((void*) obj.objdata(), obj.objsize());
     QueryResult* msgdata = (QueryResult *) b.buf();
     b.decouple();
     QueryResult *qr = msgdata;
     qr->_resultFlags() = queryResultFlags;
     qr->len = b.len();
     qr->setOperation(opReply);
     qr->cursorId = 0;
     qr->startingFrom = 0;
     qr->nReturned = 1;
     Message *resp = new Message();
     resp->setData(msgdata, true); // transport will free
     dbresponse.response = resp;
     dbresponse.responseTo = m.header()->id;
 }
Ejemplo n.º 6
0
 void replyToQuery(int queryResultFlags,
                   AbstractMessagingPort* p, Message& requestMsg,
                   void *data, int size,
                   int nReturned, int startingFrom,
                   long long cursorId 
                   ) {
     BufBuilder b(32768);
     b.skip(sizeof(QueryResult));
     b.appendBuf(data, size);
     QueryResult *qr = (QueryResult *) b.buf();
     qr->_resultFlags() = queryResultFlags;
     qr->len = b.len();
     qr->setOperation(opReply);
     qr->cursorId = cursorId;
     qr->startingFrom = startingFrom;
     qr->nReturned = nReturned;
     b.decouple();
     Message resp(qr, true);
     p->reply(requestMsg, resp, requestMsg.header()->id);
 }
Ejemplo n.º 7
0
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
        Client::ReadContext ctx(ns);

        QLOG() << "running getMore in new system, cursorid " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        replVerifyReadsOk();

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            Runner* runner = cc->getRunner();
            const int queryOptions = cc->queryOptions();

            // Get results out of the runner.
            runner->restoreState();

            BSONObj obj;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            if (Runner::RUNNER_EOF == state && 0 == numResults
                && (queryOptions & QueryOption_CursorTailable)
                && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
                // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
                // data some # of times first.
                return 0;
            }

            bool saveClientCursor = false;

            if (Runner::RUNNER_DEAD == state || Runner::RUNNER_ERROR == state) {
                // If we're dead there's no way to get more results.
                saveClientCursor = false;
                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                // 
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }
            else if (Runner::RUNNER_EOF == state) {
                // EOF is also end of the line unless it's tailable.
                saveClientCursor = queryOptions & QueryOption_CursorTailable;
            }
            else {
                verify(Runner::RUNNER_ADVANCED == state);
                saveClientCursor = true;
            }

            if (!saveClientCursor) {
                ccPin.deleteUnderlying();
                // cc is now invalid, as is the runner
                cursorid = 0;
                cc = NULL;
                QLOG() << "getMore NOT saving client cursor, ended w/state "
                       << Runner::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                runner->saveState();
                QLOG() << "getMore saving client cursor ended w/state "
                       << Runner::statestr(state)
                       << endl;

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        QLOG() << "getMore returned " << numResults << " results\n";
        return qr;
    }
Ejemplo n.º 8
0
    QueryResult* processGetMore(const char* ns,
                                int ntoreturn,
                                long long cursorid,
                                CurOp& curop,
                                int pass,
                                bool& exhaust,
                                bool* isCursorAuthorized ) {

        bool hasRunner = false;

        // Scoped to kill the pin after seeing if the runner's there.
        {
            // See if there's a runner.  We do this until agg. is behind a Runner instead of a CC.
            ClientCursorPin p(cursorid);
            ClientCursor *cc = p.c();
            if (NULL != cc && NULL != cc->getRunner()) {
                hasRunner = true;
            }
        }

        if (hasRunner) {
            return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust,
                              isCursorAuthorized);
        }

        exhaust = false;

        int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));
        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns));
        // call this readlocked so state can't change
        replVerifyReadsOk();

        ClientCursorPin p(cursorid);
        ClientCursor *cc = p.c();

        if ( unlikely(!cc) ) {
            LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Some internal users create a ClientCursor with a Runner.  Don't crash if this
            // happens.  Instead, hand them off to the new framework.
            if (NULL != cc->getRunner()) {
                p.release();
                return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, isCursorAuthorized);
            }

            // check for spoofing of the ns such that it does not match the one originally there for the cursor
            uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

            *isCursorAuthorized = true;

            // This must be done after auth check to ensure proper cleanup.
            uassert(16951, "failing getmore due to set failpoint",
                    !MONGO_FAIL_POINT(getMoreError));

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros( cc->getLeftoverMaxTimeMicros() );
            killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if ( pass == 0 )
                cc->updateSlaveLocation( curop );

            int queryOptions = cc->queryOptions();
            
            curop.debug().query = cc->query();
            curop.setQuery( cc->query() );

            start = cc->pos();
            Cursor *c = cc->c();

            if (!c->requiresLock()) {
                // make sure it won't be destroyed under us
                fassert(16952, !c->shouldDestroyOnNSDeletion());
                fassert(16953, !c->supportYields());
                ctx.reset(); // unlocks
            }

            c->recoverFromYield();
            DiskLoc last;

            // This metadata may be stale, but it's the state of chunking when the cursor was
            // created.
            CollectionMetadataPtr metadata = cc->getCollMetadata();
            KeyPattern keyPattern( metadata ? metadata->getKeyPattern() : BSONObj() );

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        // when a tailable cursor hits "EOF", ok() goes false, and current() is
                        // null.  however advance() can still be retries as a reactivation attempt.
                        // when there is new data, it will return true.  that's what we are doing
                        // here.
                        if ( c->advance() )
                            continue;

                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            return 0;
                        }

                        break;
                    }
                    p.release();
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    cc = 0;
                    break;
                }

                MatchDetails details;
                if ( cc->fields && cc->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                    // field projection specified, and contains an array operator
                    details.requestElemMatchKey();
                }

                // in some cases (clone collection) there won't be a matcher
                if ( !c->currentMatches( &details ) ) {
                }
                else if ( metadata && !metadata->keyBelongsToMe( extractKey(c, keyPattern ) ) ) {
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current()
                               << endl;
                }
                else {
                    if( c->getsetdup(c->currLoc()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        last = c->currLoc();
                        n++;

                        // Fill out the fields requested by the query.
                        const Projection::KeyOnly *keyFieldsOnly = c->keyFieldsOnly();
                        if ( keyFieldsOnly ) {
                            fillQueryResultFromObj( b, 0, keyFieldsOnly->hydrate(
                            c->currKey() ), &details );
                        }
                        else {
                            DiskLoc loc = c->currLoc();
                            fillQueryResultFromObj( b, cc->fields.get(), c->current(), &details,
                                    ( ( cc->pq.get() && cc->pq->showDiskLoc() ) ? &loc : 0 ) );
                        }

                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            cc->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();

                if ( ! cc->yieldSometimes( ( c->ok() && c->keyFieldsOnly() ) ?
                                          ClientCursor::DontNeed : ClientCursor::WillNeed ) ) {
                    ClientCursor::erase(cursorid);
                    cursorid = 0;
                    cc = 0;
                    break;
                }
            }
            
            if ( cc ) {
                if ( c->supportYields() ) {
                    ClientCursor::YieldData data;
                    verify( cc->prepareToYield( data ) );
                }
                else {
                    cc->c()->noteLocation();
                }
                cc->storeOpForSlave( last );
                exhaust = cc->queryOptions() & QueryOption_Exhaust;

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }
Ejemplo n.º 9
0
    static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) {
        bool ok = true;
        MSGID responseTo = m.header()->id;

        DbMessage d(m);
        QueryMessage q(d);
        auto_ptr< Message > resp( new Message() );

        CurOp& op = *(c.curop());

        shared_ptr<AssertionException> ex;

        try {
            if (!NamespaceString(d.getns()).isCommand()) {
                // Auth checking for Commands happens later.
                Status status = cc().getAuthorizationManager()->checkAuthForQuery(d.getns());
                uassert(16550, status.reason(), status.isOK());
            }
            dbresponse.exhaustNS = runQuery(m, q, op, *resp);
            verify( !resp->empty() );
        }
        catch ( SendStaleConfigException& e ){
            ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg, e.getVersionReceived(), e.getVersionWanted() ) );
            ok = false;
        }
        catch ( AssertionException& e ) {
            ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
            ok = false;
        }

        if( ex ){

            op.debug().exceptionInfo = ex->getInfo();
            LOGWITHRATELIMIT {
                log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" <<
                (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
                if( q.ntoskip || q.ntoreturn )
                    log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
            }

            SendStaleConfigException* scex = NULL;
            if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() );

            BSONObjBuilder err;
            ex->getInfo().append( err );
            if( scex ){
                err.append( "ns", scex->getns() );
                scex->getVersionReceived().addToBSON( err, "vReceived" );
                scex->getVersionWanted().addToBSON( err, "vWanted" );
            }
            BSONObj errObj = err.done();

            if( scex ){
                log() << "stale version detected during query over "
                      << q.ns << " : " << errObj << endl;
            }
            else{
                log() << "problem detected during query over "
                      << q.ns << " : " << errObj << endl;
            }

            BufBuilder b;
            b.skip(sizeof(QueryResult));
            b.appendBuf((void*) errObj.objdata(), errObj.objsize());

            // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
            QueryResult * msgdata = (QueryResult *) b.buf();
            b.decouple();
            QueryResult *qr = msgdata;
            qr->_resultFlags() = ResultFlag_ErrSet;
            if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            resp.reset( new Message() );
            resp->setData( msgdata, true );

        }

        op.debug().responseLength = resp->header()->dataLen();

        dbresponse.response = resp.release();
        dbresponse.responseTo = responseTo;

        return ok;
    }
Ejemplo n.º 10
0
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
        Client::ReadContext ctx(ns);

        // TODO: Document.
        replVerifyReadsOk();

        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            startingResult = cc->pos();

            Runner* runner = cc->getRunner();
            const ParsedQuery& pq = runner->getQuery().getParsed();

            // Get results out of the runner.
            // TODO: There may be special handling required for tailable cursors?
            runner->restoreState();
            BSONObj obj;
            // TODO: Differentiate EOF from error.
            while (runner->getNext(&obj)) {
                // If we're sharded make sure that we don't return any data that hasn't been
                // migrated off of our shard yet.
                if (collMetadata) {
                    KeyPattern kp(collMetadata->getKeyPattern());
                    if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
                }

                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (pq.hasOption(QueryOption_OplogReplay)) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((numResults && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            cc->incPos(numResults);
            runner->saveState();

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = pq.hasOption(QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }

        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        return qr;
    }
Ejemplo n.º 11
0
    bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) {
        DEV assert( shardingState.enabled() );

        int op = m.operation();
        if ( op < 2000
                || op >= 3000
                || op == dbGetMore  // cursors are weird
           )
            return false;

        DbMessage d(m);
        const char *ns = d.getns();
        string errmsg;
        // We don't care about the version here, since we're returning it later in the writeback
        ConfigVersion received, wanted;
        if ( shardVersionOk( ns , errmsg, received, wanted ) ) {
            return false;
        }

        LOG(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl;

        if ( doesOpGetAResponse( op ) ) {
            assert( dbresponse );
            BufBuilder b( 32768 );
            b.skip( sizeof( QueryResult ) );
            {
                BSONObj obj = BSON( "$err" << errmsg << "ns" << ns );
                b.appendBuf( obj.objdata() , obj.objsize() );
            }

            QueryResult *qr = (QueryResult*)b.buf();
            qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation( opReply );
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            b.decouple();

            Message * resp = new Message();
            resp->setData( qr , true );

            dbresponse->response = resp;
            dbresponse->responseTo = m.header()->id;
            return true;
        }
        
        uassert( 9517 , "writeback" , ( d.reservedField() & DbMessage::Reserved_FromWriteback ) == 0 );

        OID writebackID;
        writebackID.init();
        lastError.getSafe()->writeback( writebackID );

        const OID& clientID = ShardedConnectionInfo::get(false)->getID();
        massert( 10422 ,  "write with bad shard config and no server id!" , clientID.isSet() );

        LOG(1) << "got write with an old config - writing back ns: " << ns << endl;
        LOG(1) << m.toString() << endl;

        BSONObjBuilder b;
        b.appendBool( "writeBack" , true );
        b.append( "ns" , ns );
        b.append( "id" , writebackID );
        b.append( "connectionId" , cc().getConnectionId() );
        b.append( "instanceIdent" , prettyHostName() );
        b.appendTimestamp( "version" , shardingState.getVersion( ns ) );
        
        ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
        b.appendTimestamp( "yourVersion" , info ? info->getVersion(ns) : (ConfigVersion)0 );

        b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) );
        LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;
        writeBackManager.queueWriteBack( clientID.str() , b.obj() );

        return true;
    }
Ejemplo n.º 12
0
    void receivedQuery(DbResponse& dbresponse, /*AbstractMessagingPort& dbMsgPort, */Message& m, stringstream& ss, bool logit) {
        MSGID responseTo = m.data->id;

        DbMessage d(m);
        QueryMessage q(d);
        QueryResult* msgdata;

        try {
            if (q.fields.get() && q.fields->errmsg)
                uassert(q.fields->errmsg, false);

            /* note these are logged BEFORE authentication -- which is sort of ok */
            if ( _diaglog.level && logit ) {
                if ( strstr(q.ns, ".$cmd") ) {
                    /* $cmd queries are "commands" and usually best treated as write operations */
                    OPWRITE;
                }
                else {
                    OPREAD;
                }
            }

            setClient( q.ns );
            Client& client = cc();
            client.top.setRead();
            strncpy(client.curop()->ns, q.ns, Namespace::MaxNsLen);
            msgdata = runQuery(m, ss ).release();
        }
        catch ( AssertionException& e ) {
            ss << " exception ";
            LOGSOME problem() << " Caught Assertion in runQuery ns:" << q.ns << ' ' << e.toString() << '\n';
            log() << "  ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << '\n';
            if ( q.query.valid() )
                log() << "  query:" << q.query.toString() << endl;
            else
                log() << "  query object is not valid!" << endl;

            BSONObjBuilder err;
            err.append("$err", e.msg.empty() ? "assertion during query" : e.msg);
            BSONObj errObj = err.done();

            BufBuilder b;
            b.skip(sizeof(QueryResult));
            b.append((void*) errObj.objdata(), errObj.objsize());

            // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
            msgdata = (QueryResult *) b.buf();
            b.decouple();
            QueryResult *qr = msgdata;
            qr->resultFlags() = QueryResult::ResultFlag_ErrSet;
            qr->len = b.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;

        }
        Message *resp = new Message();
        resp->setData(msgdata, true); // transport will free
        dbresponse.response = resp;
        dbresponse.responseTo = responseTo;
        Database *database = cc().database();
        if ( database ) {
            if ( database->profile )
                ss << " bytes:" << resp->data->dataLen();
        }
        else {
            if ( strstr(q.ns, "$cmd") == 0 ) // (this condition is normal for $cmd dropDatabase)
                log() << "ERROR: receiveQuery: database is null; ns=" << q.ns << endl;
        }
    }
Ejemplo n.º 13
0
    QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
        exhaust = false;
        ClientCursor::Pointer p(cursorid);
        ClientCursor *cc = p.c();

        int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));
        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        if ( unlikely(!cc) ) {
            LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // check for spoofing of the ns such that it does not match the one originally there for the cursor
            uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

            if ( pass == 0 )
                cc->updateSlaveLocation( curop );

            int queryOptions = cc->queryOptions();
            
            curop.debug().query = cc->query();

            start = cc->pos();
            Cursor *c = cc->c();
            c->recoverFromYield();
            DiskLoc last;

            scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
            if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
                keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );

            // This manager may be stale, but it's the state of chunking when the cursor was created.
            ShardChunkManagerPtr manager = cc->getChunkManager();

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        /* when a tailable cursor hits "EOF", ok() goes false, and current() is null.  however
                           advance() can still be retries as a reactivation attempt.  when there is new data, it will
                           return true.  that's what we are doing here.
                           */
                        if ( c->advance() )
                            continue;

                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            return 0;
                        }

                        break;
                    }
                    p.release();
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    cc = 0;
                    break;
                }

                // in some cases (clone collection) there won't be a matcher
                if ( !c->currentMatches() ) {
                }
                else if ( manager && ! manager->belongsToMe( cc ) ){
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
                }
                else {
                    if( c->getsetdup(c->currLoc()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        last = c->currLoc();
                        n++;

                        if ( keyFieldsOnly ) {
                            fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
                        }
                        else {
                            BSONObj js = c->current();
                            // show disk loc should be part of the main query, not in an $or clause, so this should be ok
                            fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
                        }

                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            cc->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();

                if ( ! cc->yieldSometimes( ClientCursor::MaybeCovered ) ) {
                    ClientCursor::erase(cursorid);
                    cursorid = 0;
                    cc = 0;
                    p.deleted();
                    break;
                }
            }
            
            if ( cc ) {
                if ( c->supportYields() ) {
                    ClientCursor::YieldData data;
                    verify( cc->prepareToYield( data ) );
                }
                else {
                    cc->c()->noteLocation();
                }
                cc->mayUpgradeStorage();
                cc->storeOpForSlave( last );
                exhaust = cc->queryOptions() & QueryOption_Exhaust;
            }
        }

        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }
Ejemplo n.º 14
0
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
        Client::ReadContext ctx(ns);

        log() << "running getMore in new system, cursorid " << cursorid << endl;

        // TODO: Document.
        // TODO: do this when we can pass in our own parsed query
        //replVerifyReadsOk();

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            Runner* runner = cc->getRunner();
            const int queryOptions = cc->queryOptions();

            // Get results out of the runner.
            // TODO: There may be special handling required for tailable cursors?
            runner->restoreState();

            BSONObj obj;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
                // If we're sharded make sure that we don't return any data that hasn't been
                // migrated off of our shard yet.
                if (collMetadata) {
                    KeyPattern kp(collMetadata->getKeyPattern());
                    if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
                }

                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                if ((numResults && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            if (Runner::RUNNER_DEAD == state || Runner::RUNNER_EOF == state) {
                log() << "getMore(): runner with id " << cursorid << " EOF/DEAD, state = "
                      << static_cast<int>(state) << endl;
                // TODO: If the cursor is tailable we don't kill it if it's eof.
                ccPin.free();
                // cc is now invalid, as is the runner
                cursorid = 0;
                cc = NULL;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                runner->saveState();

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        return qr;
    }
Ejemplo n.º 15
0
    /**
     * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
     */
    string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        log() << "Running query on new system: " << q.query.toString() << endl;

        // This is a read lock.
        Client::ReadContext ctx(q.ns, dbpath);

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner;
        CanonicalQuery* cq;
        Status status = getRunner(q, &rawRunner, &cq);
        if (!status.isOK()) {
            uasserted(17007, "Couldn't process query " + q.query.toString()
                         + " why: " + status.reason());
        }
        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns);

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // TODO: Document why we do this.
        // TODO: do this when we can pass in our own parsed query
        //replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here, so we must register it with the active
        // runners list in ClientCursor.
        ClientCursor::registerRunner(runner.get());
        runner->setYieldPolicy(Runner::YIELD_AUTO);

        BSONObj obj;
        Runner::RunnerState state;

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // If we're sharded make sure that we don't return any data that hasn't been migrated
            // off of our shared yet.
            if (collMetadata) {
                // This information can change if we yield and as such we must make sure to re-fetch
                // it if we yield.
                KeyPattern kp(collMetadata->getKeyPattern());
                // This performs excessive BSONObj creation but that's OK for now.
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            const bool isExplain = pq.isExplain();
            if (isExplain && enoughForExplain(pq, numResults)) {
                break;
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    saveClientCursor = true;
                }
                break;
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        ClientCursor::deregisterRunner(runner.get());

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; }

        // TODO: Stage creation can set tailable depending on what's in the parsed query.  We have
        // the full parsed query available during planning...set it there.
        //
        // TODO: If we're tailable we want to save the client cursor.  Make sure we do this later.
        //if (pq.hasOption(QueryOption_CursorTailable) && pq.getNumToReturn() != 1) { ... }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data and its safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            log() << "caching runner with cursorid " << ccId << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;
        // TODO: nscanned is bogus.
        // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
Ejemplo n.º 16
0
    /**
     * Run a query with a cursor provided by the query optimizer, or FindingStartCursor.
     * @returns true if client cursor was saved, false if the query has completed.
     */
    bool queryWithQueryOptimizer( int queryOptions, const string& ns,
                                  const BSONObj &jsobj, CurOp& curop,
                                  const BSONObj &query, const BSONObj &order,
                                  const shared_ptr<ParsedQuery> &pq_shared,
                                  const ConfigVersion &shardingVersionAtStart,
                                  const bool getCachedExplainPlan,
                                  const bool inMultiStatementTxn,
                                  Message &result ) {

        const ParsedQuery &pq( *pq_shared );
        shared_ptr<Cursor> cursor;
        QueryPlanSummary queryPlan;

        const bool tailable = pq.hasOption( QueryOption_CursorTailable ) && pq.getNumToReturn() != 1;
        
        LOG(1) << "query beginning read-only transaction. tailable: " << tailable << endl;
        
        BSONObj oldPlan;
        if (getCachedExplainPlan) {
            scoped_ptr<MultiPlanScanner> mps( MultiPlanScanner::make( ns.c_str(), query, order ) );
            oldPlan = mps->cachedPlanExplainSummary();
        }
        
        cursor = getOptimizedCursor( ns.c_str(), query, order, QueryPlanSelectionPolicy::any(),
                                     pq_shared, false, &queryPlan );
        verify( cursor );

        // Tailable cursors must be marked as such before any use. This is so that
        // the implementation knows that uncommitted data cannot be returned.
        if ( tailable ) {
            cursor->setTailable();
        }

        scoped_ptr<QueryResponseBuilder> queryResponseBuilder
                ( QueryResponseBuilder::make( pq, cursor, queryPlan, oldPlan ) );
        bool saveClientCursor = false;
        int options = QueryOption_NoCursorTimeout;
        if (pq.hasOption( QueryOption_OplogReplay )) {
            options |= QueryOption_OplogReplay;
        }
        // create a client cursor that does not create a cursorID.
        // The cursor ID will be created if and only if we save
        // the client cursor further below
        ClientCursor::Holder ccPointer(
            new ClientCursor( options, cursor, ns, BSONObj(), false, false )
            );

        // for oplog cursors, we check if we are reading data that is too old and might
        // be stale.
        bool opChecked = false;
        bool slaveLocationUpdated = false;
        BSONObj last;
        bool lastBSONObjSet = false;
        for ( ; cursor->ok(); cursor->advance() ) {

            if ( pq.getMaxScan() && cursor->nscanned() > pq.getMaxScan() ) {
                break;
            }
            
            if ( !queryResponseBuilder->addMatch() ) {
                continue;
            }
            
            // Note slave's position in the oplog.
            if ( pq.hasOption( QueryOption_OplogReplay ) ) {
                BSONObj current = cursor->current();
                last = current.copy();
                lastBSONObjSet = true;
                
                // the first row returned is equal to the last element that
                // the slave has synced up to, so we might as well update
                // the slave location
                if (!slaveLocationUpdated) {
                    ccPointer->updateSlaveLocation(curop);
                    slaveLocationUpdated = true;
                }
                // check if data we are about to return may be too stale
                if (!opChecked) {
                    ccPointer->storeOpForSlave(current);
                    uassert(16785, "oplog cursor reading data that is too old", !ccPointer->lastOpForSlaveTooOld());
                    opChecked = true;
                }
            }
            
            if ( pq.isExplain() ) {
                if ( queryResponseBuilder->enoughTotalResults() ) {
                    break;
                }
            }
            else if ( queryResponseBuilder->enoughForFirstBatch() ) {
                // if only 1 requested, no cursor saved for efficiency...we assume it is findOne()
                if ( pq.wantMore() && pq.getNumToReturn() != 1 ) {
                    queryResponseBuilder->finishedFirstBatch();
                    if ( cursor->advance() ) {
                        saveClientCursor = true;
                    }
                }
                break;
            }
        }

        // If the tailing request succeeded
        if ( cursor->tailable() ) {
            saveClientCursor = true;
        }
        
        if ( ! shardingState.getVersion( ns ).isWriteCompatibleWith( shardingVersionAtStart ) ) {
            // if the version changed during the query
            // we might be missing some data
            // and its safe to send this as mongos can resend
            // at this point
            throw SendStaleConfigException( ns , "version changed during initial query", shardingVersionAtStart, shardingState.getVersion( ns ) );
        }
        
        int nReturned = queryResponseBuilder->handoff( result );

        ccPointer.reset();
        long long cursorid = 0;
        if ( saveClientCursor ) {
            // Create a new ClientCursor, with a default timeout.
            ccPointer.reset( new ClientCursor( queryOptions, cursor, ns,
                                               jsobj.getOwned(), inMultiStatementTxn ) );
            cursorid = ccPointer->cursorid();
            DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
            
            if ( !ccPointer->ok() && ccPointer->c()->tailable() ) {
                DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
            }
            
            if( queryOptions & QueryOption_Exhaust ) {
                curop.debug().exhaust = true;
            }
            
            // Set attributes for getMore.
            ccPointer->setChunkManager( queryResponseBuilder->chunkManager() );
            ccPointer->setPos( nReturned );
            ccPointer->pq = pq_shared;
            ccPointer->fields = pq.getFieldPtr();
            if (pq.hasOption( QueryOption_OplogReplay ) && lastBSONObjSet) {
                ccPointer->storeOpForSlave(last);
            }
            if (!inMultiStatementTxn) {
                // This cursor is not part of a multi-statement transaction, so
                // we pass off the current client's transaction stack to the
                // cursor so that it may be live as long as the cursor.
                cc().swapTransactionStack(ccPointer->transactions);
                verify(!cc().hasTxn());
            }
            ccPointer.release();
        }

        QueryResult *qr = (QueryResult *) result.header();
        qr->cursorId = cursorid;
        curop.debug().cursorid = ( cursorid == 0 ? -1 : qr->cursorId );
        qr->setResultFlagsToOk();
        // qr->len is updated automatically by appendData()
        curop.debug().responseLength = qr->len;
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = nReturned;

        curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = nReturned;

        return saveClientCursor;
    }
Ejemplo n.º 17
0
    QueryResult* processGetMore(const char* ns,
                                int ntoreturn,
                                long long cursorid,
                                CurOp& curop,
                                int pass,
                                bool& exhaust,
                                bool* isCursorAuthorized ) {
        exhaust = false;
        ClientCursor::Pin p(cursorid);
        ClientCursor *client_cursor = p.c();

        int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));
        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        if ( unlikely(!client_cursor) ) {
            LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // check for spoofing of the ns such that it does not match the one originally there for the cursor
            uassert(14833, "auth error", str::equals(ns, client_cursor->ns().c_str()));
            uassert(16784, "oplog cursor reading data that is too old", !client_cursor->lastOpForSlaveTooOld());

            int queryOptions = client_cursor->queryOptions();
            OpSettings settings;
            settings.setBulkFetch(true);
            settings.setQueryCursorMode(DEFAULT_LOCK_CURSOR);
            settings.setCappedAppendPK(queryOptions & QueryOption_AddHiddenPK);
            cc().setOpSettings(settings);

            // Check if the cursor is part of a multi-statement transaction. If it is
            // and this is not the right client (meaning the current transaction stack
            // does not match that in the cursor), it will uassert. If the cursor is
            // not part of a multi-statement transaction, then we need to use the stack
            // in the cursor for this scope.
            const bool cursorPartOfMultiStatementTxn = client_cursor->checkMultiStatementTxn();
            scoped_ptr<Client::WithTxnStack> wts;
            if (!cursorPartOfMultiStatementTxn) {
                // For simplicity, prevent multi-statement transactions from
                // reading cursors it didn't create.
                uassert(16813, "Cannot getMore() on a cursor not created by this multi-statement transaction",
                           !cc().hasTxn());
                wts.reset(new Client::WithTxnStack(client_cursor->transactions)); 
            }

            *isCursorAuthorized = true;

            if (pass == 0) {
                client_cursor->updateSlaveLocation( curop );
            }
            
            curop.debug().query = client_cursor->query();

            start = client_cursor->pos();
            Cursor *c = client_cursor->c();

            // This manager may be stale, but it's the state of chunking when the cursor was created.
            ShardChunkManagerPtr manager = client_cursor->getChunkManager();

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        /* when a tailable cursor hits "EOF", ok() goes false, and current() is null.  however
                           advance() can still be retries as a reactivation attempt.  when there is new data, it will
                           return true.  that's what we are doing here.
                           */
                        if ( c->advance() )
                            continue;

                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            return 0;
                        }

                        break;
                    }
                    p.release();

                    // Done with this cursor, steal transaction stack back to commit or abort it here.
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    client_cursor = 0;
                    break;
                }

                MatchDetails details;
                if ( client_cursor->fields && client_cursor->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                    // field projection specified, and contains an array operator
                    details.requestElemMatchKey();
                }

                // in some cases (clone collection) there won't be a matcher
                if ( !c->currentMatches( &details ) ) {
                }
                else if ( manager && ! manager->belongsToMe( client_cursor ) ){
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
                }
                else {
                    if( c->getsetdup(c->currPK()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        // save this so that at the end of the loop,
                        // we can update the location for write concern
                        // in replication. Note that if this cursor is not
                        // doing replication, this is pointless
                        if ( client_cursor->queryOptions() & QueryOption_OplogReplay ) {
                            client_cursor->storeOpForSlave( c->current() );
                        }
                        n++;

                        client_cursor->fillQueryResultFromObj( b, &details );

                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            client_cursor->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();
            }
            
            if ( client_cursor ) {
                client_cursor->resetIdleAge();
                exhaust = client_cursor->queryOptions() & QueryOption_Exhaust;
            } else if (!cursorPartOfMultiStatementTxn) {
                // This cursor is done and it wasn't part of a multi-statement
                // transaction. We can commit the transaction now.
                cc().commitTopTxn();
                wts->release();
            }
        }

        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }
Ejemplo n.º 18
0
    std::string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
        // Validate the namespace.
        const char *ns = q.ns;
        uassert(16332, "can't have an empty ns", ns[0]);

        const NamespaceString nsString(ns);
        uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid());

        // Set curop information.
        curop.debug().ns = ns;
        curop.debug().ntoreturn = q.ntoreturn;
        curop.debug().query = q.query;
        curop.setQuery(q.query);

        // If the query is really a command, run it.
        if (nsString.isCommand()) {
            int nToReturn = q.ntoreturn;
            uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn
                                         << ") for $cmd type ns - can only be 1 or -1",
                    nToReturn == 1 || nToReturn == -1);

            curop.markCommand();

            BufBuilder bb;
            bb.skip(sizeof(QueryResult));

            BSONObjBuilder cmdResBuf;
            if (!runCommands(ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) {
                uasserted(13530, "bad or malformed command request?");
            }

            curop.debug().iscommand = true;
            // TODO: Does this get overwritten/do we really need to set this twice?
            curop.debug().query = q.query;

            QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
            bb.decouple();
            qr->setResultFlagsToOk();
            qr->len = bb.len();
            curop.debug().responseLength = bb.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            result.setData(qr, true);
            return "";
        }

        // This is a read lock.  We require this because if we're parsing a $where, the
        // where-specific parsing code assumes we have a lock and creates execution machinery that
        // requires it.
        Client::ReadContext ctx(q.ns);
        Collection* collection = ctx.ctx().db()->getCollection( ns );

        // Parse the qm into a CanonicalQuery.
        CanonicalQuery* cq;
        Status canonStatus = CanonicalQuery::canonicalize(q, &cq);
        if (!canonStatus.isOK()) {
            uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString());
        }
        verify(cq);

        QLOG() << "Running query:\n" << cq->toString();
        LOG(2) << "Running query: " << cq->toStringShort();

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // We'll now try to get the query runner that will execute this query for us. There
        // are a few cases in which we know upfront which runner we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we get a special runner
        // that's is so (a runner) which doesn't return results, the EOFRunner.
        //
        // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner
        // that uses a specifically designed stage that skips extents faster (see details in
        // exec/oplogstart.h)
        //
        // Otherwise we go through the selection of which runner is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (collection == NULL) {
            rawRunner = new EOFRunner(cq, cq->ns());
        }
        else if (pq.hasOption(QueryOption_OplogReplay)) {
            status = getOplogStartHack(collection, cq, &rawRunner);
        }
        else {
            // Takes ownership of cq.
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            status = getRunner(cq, &rawRunner, options);
        }

        if (!status.isOK()) {
            // NOTE: Do not access cq as getRunner has deleted it.
            uasserted(17007, "Unable to execute query: " + status.reason());
        }

        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here.  The runner registers itself with the
        // active runners list in ClientCursor.
        auto_ptr<ScopedRunnerRegistration> safety(new ScopedRunnerRegistration(runner.get()));
        runner->setYieldPolicy(Runner::YIELD_AUTO);

        BSONObj obj;
        Runner::RunnerState state;
        // uint64_t numMisplacedDocs = 0;

        // set this outside loop. we will need to use this both within loop and when deciding
        // to fill in explain information
        const bool isExplain = pq.isExplain();

        // Have we retrieved info about which plan the runner will
        // use to execute the query yet?
        bool gotPlanInfo = false;
        PlanInfo* rawInfo;
        boost::scoped_ptr<PlanInfo> planInfo;

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer. This is unnecessary if explain info is requested
            if (!isExplain) {
                bb.appendBuf((void*)obj.objdata(), obj.objsize());
            }

            // Count the result.
            ++numResults;

            // In the case of the multi plan runner, we may not be able to
            // successfully retrieve plan info until after the query starts
            // to run. This is because the multi plan runner doesn't know what
            // plan it will end up using until it runs candidates and selects
            // the best.
            //
            // TODO: Do we ever want to output what the MPR is comparing?
            if (!gotPlanInfo) {
                Status infoStatus = runner->getInfo(NULL, &rawInfo);
                if (infoStatus.isOK()) {
                    gotPlanInfo = true;
                    planInfo.reset(rawInfo);
                    // planSummary is really a ThreadSafeString which copies the data from
                    // the provided pointer.
                    curop.debug().planSummary = planInfo->planSummary.c_str();
                }
            }

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (isExplain) {
                if (enoughForExplain(pq, numResults)) {
                    break;
                }
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " runner EOF=" << runner->isEOF() << endl;
                    saveClientCursor = !runner->isEOF();
                }
                break;
            }
        }

        // Try to get information about the plan which the runner
        // will use to execute the query, it we don't have it already.
        if (!gotPlanInfo) {
            Status infoStatus = runner->getInfo(NULL, &rawInfo);
            if (infoStatus.isOK()) {
                gotPlanInfo = true;
                planInfo.reset(rawInfo);
                // planSummary is really a ThreadSafeString which copies the data from
                // the provided pointer.
                curop.debug().planSummary = planInfo->planSummary.c_str();
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        safety.reset();

        // Caller expects exceptions thrown in certain cases.
        if (Runner::RUNNER_ERROR == state) {
            TypeExplain* bareExplain;
            Status res = runner->getInfo(&bareExplain, NULL);
            if (res.isOK()) {
                boost::scoped_ptr<TypeExplain> errorExplain(bareExplain);
                error() << "Runner error, stats:\n"
                        << errorExplain->stats.jsonString(Strict, true);
            }
            uasserted(17144, "Runner error: " + WorkingSetCommon::toStatusString(obj));
        }

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.hasOption(QueryOption_CursorTailable)) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of the tailable cursor is that the
            // client will keep trying to read from it. So we'll keep it around.
            Collection* collection = ctx.ctx().db()->getCollection(cq->ns());
            if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data and its safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        // Used to fill in explain and to determine if the query is slow enough to be logged.
        int elapsedMillis = curop.elapsedMillis();

        // Get explain information if:
        // 1) it is needed by an explain query;
        // 2) profiling is enabled; or
        // 3) profiling is disabled but we still need explain details to log a "slow" query.
        // Producing explain information is expensive and should be done only if we are certain
        // the information will be used.
        boost::scoped_ptr<TypeExplain> explain(NULL);
        if (isExplain ||
            ctx.ctx().db()->getProfilingLevel() > 0 ||
            elapsedMillis > serverGlobalParams.slowMS) {
            // Ask the runner to produce explain information.
            TypeExplain* bareExplain;
            Status res = runner->getInfo(&bareExplain, NULL);
            if (res.isOK()) {
                explain.reset(bareExplain);
            }
            else if (isExplain) {
                error() << "could not produce explain of query '" << pq.getFilter()
                        << "', error: " << res.reason();
                // If numResults and the data in bb don't correspond, we'll crash later when rooting
                // through the reply msg.
                BSONObj emptyObj;
                bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());
                // The explain output is actually a result.
                numResults = 1;
                // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
            }
        }

        // Fill in the missing run-time fields in explain, starting with propeties of
        // the process running the query.
        if (isExplain && NULL != explain.get()) {
            std::string server = mongoutils::str::stream()
                << getHostNameCached() << ":" << serverGlobalParams.port;
            explain->setServer(server);

            // We might have skipped some results due to chunk migration etc. so our count is
            // correct.
            explain->setN(numResults);

            // Clock the whole operation.
            explain->setMillis(elapsedMillis);

            BSONObj explainObj = explain->toBSON();
            bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

            // The explain output is actually a result.
            numResults = 1;
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(collection, runner.get(),
                                                cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            QLOG() << "caching runner with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "Not caching runner but returning " << numResults << " results.\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;

        // Set debug information for consumption by the profiler.
        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;
        if (NULL != explain.get()) {
            if (explain->isScanAndOrderSet()) {
                curop.debug().scanAndOrder = explain->getScanAndOrder();
            }
            else {
                curop.debug().scanAndOrder = false;
            }

            if (explain->isNScannedSet()) {
                curop.debug().nscanned = explain->getNScanned();
            }

            if (explain->isNScannedObjectsSet()) {
                curop.debug().nscannedObjects = explain->getNScannedObjects();
            }

            if (explain->isIDHackSet()) {
                curop.debug().idhack = explain->getIDHack();
            }

            if (!explain->stats.isEmpty()) {
                // execStats is a CachedBSONObj because it lives in the race-prone
                // curop.
                curop.debug().execStats.set(explain->stats);

                // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj.
                if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) {
                    BSONObjBuilder bob;
                    bob.append("summary", curop.debug().planSummary.toString());
                    curop.debug().execStats.set(bob.done());
                }

            }
        }

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
Ejemplo n.º 19
0
    /**
     * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
     */
    std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) {
        QLOG() << "Running query on new system: " << cq->toString();

        // This is a read lock.
        Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath);

        // Parse, canonicalize, plan, transcribe, and get a runner.
        Runner* rawRunner = NULL;

        // We use this a lot below.
        const LiteParsedQuery& pq = cq->getParsed();

        // Need to call cq->toString() now, since upon error getRunner doesn't guarantee
        // cq is in a consistent state.
        string cqStr = cq->toString();

        // We'll now try to get the query runner that will execute this query for us. There
        // are a few cases in which we know upfront which runner we should get and, therefore,
        // we shortcut the selection process here.
        //
        // (a) If the query is over a collection that doesn't exist, we get a special runner
        // that's is so (a runner) which doesn't return results, the EOFRunner.
        //
        // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner
        // that uses a specifically designed stage that skips extents faster (see details in
        // exec/oplogstart.h)
        //
        // Otherwise we go through the selection of which runner is most suited to the
        // query + run-time context at hand.
        Status status = Status::OK();
        if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) {
            rawRunner = new EOFRunner(cq, cq->ns());
        }
        else if (pq.hasOption(QueryOption_OplogReplay)) {
            status = getOplogStartHack(cq, &rawRunner);
        }
        else {
            // Takes ownership of cq.
            size_t options = QueryPlannerParams::DEFAULT;
            if (shardingState.needCollectionMetadata(pq.ns())) {
                options |= QueryPlannerParams::INCLUDE_SHARD_FILTER;
            }
            status = getRunner(cq, &rawRunner, options);
        }

        if (!status.isOK()) {
            uasserted(17007, "Couldn't get runner for query because: " + status.reason() + " query is " + cqStr);
        }

        verify(NULL != rawRunner);
        auto_ptr<Runner> runner(rawRunner);

        // We freak out later if this changes before we're done with the query.
        const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns());

        // Handle query option $maxTimeMS (not used with commands).
        curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set.
        replVerifyReadsOk(&pq);

        // If this exists, the collection is sharded.
        // If it doesn't exist, we can assume we're not sharded.
        // If we're sharded, we might encounter data that is not consistent with our sharding state.
        // We must ignore this data.
        CollectionMetadataPtr collMetadata;
        if (!shardingState.needCollectionMetadata(pq.ns())) {
            collMetadata = CollectionMetadataPtr();
        }
        else {
            collMetadata = shardingState.getCollectionMetadata(pq.ns());
        }

        // Run the query.
        // bb is used to hold query results
        // this buffer should contain either requested documents per query or
        // explain information, but not both
        BufBuilder bb(32768);
        bb.skip(sizeof(QueryResult));

        // How many results have we obtained from the runner?
        int numResults = 0;

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // Do we save the Runner in a ClientCursor for getMore calls later?
        bool saveClientCursor = false;

        // We turn on auto-yielding for the runner here.  The runner registers itself with the
        // active runners list in ClientCursor.
        ClientCursor::registerRunner(runner.get());
        runner->setYieldPolicy(Runner::YIELD_AUTO);
        auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety(
            new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

        BSONObj obj;
        Runner::RunnerState state;
        // uint64_t numMisplacedDocs = 0;

        // set this outside loop. we will need to use this both within loop and when deciding
        // to fill in explain information
        const bool isExplain = pq.isExplain();

        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer. This is unnecessary if explain info is requested
            if (!isExplain) {
                bb.appendBuf((void*)obj.objdata(), obj.objsize());
            }

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
            // of CanonicalQuery. :(
            const bool supportsGetMore = true;
            if (isExplain) {
                if (enoughForExplain(pq, numResults)) {
                    break;
                }
            }
            else if (!supportsGetMore && (enough(pq, numResults)
                                          || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
                break;
            }
            else if (enoughForFirstBatch(pq, numResults, bb.len())) {
                QLOG() << "Enough for first batch, wantMore=" << pq.wantMore()
                       << " numToReturn=" << pq.getNumToReturn()
                       << " numResults=" << numResults
                       << endl;
                // If only one result requested assume it's a findOne() and don't save the cursor.
                if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                    QLOG() << " runner EOF=" << runner->isEOF() << endl;
                    saveClientCursor = !runner->isEOF();
                }
                break;
            }
        }

        // If we cache the runner later, we want to deregister it as it receives notifications
        // anyway by virtue of being cached.
        //
        // If we don't cache the runner later, we are deleting it, so it must be deregistered.
        //
        // So, no matter what, deregister the runner.
        safety.reset();

        // Caller expects exceptions thrown in certain cases:
        // * in-memory sort using too much RAM.
        if (Runner::RUNNER_ERROR == state) {
            uasserted(17144, "Runner error, memory limit for sort probably exceeded");
        }

        // Why save a dead runner?
        if (Runner::RUNNER_DEAD == state) {
            saveClientCursor = false;
        }
        else if (pq.hasOption(QueryOption_CursorTailable)) {
            // If we're tailing a capped collection, we don't bother saving the cursor if the
            // collection is empty. Otherwise, the semantics of the tailable cursor is that the
            // client will keep trying to read from it. So we'll keep it around.
            Collection* collection = ctx.ctx().db()->getCollection(cq->ns());
            if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) {
                saveClientCursor = true;
            }
        }

        // TODO(greg): This will go away soon.
        if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
            // if the version changed during the query we might be missing some data and its safe to
            // send this as mongos can resend at this point
            throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                           shardingVersionAtStart,
                                           shardingState.getVersion(pq.ns()));
        }

        // Append explain information to query results by asking the runner to produce them.
        if (isExplain) {
            TypeExplain* bareExplain;
            Status res = runner->getExplainPlan(&bareExplain);

            if (!res.isOK()) {
                error() << "could not produce explain of query '" << pq.getFilter()
                        << "', error: " << res.reason();
                // If numResults and the data in bb don't correspond, we'll crash later when rooting
                // through the reply msg.
                BSONObj emptyObj;
                bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize());
                // The explain output is actually a result.
                numResults = 1;
                // TODO: we can fill out millis etc. here just fine even if the plan screwed up.
            }
            else {
                boost::scoped_ptr<TypeExplain> explain(bareExplain);

                // Fill in the missing run-time fields in explain, starting with propeties of
                // the process running the query.
                std::string server = mongoutils::str::stream()
                    << getHostNameCached() << ":" << serverGlobalParams.port;
                explain->setServer(server);

                // We might have skipped some results due to chunk migration etc. so our count is
                // correct.
                explain->setN(numResults);

                // Clock the whole operation.
                explain->setMillis(curop.elapsedMillis());

                BSONObj explainObj = explain->toBSON();
                bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

                // The explain output is actually a result.
                numResults = 1;
            }
        }

        long long ccId = 0;
        if (saveClientCursor) {
            // We won't use the runner until it's getMore'd.
            runner->saveState();

            // Allocate a new ClientCursor.  We don't have to worry about leaking it as it's
            // inserted into a global map by its ctor.
            ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(),
                                                cq->getParsed().getFilter());
            ccId = cc->cursorid();

            QLOG() << "caching runner with cursorid " << ccId
                   << " after returning " << numResults << " results" << endl;

            // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
            runner.release();

            // TODO document
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            // TODO document
            if (pq.hasOption(QueryOption_Exhaust)) {
                curop.debug().exhaust = true;
            }

            // Set attributes for getMore.
            cc->setCollMetadata(collMetadata);
            cc->setPos(numResults);

            // If the query had a time limit, remaining time is "rolled over" to the cursor (for
            // use by future getmore ops).
            cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
        }
        else {
            QLOG() << "not caching runner but returning " << numResults << " results\n";
        }

        // Add the results from the query into the output buffer.
        result.appendData(bb.buf(), bb.len());
        bb.decouple();

        // Fill out the output buffer's header.
        QueryResult* qr = static_cast<QueryResult*>(result.header());
        qr->cursorId = ccId;
        curop.debug().cursorid = (0 == ccId ? -1 : ccId);
        qr->setResultFlagsToOk();
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = numResults;

        curop.debug().ntoskip = pq.getSkip();
        curop.debug().nreturned = numResults;

        // curop.debug().exhaust is set above.
        return curop.debug().exhaust ? pq.ns() : "";
    }
Ejemplo n.º 20
0
    static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) {
        bool ok = true;
        MSGID responseTo = m.header()->id;

        DbMessage d(m);
        QueryMessage q(d);
        auto_ptr< Message > resp( new Message() );

        CurOp& op = *(c.curop());

        shared_ptr<AssertionException> ex;

        try {
            dbresponse.exhaust = runQuery(m, q, op, *resp);
            assert( !resp->empty() );
        }
        catch ( SendStaleConfigException& e ){
            ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg ) );
            ok = false;
        }
        catch ( AssertionException& e ) {
            ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
            ok = false;
        }

        if( ex ){

            op.debug().exceptionInfo = ex->getInfo();
            LOGWITHRATELIMIT {
                log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" <<
                (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
                if( q.ntoskip || q.ntoreturn )
                    log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
            }

            SendStaleConfigException* scex = NULL;
            if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() );

            BSONObjBuilder err;
            ex->getInfo().append( err );
            if( scex ) err.append( "ns", scex->getns() );
            BSONObj errObj = err.done();

            log() << errObj << endl;

            BufBuilder b;
            b.skip(sizeof(QueryResult));
            b.appendBuf((void*) errObj.objdata(), errObj.objsize());

            // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
            QueryResult * msgdata = (QueryResult *) b.buf();
            b.decouple();
            QueryResult *qr = msgdata;
            qr->_resultFlags() = ResultFlag_ErrSet;
            if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            resp.reset( new Message() );
            resp->setData( msgdata, true );

        }

        op.debug().responseLength = resp->header()->dataLen();

        dbresponse.response = resp.release();
        dbresponse.responseTo = responseTo;

        return ok;
    }
Ejemplo n.º 21
0
    bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) {
        DEV verify( shardingState.enabled() );

        int op = m.operation();
        if ( op < 2000
                || op >= 3000
                || op == dbGetMore  // cursors are weird
           )
            return false;

        DbMessage d(m);
        const char *ns = d.getns();
        string errmsg;
        // We don't care about the version here, since we're returning it later in the writeback
        ChunkVersion received, wanted;
        if ( shardVersionOk( ns , errmsg, received, wanted ) ) {
            return false;
        }

        bool getsAResponse = doesOpGetAResponse( op );

        LOG(1) << "connection sharding metadata does not match for collection " << ns
               << ", will retry (wanted : " << wanted << ", received : " << received << ")"
               << ( getsAResponse ? "" : " (queuing writeback)" ) << endl;

        if( getsAResponse ){
            verify( dbresponse );
            BufBuilder b( 32768 );
            b.skip( sizeof( QueryResult ) );
            {
                BSONObjBuilder bob;

                bob.append( "$err", errmsg );
                bob.append( "ns", ns );
                wanted.addToBSON( bob, "vWanted" );
                received.addToBSON( bob, "vReceived" );

                BSONObj obj = bob.obj();

                b.appendBuf( obj.objdata() , obj.objsize() );
            }

            QueryResult *qr = (QueryResult*)b.buf();
            qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation( opReply );
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            b.decouple();

            Message * resp = new Message();
            resp->setData( qr , true );

            dbresponse->response = resp;
            dbresponse->responseTo = m.header()->id;
            return true;
        }

        uassert(9517, "cannot queue a writeback operation to the writeback queue",
                (d.reservedField() & Reserved_FromWriteback) == 0);

        const OID& clientID = ShardedConnectionInfo::get(false)->getID();
        massert( 10422 ,  "write with bad shard config and no server id!" , clientID.isSet() );

        // We need to check this here, since otherwise we'll get errors wrapping the writeback -
        // not just here, but also when returning as a command result.
        // We choose 1/2 the overhead of the internal maximum so that we can still handle ops of
        // 16MB exactly.
        massert( 16437, "data size of operation is too large to queue for writeback",
                 m.dataSize() < BSONObjMaxInternalSize - (8 * 1024));

        LOG(1) << "writeback queued for " << m.toString() << endl;

        BSONObjBuilder b;
        b.appendBool( "writeBack" , true );
        b.append( "ns" , ns );
        b.append( "connectionId" , cc().getConnectionId() );
        b.append( "instanceIdent" , prettyHostName() );
        wanted.addToBSON( b );
        received.addToBSON( b, "yourVersion" );

        b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) );
        LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;
        
        // we pass the builder to queueWriteBack so that it can select the writebackId
        // this is important so that the id is guaranteed to be ascending 
        // that is important since mongos assumes if its seen a greater writeback
        // that all former have been processed
        OID writebackID = writeBackManager.queueWriteBack( clientID.str() , b );

        lastError.getSafe()->writeback( writebackID );

        return true;
    }
Ejemplo n.º 22
0
    QueryResult* getMore(const char *ns, int ntoreturn, long long cursorid , stringstream& ss) {
        ClientCursor *cc = ClientCursor::find(cursorid);
        
        int bufSize = 512;
        if ( cc ){
            bufSize += sizeof( QueryResult );
            bufSize += ( ntoreturn ? 4 : 1 ) * 1024 * 1024;
        }
        BufBuilder b( bufSize );

        b.skip(sizeof(QueryResult));

        int resultFlags = 0;
        int start = 0;
        int n = 0;

        if ( !cc ) {
            log() << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = QueryResult::ResultFlag_CursorNotFound;
        }
        else {
            ss << " query: " << cc->query << " ";
            start = cc->pos;
            Cursor *c = cc->c.get();
            c->checkLocation();
            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        if ( c->advance() ) {
                            continue;
                        }
                        break;
                    }
                    bool ok = ClientCursor::erase(cursorid);
                    assert(ok);
                    cursorid = 0;
                    cc = 0;
                    break;
                }
                if ( !cc->matcher->matches(c->currKey(), c->currLoc() ) ) {
                }
                else {
                    //out() << "matches " << c->currLoc().toString() << '\n';
                    if( c->getsetdup(c->currLoc()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        BSONObj js = c->current();
                        fillQueryResultFromObj(b, cc->filter.get(), js);
                        n++;
                        if ( (ntoreturn>0 && (n >= ntoreturn || b.len() > MaxBytesToReturnToClientAtOnce)) ||
                             (ntoreturn==0 && b.len()>1*1024*1024) ) {
                            c->advance();
                            cc->pos += n;
                            //cc->updateLocation();
                            break;
                        }
                    }
                }
                c->advance();
            }
            if ( cc ) {
                cc->updateLocation();
                cc->mayUpgradeStorage();
            }
        }

        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }
Ejemplo n.º 23
0
    bool handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) {
        if ( ! shardingState.enabled() )
            return false;

        int op = m.operation();
        if ( op < 2000
                || op >= 3000
                || op == dbGetMore  // cursors are weird
           )
            return false;

        DbMessage d(m);
        const char *ns = d.getns();
        string errmsg;
        if ( shardVersionOk( ns , opIsWrite( op ) , errmsg ) ) {
            return false;
        }

        log(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl;

        if ( doesOpGetAResponse( op ) ) {
            assert( dbresponse );
            BufBuilder b( 32768 );
            b.skip( sizeof( QueryResult ) );
            {
                BSONObj obj = BSON( "$err" << errmsg );
                b.appendBuf( obj.objdata() , obj.objsize() );
            }

            QueryResult *qr = (QueryResult*)b.buf();
            qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation( opReply );
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            b.decouple();

            Message * resp = new Message();
            resp->setData( qr , true );

            dbresponse->response = resp;
            dbresponse->responseTo = m.header()->id;
            return true;
        }

        OID writebackID;
        writebackID.init();
        lastError.getSafe()->writeback( writebackID );

        const OID& clientID = ShardedConnectionInfo::get(false)->getID();
        massert( 10422 ,  "write with bad shard config and no server id!" , clientID.isSet() );

        log(1) << "got write with an old config - writing back ns: " << ns << endl;
        if ( logLevel ) log(1) << debugString( m ) << endl;

        BSONObjBuilder b;
        b.appendBool( "writeBack" , true );
        b.append( "ns" , ns );
        b.append( "id" , writebackID );
        b.append( "connectionId" , cc().getConnectionId() );
        b.appendTimestamp( "version" , shardingState.getVersion( ns ) );
        b.appendTimestamp( "yourVersion" , ShardedConnectionInfo::get( true )->getVersion( ns ) );
        b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) );
        log(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;
        writeBackManager.queueWriteBack( clientID.str() , b.obj() );

        return true;
    }
Ejemplo n.º 24
0
    /**
     * Run a query with a cursor provided by the query optimizer, or FindingStartCursor.
     * @yields the db lock.
     */
    const char *queryWithQueryOptimizer( Message &m, int queryOptions, const char *ns,
                                        const BSONObj &jsobj, CurOp& curop,
                                        const BSONObj &query, const BSONObj &order,
                                        const shared_ptr<ParsedQuery> &pq_shared,
                                        const BSONObj &oldPlan,
                                        const ConfigVersion &shardingVersionAtStart,
                                        Message &result ) {

        const ParsedQuery &pq( *pq_shared );
        shared_ptr<Cursor> cursor;
        QueryPlanSummary queryPlan;
        
        if ( pq.hasOption( QueryOption_OplogReplay ) ) {
            cursor = FindingStartCursor::getCursor( ns, query, order );
        }
        else {
            cursor =
            NamespaceDetailsTransient::getCursor( ns, query, order, QueryPlanSelectionPolicy::any(),
                                                 0, pq_shared, &queryPlan );
        }
        verify( cursor );
        
        QueryResponseBuilder queryResponseBuilder( pq, cursor, queryPlan, oldPlan );
        bool saveClientCursor = false;
        const char *exhaust = 0;
        OpTime slaveReadTill;
        ClientCursor::CleanupPointer ccPointer;
        ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) );
        
        for( ; cursor->ok(); cursor->advance() ) {

            bool yielded = false;
            if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered, &yielded ) ||
                !cursor->ok() ) {
                cursor.reset();
                queryResponseBuilder.noteYield();
                // !!! TODO The queryResponseBuilder still holds cursor.  Currently it will not do
                // anything unsafe with the cursor in handoff(), but this is very fragile.
                //
                // We don't fail the query since we're fine with returning partial data if the
                // collection was dropped.
                // NOTE see SERVER-2454.
                // TODO This is wrong.  The cursor could be gone if the closeAllDatabases command
                // just ran.
                break;
            }

            if ( yielded ) {
                queryResponseBuilder.noteYield();
            }
            
            if ( pq.getMaxScan() && cursor->nscanned() > pq.getMaxScan() ) {
                break;
            }
            
            if ( !queryResponseBuilder.addMatch() ) {
                continue;
            }
            
            // Note slave's position in the oplog.
            if ( pq.hasOption( QueryOption_OplogReplay ) ) {
                BSONObj current = cursor->current();
                BSONElement e = current["ts"];
                if ( e.type() == Date || e.type() == Timestamp ) {
                    slaveReadTill = e._opTime();
                }
            }
            
            if ( !cursor->supportGetMore() || pq.isExplain() ) {
                if ( queryResponseBuilder.enoughTotalResults() ) {
                    break;
                }
            }
            else if ( queryResponseBuilder.enoughForFirstBatch() ) {
                // if only 1 requested, no cursor saved for efficiency...we assume it is findOne()
                if ( pq.wantMore() && pq.getNumToReturn() != 1 ) {
                    queryResponseBuilder.finishedFirstBatch();
                    if ( cursor->advance() ) {
                        saveClientCursor = true;
                    }
                }
                break;
            }
        }
        
        if ( cursor ) {
            if ( pq.hasOption( QueryOption_CursorTailable ) && pq.getNumToReturn() != 1 ) {
                cursor->setTailable();
            }
            
            // If the tailing request succeeded.
            if ( cursor->tailable() ) {
                saveClientCursor = true;
            }
        }
        
        if ( shardingState.getVersion( ns ) != shardingVersionAtStart ) {
            // if the version changed during the query
            // we might be missing some data
            // and its safe to send this as mongos can resend
            // at this point
            throw SendStaleConfigException( ns , "version changed during initial query", shardingVersionAtStart, shardingState.getVersion( ns ) );
        }

        int nReturned = queryResponseBuilder.handoff( result );

        ccPointer.reset();
        long long cursorid = 0;
        if ( saveClientCursor ) {
            // Create a new ClientCursor, with a default timeout.
            ccPointer.reset( new ClientCursor( queryOptions, cursor, ns,
                                              jsobj.getOwned() ) );
            cursorid = ccPointer->cursorid();
            DEV tlog(2) << "query has more, cursorid: " << cursorid << endl;
            if ( cursor->supportYields() ) {
                ClientCursor::YieldData data;
                ccPointer->prepareToYield( data );
            }
            else {
                ccPointer->c()->noteLocation();
            }
            
            // !!! Save the original message buffer, so it can be referenced in getMore.
            ccPointer->originalMessage = m;

            // Save slave's position in the oplog.
            if ( pq.hasOption( QueryOption_OplogReplay ) && !slaveReadTill.isNull() ) {
                ccPointer->slaveReadTill( slaveReadTill );
            }
            
            if ( !ccPointer->ok() && ccPointer->c()->tailable() ) {
                DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl;
            }
            
            if( queryOptions & QueryOption_Exhaust ) {
                exhaust = ns;
                curop.debug().exhaust = true;
            }
            
            // Set attributes for getMore.
            ccPointer->setChunkManager( queryResponseBuilder.chunkManager() );
            ccPointer->setPos( nReturned );
            ccPointer->pq = pq_shared;
            ccPointer->fields = pq.getFieldPtr();
            ccPointer.release();
        }
        
        QueryResult *qr = (QueryResult *) result.header();
        qr->cursorId = cursorid;
        curop.debug().cursorid = ( cursorid == 0 ? -1 : qr->cursorId );
        qr->setResultFlagsToOk();
        // qr->len is updated automatically by appendData()
        curop.debug().responseLength = qr->len;
        qr->setOperation(opReply);
        qr->startingFrom = 0;
        qr->nReturned = nReturned;
        
        int duration = curop.elapsedMillis();
        bool dbprofile = curop.shouldDBProfile( duration );
        if ( dbprofile || duration >= cmdLine.slowMS ) {
            curop.debug().nscanned = (int)( cursor ? cursor->nscanned() : 0 );
            curop.debug().ntoskip = pq.getSkip();
        }
        curop.debug().nreturned = nReturned;

        return exhaust;
    }