Ejemplo n.º 1
0
    /**
     * Runs the query carried by message m and stores the reply in dbresponse.
     *
     * An AssertionException raised while running the query is logged and
     * converted into a single-document error reply flagged ResultFlag_ErrSet
     * (plus ResultFlag_ShardConfigStale when the exception code is
     * StaleConfigInContextCode).
     *
     * @param c           client whose current operation receives debug output
     * @param dbresponse  receives the reply message, the id it responds to and
     *                    the exhaust value returned by runQuery
     * @param m           the incoming query message
     * @return true when runQuery completed, false when an AssertionException
     *         was turned into an error reply
     */
    static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ){
        const MSGID requestId = m.header()->id;

        DbMessage dbMsg(m);
        QueryMessage query(dbMsg);
        auto_ptr< Message > reply( new Message() );

        CurOp& curOp = *(c.curop());

        bool succeeded = true;
        try {
            dbresponse.exhaust = runQuery(m, query, curOp, *reply);
            assert( !reply->empty() );
        }
        catch ( AssertionException& e ) {
            succeeded = false;
            curOp.debug().str << " exception ";
            LOGSOME {
                log() << "assertion " << e.toString() << " ns:" << query.ns << " query:" <<
                    (query.query.valid() ? query.query.toString() : "query object is corrupt") << endl;
                if( query.ntoskip || query.ntoreturn )
                    log() << " ntoskip:" << query.ntoskip << " ntoreturn:" << query.ntoreturn << endl;
            }

            // Serialise the exception info into the one document the reply carries.
            BSONObjBuilder errBuilder;
            e.getInfo().append( errBuilder );
            BSONObj errObj = errBuilder.done();

            // Reply layout: QueryResult header followed by the error document.
            BufBuilder raw;
            raw.skip(sizeof(QueryResult));
            raw.appendBuf((void*) errObj.objdata(), errObj.objsize());

            // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
            QueryResult* const errReply = (QueryResult *) raw.buf();
            raw.decouple();

            int flags = ResultFlag_ErrSet;
            if ( e.getCode() == StaleConfigInContextCode )
                flags |= ResultFlag_ShardConfigStale;

            errReply->_resultFlags() = flags;
            errReply->len = raw.len();
            errReply->setOperation(opReply);
            errReply->cursorId = 0;
            errReply->startingFrom = 0;
            errReply->nReturned = 1;

            // Discard whatever runQuery left behind and send the error instead.
            reply.reset( new Message() );
            reply->setData( errReply, true );
        }

        if ( curOp.shouldDBProfile( 0 ) ){
            curOp.debug().str << " bytes:" << reply->header()->dataLen();
        }

        dbresponse.response = reply.release();
        dbresponse.responseTo = requestId;

        return succeeded;
    }
Ejemplo n.º 2
0
    /**
     * Intercepts a message whose shard version check fails.
     *
     * Nothing is done (false is returned) when no shard config server is set,
     * when the operation code lies outside [2000, 3000), or when
     * shardVersionOk() accepts the namespace.  Otherwise an operation that
     * gets a response is answered with a "$err" reply flagged as a stale shard
     * config, and any other operation is pushed onto the client's write-back
     * queue.
     *
     * @param m           the incoming message
     * @param dbresponse  receives the error reply for operations that get one
     * @return true when the message was handled here, false when the caller
     *         should process it normally
     */
    bool handlePossibleShardedMessage( Message &m, DbResponse &dbresponse ){
        if ( shardConfigServer.empty() )
            return false;

        const int opCode = m.data->operation();
        const bool inCheckedRange = ( opCode >= 2000 && opCode < 3000 );
        if ( ! inCheckedRange )
            return false;

        const char *ns = m.data->_data + 4;
        string errmsg;
        if ( shardVersionOk( ns , errmsg ) )
            return false;

        log() << "shardVersionOk failed  ns:" << ns << " " << errmsg << endl;

        if ( ! doesOpGetAResponse( opCode ) ){
            // No reply channel for this operation: queue the raw message for write-back.
            OID * clientID = clientServerIds.get();
            massert( 10422 ,  "write with bad shard config and no server id!" , clientID );

            log() << "got write with an old config - writing back" << endl;

            BSONObjBuilder writeBack;
            writeBack.appendBool( "writeBack" , true );
            writeBack.append( "ns" , ns );
            writeBack.appendBinData( "msg" , m.data->len , bdtCustom , (char*)(m.data) );
            log() << "writing back msg with len: " << m.data->len << " op: " << m.data->_operation << endl;
            clientQueues[clientID->str()]->push( writeBack.obj() );

            return true;
        }

        // Build a one-document error reply: QueryResult header followed by { $err: errmsg }.
        BufBuilder raw( 32768 );
        raw.skip( sizeof( QueryResult ) );
        {
            BSONObj errObj = BSON( "$err" << errmsg );
            raw.append( errObj.objdata() , errObj.objsize() );
        }

        QueryResult *errReply = (QueryResult*)raw.buf();
        errReply->_resultFlags() = QueryResult::ResultFlag_ErrSet | QueryResult::ResultFlag_ShardConfigStale;
        errReply->len = raw.len();
        errReply->setOperation( opReply );
        errReply->cursorId = 0;
        errReply->startingFrom = 0;
        errReply->nReturned = 1;
        raw.decouple();

        Message * replyMsg = new Message();
        replyMsg->setData( errReply , true );

        dbresponse.response = replyMsg;
        dbresponse.responseTo = m.data->id;
        return true;
    }
Ejemplo n.º 3
0
 void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) {
     BufBuilder b;
     b.skip(sizeof(QueryResult));
     b.appendBuf((void*) obj.objdata(), obj.objsize());
     QueryResult* msgdata = (QueryResult *) b.buf();
     b.decouple();
     QueryResult *qr = msgdata;
     qr->_resultFlags() = queryResultFlags;
     qr->len = b.len();
     qr->setOperation(opReply);
     qr->cursorId = 0;
     qr->startingFrom = 0;
     qr->nReturned = 1;
     Message *resp = new Message();
     resp->setData(msgdata, true); // transport will free
     dbresponse.response = resp;
     dbresponse.responseTo = m.header()->id;
 }
Ejemplo n.º 4
0
    void replyToQuery( int queryResultFlags, Message& response, const BSONObj& resultObj ) {
        BufBuilder bufBuilder;
        bufBuilder.skip( sizeof( QueryResult ));
        bufBuilder.appendBuf( reinterpret_cast< void *>(
                const_cast< char* >( resultObj.objdata() )), resultObj.objsize() );

        QueryResult* queryResult = reinterpret_cast< QueryResult* >( bufBuilder.buf() );
        bufBuilder.decouple();

        queryResult->_resultFlags() = queryResultFlags;
        queryResult->len = bufBuilder.len();
        queryResult->setOperation( opReply );
        queryResult->cursorId = 0;
        queryResult->startingFrom = 0;
        queryResult->nReturned = 1;

        response.setData( queryResult, true ); // transport will free
    }
Ejemplo n.º 5
0
 void replyToQuery(int queryResultFlags,
                   AbstractMessagingPort* p, Message& requestMsg,
                   void *data, int size,
                   int nReturned, int startingFrom,
                   long long cursorId 
                   ) {
     BufBuilder b(32768);
     b.skip(sizeof(QueryResult));
     b.appendBuf(data, size);
     QueryResult *qr = (QueryResult *) b.buf();
     qr->_resultFlags() = queryResultFlags;
     qr->len = b.len();
     qr->setOperation(opReply);
     qr->cursorId = cursorId;
     qr->startingFrom = startingFrom;
     qr->nReturned = nReturned;
     b.decouple();
     Message resp(qr, true);
     p->reply(requestMsg, resp, requestMsg.header()->id);
 }
Ejemplo n.º 6
0
    /**
     * Intercepts a message whose shard version check fails.
     *
     * Nothing is done (false is returned) when sharding is not enabled, when
     * the operation code lies outside [2000, 3000) or is dbGetMore, or when
     * shardVersionOk() accepts the namespace.  Otherwise an operation that
     * gets a response is answered with a "$err" reply flagged as a stale shard
     * config, and any other operation is queued as a write-back.
     *
     * @param m           the incoming message
     * @param dbresponse  receives the error reply; asserted non-null on the
     *                    path that builds a reply
     * @return true when the message was handled here, false when the caller
     *         should process it normally
     */
    bool handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) {
        if ( ! shardingState.enabled() )
            return false;

        const int opCode = m.operation();
        // cursors are weird, so dbGetMore is never checked here
        const bool versionChecked = ( opCode >= 2000 && opCode < 3000 && opCode != dbGetMore );
        if ( ! versionChecked )
            return false;

        DbMessage dbMsg(m);
        const char *ns = dbMsg.getns();
        string errmsg;
        if ( shardVersionOk( ns , opIsWrite( opCode ) , errmsg ) )
            return false;

        log(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(opCode) << ") " << errmsg << endl;

        if ( doesOpGetAResponse( opCode ) ) {
            assert( dbresponse );

            // Build a one-document error reply: QueryResult header followed by { $err: errmsg }.
            BufBuilder raw( 32768 );
            raw.skip( sizeof( QueryResult ) );
            {
                BSONObj errObj = BSON( "$err" << errmsg );
                raw.appendBuf( errObj.objdata() , errObj.objsize() );
            }

            QueryResult *errReply = (QueryResult*)raw.buf();
            errReply->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale;
            errReply->len = raw.len();
            errReply->setOperation( opReply );
            errReply->cursorId = 0;
            errReply->startingFrom = 0;
            errReply->nReturned = 1;
            raw.decouple();

            Message * replyMsg = new Message();
            replyMsg->setData( errReply , true );

            dbresponse->response = replyMsg;
            dbresponse->responseTo = m.header()->id;
            return true;
        }

        // No reply channel for this operation: record a write-back id and queue the raw message.
        OID writebackID;
        writebackID.init();
        lastError.getSafe()->writeback( writebackID );

        const OID& clientID = ShardedConnectionInfo::get(false)->getID();
        massert( 10422 ,  "write with bad shard config and no server id!" , clientID.isSet() );

        log(1) << "got write with an old config - writing back ns: " << ns << endl;
        if ( logLevel ) log(1) << debugString( m ) << endl;

        BSONObjBuilder writeBack;
        writeBack.appendBool( "writeBack" , true );
        writeBack.append( "ns" , ns );
        writeBack.append( "id" , writebackID );
        writeBack.append( "connectionId" , cc().getConnectionId() );
        writeBack.appendTimestamp( "version" , shardingState.getVersion( ns ) );
        writeBack.appendTimestamp( "yourVersion" , ShardedConnectionInfo::get( true )->getVersion( ns ) );
        writeBack.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) );
        log(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;
        writeBackManager.queueWriteBack( clientID.str() , writeBack.obj() );

        return true;
    }
Ejemplo n.º 7
0
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     *
     * Pulls the next batch of results from the Runner attached to the ClientCursor identified
     * by 'cursorid' and packs them, behind a QueryResult header, into a newly built buffer.
     *
     * @param ns  namespace named by the request; a read context is opened on it and it must
     *            equal the namespace recorded on the cursor (uassert 17011 otherwise).
     * @param ntoreturn  result-count limit for this batch; 0 disables the count limit, leaving
     *                   only the MaxBytesToReturnToClientAtOnce size limit.
     * @param cursorid  id of the cursor to continue.
     * @param curop  current operation; receives the cursor's leftover max-time budget and
     *               hands the remainder back when the cursor is kept.
     * @param pass  attempt counter supplied by the caller: 0 triggers updateSlaveLocation(),
     *              and values below 1000 allow the "no data yet" early return for tailable
     *              awaitData cursors.
     * @param exhaust  out: reset to false on entry; set from QueryOption_Exhaust when the
     *                 cursor is kept.
     * @param isCursorAuthorized  out: set to true once the namespace check passes.  It is
     *                            dereferenced without a null check.
     * @return the reply buffer, decoupled from its BufBuilder (presumably owned by the caller
     *         from then on -- confirm against callers), or 0 when a tailable awaitData cursor
     *         produced nothing and pass < 1000.
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        // Reserve room for the reply header; it is filled in at the end of the function.
        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
        Client::ReadContext ctx(ns);

        QLOG() << "running getMore in new system, cursorid " << cursorid << endl;

        // This checks to make sure the operation is allowed on a replicated node.  Since we are not
        // passing in a query object (necessary to check SlaveOK query option), the only state where
        // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
        // reads are not okay.
        replVerifyReadsOk();

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            // Unknown cursor: reply with cursor id 0, no results and the CursorNotFound flag.
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Check for spoofing of the ns such that it does not match the one originally
            // there for the cursor.
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            // NOTE(review): collMetadata is not referenced again in this function.
            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            Runner* runner = cc->getRunner();
            const int queryOptions = cc->queryOptions();

            // Get results out of the runner.
            runner->restoreState();

            BSONObj obj;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                // Stop once the batch is full, by count (when ntoreturn is non-zero) or by
                // size.  'state' is still RUNNER_ADVANCED when we leave this way.
                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            if (Runner::RUNNER_EOF == state && 0 == numResults
                && (queryOptions & QueryOption_CursorTailable)
                && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
                // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
                // data some # of times first.
                // NOTE(review): this path returns after restoreState() without a matching
                // saveState() -- confirm that is intended.
                return 0;
            }

            bool saveClientCursor = false;

            if (Runner::RUNNER_DEAD == state || Runner::RUNNER_ERROR == state) {
                // If we're dead there's no way to get more results.
                saveClientCursor = false;
                // In the old system tailable capped cursors would be killed off at the
                // cursorid level.  If a tailable capped cursor is nuked the cursorid
                // would vanish.
                //
                // In the new system they die and are cleaned up later (or time out).
                // So this is where we get to remove the cursorid.
                if (0 == numResults) {
                    resultFlags = ResultFlag_CursorNotFound;
                }
            }
            else if (Runner::RUNNER_EOF == state) {
                // EOF is also end of the line unless it's tailable.
                saveClientCursor = queryOptions & QueryOption_CursorTailable;
            }
            else {
                // Only a full batch (the break above) leaves the loop in the ADVANCED state.
                verify(Runner::RUNNER_ADVANCED == state);
                saveClientCursor = true;
            }

            if (!saveClientCursor) {
                ccPin.deleteUnderlying();
                // cc is now invalid, as is the runner
                cursorid = 0;
                cc = NULL;
                QLOG() << "getMore NOT saving client cursor, ended w/state "
                       << Runner::statestr(state)
                       << endl;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                runner->saveState();
                QLOG() << "getMore saving client cursor ended w/state "
                       << Runner::statestr(state)
                       << endl;

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        // Fill in the header reserved at the front of the buffer, then detach the buffer from
        // the builder before returning it.
        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        QLOG() << "getMore returned " << numResults << " results\n";
        return qr;
    }
Ejemplo n.º 8
0
    /**
     * Services a getMore request for 'cursorid'.
     *
     * A cursor that has a Runner attached is handed to newGetMore().  Otherwise the Cursor held
     * by the ClientCursor is iterated here and the matching documents are packed, behind a
     * QueryResult header, into a newly built buffer.
     *
     * @param ns  namespace named by the request; a read context is opened on it and it must
     *            equal the namespace recorded on the cursor (uassert 14833 otherwise).
     * @param ntoreturn  result-count limit for this batch; 0 disables the count limit, leaving
     *                   only the MaxBytesToReturnToClientAtOnce size limit.
     * @param cursorid  id of the cursor to continue.
     * @param curop  current operation; receives the cursor's query and leftover max-time
     *               budget, and hands the remaining time back when the cursor is kept.
     * @param pass  attempt counter supplied by the caller: 0 triggers updateSlaveLocation(),
     *              and values below 1000 allow the "no data yet" early return for tailable
     *              awaitData cursors.
     * @param exhaust  out: reset to false; set from QueryOption_Exhaust when the cursor is
     *                 kept.
     * @param isCursorAuthorized  out: set to true once the namespace check passes.  It is
     *                            dereferenced without a null check.
     * @return the reply buffer, decoupled from its BufBuilder (presumably owned by the caller
     *         from then on -- confirm against callers), or 0 when a tailable awaitData cursor
     *         produced nothing and pass < 1000.
     */
    QueryResult* processGetMore(const char* ns,
                                int ntoreturn,
                                long long cursorid,
                                CurOp& curop,
                                int pass,
                                bool& exhaust,
                                bool* isCursorAuthorized ) {

        bool hasRunner = false;

        // Scoped to kill the pin after seeing if the runner's there.
        {
            // See if there's a runner.  We do this until agg. is behind a Runner instead of a CC.
            ClientCursorPin p(cursorid);
            ClientCursor *cc = p.c();
            if (NULL != cc && NULL != cc->getRunner()) {
                hasRunner = true;
            }
        }

        if (hasRunner) {
            return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust,
                              isCursorAuthorized);
        }

        exhaust = false;

        int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

        // Reserve room for the reply header; it is filled in at the end of the function.
        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));
        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        // Held through a scoped_ptr so the read context can be dropped early (ctx.reset()
        // below) for cursors that do not require the lock.
        scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns));
        // call this readlocked so state can't change
        replVerifyReadsOk();

        ClientCursorPin p(cursorid);
        ClientCursor *cc = p.c();

        if ( unlikely(!cc) ) {
            // Unknown cursor: reply with cursor id 0, no results and the CursorNotFound flag.
            LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Some internal users create a ClientCursor with a Runner.  Don't crash if this
            // happens.  Instead, hand them off to the new framework.
            if (NULL != cc->getRunner()) {
                p.release();
                return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, isCursorAuthorized);
            }

            // check for spoofing of the ns such that it does not match the one originally there for the cursor
            uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

            *isCursorAuthorized = true;

            // This must be done after auth check to ensure proper cleanup.
            uassert(16951, "failing getmore due to set failpoint",
                    !MONGO_FAIL_POINT(getMoreError));

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros( cc->getLeftoverMaxTimeMicros() );
            killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

            if ( pass == 0 )
                cc->updateSlaveLocation( curop );

            int queryOptions = cc->queryOptions();

            curop.debug().query = cc->query();
            curop.setQuery( cc->query() );

            // Position of the first result of this batch; reported as startingFrom.
            start = cc->pos();
            Cursor *c = cc->c();

            if (!c->requiresLock()) {
                // make sure it won't be destroyed under us
                fassert(16952, !c->shouldDestroyOnNSDeletion());
                fassert(16953, !c->supportYields());
                ctx.reset(); // unlocks
            }

            c->recoverFromYield();
            // Location of the last document added to the batch; passed to storeOpForSlave().
            DiskLoc last;

            // This metadata may be stale, but it's the state of chunking when the cursor was
            // created.
            CollectionMetadataPtr metadata = cc->getCollMetadata();
            KeyPattern keyPattern( metadata ? metadata->getKeyPattern() : BSONObj() );

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        // when a tailable cursor hits "EOF", ok() goes false, and current() is
                        // null.  however advance() can still be retried as a reactivation attempt.
                        // when there is new data, it will return true.  that's what we are doing
                        // here.
                        if ( c->advance() )
                            continue;

                        // Nothing found yet on an awaitData cursor: return 0 instead of a
                        // reply while pass is still below 1000.
                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            return 0;
                        }

                        break;
                    }
                    // Non-tailable cursor is exhausted: drop the pin, erase the cursor and
                    // report cursor id 0.
                    p.release();
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    cc = 0;
                    break;
                }

                MatchDetails details;
                if ( cc->fields && cc->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                    // field projection specified, and contains an array operator
                    details.requestElemMatchKey();
                }

                // in some cases (clone collection) there won't be a matcher
                if ( !c->currentMatches( &details ) ) {
                    // Current document does not match: nothing to add, fall through to advance().
                }
                else if ( metadata && !metadata->keyBelongsToMe( extractKey(c, keyPattern ) ) ) {
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current()
                               << endl;
                }
                else {
                    if( c->getsetdup(c->currLoc()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        last = c->currLoc();
                        n++;

                        // Fill out the fields requested by the query.
                        const Projection::KeyOnly *keyFieldsOnly = c->keyFieldsOnly();
                        if ( keyFieldsOnly ) {
                            fillQueryResultFromObj( b, 0, keyFieldsOnly->hydrate(
                            c->currKey() ), &details );
                        }
                        else {
                            DiskLoc loc = c->currLoc();
                            fillQueryResultFromObj( b, cc->fields.get(), c->current(), &details,
                                    ( ( cc->pq.get() && cc->pq->showDiskLoc() ) ? &loc : 0 ) );
                        }

                        // Batch is full, by count (when ntoreturn is non-zero) or by size:
                        // step past the document just returned and record the new position.
                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            cc->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();

                // A false return from yieldSometimes() ends the batch and drops the cursor.
                // NOTE(review): unlike the exhausted-cursor path above, the pin is not released
                // before erase() here and erase()'s result is not checked -- confirm intended.
                if ( ! cc->yieldSometimes( ( c->ok() && c->keyFieldsOnly() ) ?
                                          ClientCursor::DontNeed : ClientCursor::WillNeed ) ) {
                    ClientCursor::erase(cursorid);
                    cursorid = 0;
                    cc = 0;
                    break;
                }
            }

            if ( cc ) {
                // The cursor is kept for a later getMore: record where it stands.
                if ( c->supportYields() ) {
                    ClientCursor::YieldData data;
                    verify( cc->prepareToYield( data ) );
                }
                else {
                    cc->c()->noteLocation();
                }
                cc->storeOpForSlave( last );
                exhaust = cc->queryOptions() & QueryOption_Exhaust;

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        // Fill in the header reserved at the front of the buffer, then detach the buffer from
        // the builder before returning it.
        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }
Ejemplo n.º 9
0
    /**
     * Runs the query carried by message m and stores the reply in dbresponse.
     *
     * For non-command namespaces the query is authorised first (uassert 16550
     * on failure).  A SendStaleConfigException or any other AssertionException
     * raised inside the try block is copied, logged and converted into a
     * single-document error reply flagged ResultFlag_ErrSet.  When the
     * exception code is SendStaleConfigCode the reply is also flagged
     * ResultFlag_ShardConfigStale; when the exception really was a
     * SendStaleConfigException the error document additionally carries "ns",
     * "vReceived" and "vWanted".
     *
     * @param c           client whose current operation receives debug info
     * @param dbresponse  receives the reply message, the id it responds to and
     *                    the exhaustNS value returned by runQuery
     * @param m           the incoming query message
     * @return true when the query ran, false when an exception was turned
     *         into an error reply
     */
    static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) {
        bool ok = true;
        MSGID responseTo = m.header()->id;

        DbMessage d(m);
        QueryMessage q(d);
        auto_ptr< Message > resp( new Message() );

        CurOp& op = *(c.curop());

        shared_ptr<AssertionException> ex;

        // Non-null only when the SendStaleConfigException handler ran; it then aliases the
        // copy owned by 'ex'.  Recording the dynamic type here avoids downcasting 'ex' later
        // on the strength of the error code alone, which is undefined behaviour when the
        // generic handler stored a plain AssertionException carrying SendStaleConfigCode.
        SendStaleConfigException* staleCopy = NULL;

        try {
            if (!NamespaceString(d.getns()).isCommand()) {
                // Auth checking for Commands happens later.
                Status status = cc().getAuthorizationManager()->checkAuthForQuery(d.getns());
                uassert(16550, status.reason(), status.isOK());
            }
            dbresponse.exhaustNS = runQuery(m, q, op, *resp);
            verify( !resp->empty() );
        }
        catch ( SendStaleConfigException& e ){
            staleCopy = new SendStaleConfigException( e.getns(), e.getInfo().msg, e.getVersionReceived(), e.getVersionWanted() );
            ex.reset( staleCopy );
            ok = false;
        }
        catch ( AssertionException& e ) {
            ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
            ok = false;
        }

        if( ex ){

            op.debug().exceptionInfo = ex->getInfo();
            LOGWITHRATELIMIT {
                log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" <<
                (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
                if( q.ntoskip || q.ntoreturn )
                    log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
            }

            // As before, the stale-config treatment (flag and log wording) is keyed on the
            // error code; the typed fields are only read through a pointer known to be valid.
            const bool staleConfig = ( ex->getCode() == SendStaleConfigCode );
            SendStaleConfigException* scex = staleConfig ? staleCopy : NULL;

            BSONObjBuilder err;
            ex->getInfo().append( err );
            if( scex ){
                err.append( "ns", scex->getns() );
                scex->getVersionReceived().addToBSON( err, "vReceived" );
                scex->getVersionWanted().addToBSON( err, "vWanted" );
            }
            BSONObj errObj = err.done();

            if( staleConfig ){
                log() << "stale version detected during query over "
                      << q.ns << " : " << errObj << endl;
            }
            else{
                log() << "problem detected during query over "
                      << q.ns << " : " << errObj << endl;
            }

            // Reply layout: QueryResult header followed by the error document.
            BufBuilder b;
            b.skip(sizeof(QueryResult));
            b.appendBuf((void*) errObj.objdata(), errObj.objsize());

            // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
            QueryResult * msgdata = (QueryResult *) b.buf();
            b.decouple();
            QueryResult *qr = msgdata;
            qr->_resultFlags() = ResultFlag_ErrSet;
            if( staleConfig ) qr->_resultFlags() |= ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;

            // Discard whatever runQuery left behind and send the error instead.
            resp.reset( new Message() );
            resp->setData( msgdata, true );

        }

        op.debug().responseLength = resp->header()->dataLen();

        dbresponse.response = resp.release();
        dbresponse.responseTo = responseTo;

        return ok;
    }
Ejemplo n.º 10
0
    /**
     * Intercepts a message whose shard version check fails.
     *
     * Nothing is done (false is returned) when the operation code lies outside
     * [2000, 3000) or is dbGetMore, or when shardVersionOk() accepts the
     * namespace.  Otherwise an operation that gets a response is answered with
     * a "$err" reply flagged as a stale shard config, and any other operation
     * is queued as a write-back.
     *
     * Raises uassert 9517 when the message is itself flagged
     * Reserved_FromWriteback, and massert 10422 when the connection has no
     * sharded connection info or its id is not set.
     *
     * @param m           the incoming message
     * @param dbresponse  receives the error reply; asserted non-null on the
     *                    path that builds a reply
     * @return true when the message was handled here, false when the caller
     *         should process it normally
     */
    bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) {
        DEV assert( shardingState.enabled() );

        int op = m.operation();
        if ( op < 2000
                || op >= 3000
                || op == dbGetMore  // cursors are weird
           )
            return false;

        DbMessage d(m);
        const char *ns = d.getns();
        string errmsg;
        // We don't care about the version here, since we're returning it later in the writeback
        ConfigVersion received, wanted;
        if ( shardVersionOk( ns , errmsg, received, wanted ) ) {
            return false;
        }

        LOG(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl;

        if ( doesOpGetAResponse( op ) ) {
            assert( dbresponse );

            // Build a one-document error reply: QueryResult header followed by
            // { $err: errmsg, ns: ns }.
            BufBuilder b( 32768 );
            b.skip( sizeof( QueryResult ) );
            {
                BSONObj obj = BSON( "$err" << errmsg << "ns" << ns );
                b.appendBuf( obj.objdata() , obj.objsize() );
            }

            QueryResult *qr = (QueryResult*)b.buf();
            qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation( opReply );
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;
            b.decouple();

            Message * resp = new Message();
            resp->setData( qr , true );

            dbresponse->response = resp;
            dbresponse->responseTo = m.header()->id;
            return true;
        }

        // A message that already came from a writeback must not be written back again.
        uassert( 9517 , "writeback" , ( d.reservedField() & DbMessage::Reserved_FromWriteback ) == 0 );

        OID writebackID;
        writebackID.init();
        lastError.getSafe()->writeback( writebackID );

        // get( false ) may yield a null pointer, so check it before reading the id instead of
        // dereferencing the result unconditionally.
        ShardedConnectionInfo* info = ShardedConnectionInfo::get( false );
        massert( 10422 ,  "write with bad shard config and no server id!" , info && info->getID().isSet() );
        const OID& clientID = info->getID();

        LOG(1) << "got write with an old config - writing back ns: " << ns << endl;
        LOG(1) << m.toString() << endl;

        BSONObjBuilder b;
        b.appendBool( "writeBack" , true );
        b.append( "ns" , ns );
        b.append( "id" , writebackID );
        b.append( "connectionId" , cc().getConnectionId() );
        b.append( "instanceIdent" , prettyHostName() );
        b.appendTimestamp( "version" , shardingState.getVersion( ns ) );

        // 'info' is known to be non-null here: the massert above throws otherwise.
        b.appendTimestamp( "yourVersion" , info->getVersion(ns) );

        b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) );
        LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;
        writeBackManager.queueWriteBack( clientID.str() , b.obj() );

        return true;
    }
Ejemplo n.º 11
0
    /**
     * Runs the query carried by message m and stores the reply in dbresponse.
     *
     * A SendStaleConfigException or any other AssertionException raised by
     * runQuery is copied, logged and converted into a single-document error
     * reply flagged ResultFlag_ErrSet.  When the exception code is
     * SendStaleConfigCode the reply is also flagged
     * ResultFlag_ShardConfigStale; when the exception really was a
     * SendStaleConfigException the error document additionally carries "ns".
     *
     * @param c           client whose current operation receives debug info
     * @param dbresponse  receives the reply message, the id it responds to and
     *                    the exhaust value returned by runQuery
     * @param m           the incoming query message
     * @return true when the query ran, false when an exception was turned
     *         into an error reply
     */
    static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) {
        bool ok = true;
        MSGID responseTo = m.header()->id;

        DbMessage d(m);
        QueryMessage q(d);
        auto_ptr< Message > resp( new Message() );

        CurOp& op = *(c.curop());

        shared_ptr<AssertionException> ex;

        // Non-null only when the SendStaleConfigException handler ran; it then aliases the
        // copy owned by 'ex'.  Recording the dynamic type here avoids downcasting 'ex' later
        // on the strength of the error code alone, which is undefined behaviour when the
        // generic handler stored a plain AssertionException carrying SendStaleConfigCode.
        SendStaleConfigException* staleCopy = NULL;

        try {
            dbresponse.exhaust = runQuery(m, q, op, *resp);
            assert( !resp->empty() );
        }
        catch ( SendStaleConfigException& e ){
            staleCopy = new SendStaleConfigException( e.getns(), e.getInfo().msg );
            ex.reset( staleCopy );
            ok = false;
        }
        catch ( AssertionException& e ) {
            ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) );
            ok = false;
        }

        if( ex ){

            op.debug().exceptionInfo = ex->getInfo();
            LOGWITHRATELIMIT {
                log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" <<
                (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl;
                if( q.ntoskip || q.ntoreturn )
                    log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl;
            }

            // As before, the stale-config flag is keyed on the error code; the typed field is
            // only read through a pointer known to be valid.
            const bool staleConfig = ( ex->getCode() == SendStaleConfigCode );
            SendStaleConfigException* scex = staleConfig ? staleCopy : NULL;

            BSONObjBuilder err;
            ex->getInfo().append( err );
            if( scex ) err.append( "ns", scex->getns() );
            BSONObj errObj = err.done();

            log() << errObj << endl;

            // Reply layout: QueryResult header followed by the error document.
            BufBuilder b;
            b.skip(sizeof(QueryResult));
            b.appendBuf((void*) errObj.objdata(), errObj.objsize());

            // todo: call replyToQuery() from here instead of this!!! see dbmessage.h
            QueryResult * msgdata = (QueryResult *) b.buf();
            b.decouple();
            QueryResult *qr = msgdata;
            qr->_resultFlags() = ResultFlag_ErrSet;
            if( staleConfig ) qr->_resultFlags() |= ResultFlag_ShardConfigStale;
            qr->len = b.len();
            qr->setOperation(opReply);
            qr->cursorId = 0;
            qr->startingFrom = 0;
            qr->nReturned = 1;

            // Discard whatever runQuery left behind and send the error instead.
            resp.reset( new Message() );
            resp->setData( msgdata, true );

        }

        op.debug().responseLength = resp->header()->dataLen();

        dbresponse.response = resp.release();
        dbresponse.responseTo = responseTo;

        return ok;
    }
Ejemplo n.º 12
0
    /**
     * Compares the connection's shard version with the one wanted for the namespace targeted
     * by m. On a mismatch the op is either answered with a stale-config error reply (ops that
     * get a response) or queued for writeback (ops that do not).
     *
     * @param m incoming message
     * @param dbresponse receives the stale-config reply; verified non-NULL when the op gets a
     *        response
     * @return false when nothing was done here (op code outside [2000, 3000), a getMore, or
     *         the shard version is ok); true when the message was answered or queued
     */
    bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) {
        DEV verify( shardingState.enabled() );

        const int opCode = m.operation();
        // getMore is deliberately left out: cursors are weird
        const bool isVersionedOp = ( opCode >= 2000 && opCode < 3000 && opCode != dbGetMore );
        if ( !isVersionedOp ) {
            return false;
        }

        DbMessage dbmsg(m);
        const char *ns = dbmsg.getns();
        string errmsg;
        // The versions themselves are not inspected here; they are only passed on in the
        // error reply / writeback below.
        ChunkVersion received, wanted;
        const bool versionMatches = shardVersionOk( ns , errmsg, received, wanted );
        if ( versionMatches ) {
            return false;
        }

        const bool expectsReply = doesOpGetAResponse( opCode );

        LOG(1) << "connection sharding metadata does not match for collection " << ns
               << ", will retry (wanted : " << wanted << ", received : " << received << ")"
               << ( expectsReply ? "" : " (queuing writeback)" ) << endl;

        if ( expectsReply ) {
            verify( dbresponse );

            // Reply layout: QueryResult header followed by one error document.
            BufBuilder replyBuf( 32768 );
            replyBuf.skip( sizeof( QueryResult ) );
            {
                BSONObjBuilder errBuilder;

                errBuilder.append( "$err", errmsg );
                errBuilder.append( "ns", ns );
                wanted.addToBSON( errBuilder, "vWanted" );
                received.addToBSON( errBuilder, "vReceived" );

                BSONObj errObj = errBuilder.obj();

                replyBuf.appendBuf( errObj.objdata() , errObj.objsize() );
            }

            QueryResult *reply = reinterpret_cast<QueryResult*>( replyBuf.buf() );
            reply->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale;
            reply->len = replyBuf.len();
            reply->setOperation( opReply );
            reply->cursorId = 0;
            reply->startingFrom = 0;
            reply->nReturned = 1;
            replyBuf.decouple();

            Message * staleReply = new Message();
            staleReply->setData( reply , true );

            dbresponse->response = staleReply;
            dbresponse->responseTo = m.header()->id;
            return true;
        }

        // From here on the op gets no response, so it has to go through the writeback queue.
        uassert(9517, "cannot queue a writeback operation to the writeback queue",
                (dbmsg.reservedField() & Reserved_FromWriteback) == 0);

        const OID& clientID = ShardedConnectionInfo::get(false)->getID();
        massert( 10422 ,  "write with bad shard config and no server id!" , clientID.isSet() );

        // Checked up front: an oversized op would otherwise fail later while the writeback is
        // wrapped, both here and when it is returned as a command result. The margin below
        // the internal maximum is chosen so that ops of exactly 16MB can still be handled.
        massert( 16437, "data size of operation is too large to queue for writeback",
                 m.dataSize() < BSONObjMaxInternalSize - (8 * 1024));

        LOG(1) << "writeback queued for " << m.toString() << endl;

        BSONObjBuilder writeback;
        writeback.appendBool( "writeBack" , true );
        writeback.append( "ns" , ns );
        writeback.append( "connectionId" , cc().getConnectionId() );
        writeback.append( "instanceIdent" , prettyHostName() );
        wanted.addToBSON( writeback );
        received.addToBSON( writeback, "yourVersion" );

        writeback.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) );
        LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;

        // The builder itself is handed to queueWriteBack so that it can pick the writebackId.
        // Ids must be ascending: mongos assumes that once it has seen a greater writeback,
        // all earlier ones have been processed.
        OID writebackID = writeBackManager.queueWriteBack( clientID.str() , writeback );

        lastError.getSafe()->writeback( writebackID );

        return true;
    }
Ejemplo n.º 13
0
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     *
     * Pulls the next batch of documents from the Runner attached to the ClientCursor
     * identified by cursorid and packs them into a QueryResult reply.
     *
     * @param ns namespace named in the request; must equal the cursor's namespace
     *        (uassert 17011)
     * @param ntoreturn maximum number of documents in this batch; 0 means no count limit, so
     *        only MaxBytesToReturnToClientAtOnce bounds the batch
     * @param cursorid id of the cursor to read from
     * @param curop current operation; receives the cursor's leftover time limit
     * @param pass attempt counter supplied by the caller; the slave location is only updated
     *        on pass 0
     * @param exhaust out: false unless the cursor stays open and has QueryOption_Exhaust set
     * @param isCursorAuthorized out: set to true once the namespace check has passed
     * @return the reply buffer, decoupled from the local BufBuilder. cursorId in the reply is
     *         0 when the cursor was not found or its runner reported EOF/DEAD.
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
        Client::ReadContext ctx(ns);

        log() << "running getMore in new system, cursorid " << cursorid << endl;

        // TODO: Document.
        // TODO: do this when we can pass in our own parsed query
        //replVerifyReadsOk();

        // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
        // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
        // CC, so don't delete it.
        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // What number result are we starting at?  Used to fill out the reply.
            startingResult = cc->pos();

            // What gives us results.
            Runner* runner = cc->getRunner();
            const int queryOptions = cc->queryOptions();

            // Get results out of the runner.
            // TODO: There may be special handling required for tailable cursors?
            runner->restoreState();

            BSONObj obj;
            Runner::RunnerState state;
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
                // If we're sharded make sure that we don't return any data that hasn't been
                // migrated off of our shard yet.
                if (collMetadata) {
                    KeyPattern kp(collMetadata->getKeyPattern());
                    if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
                }

                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (queryOptions & QueryOption_OplogReplay) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                // Stop once the batch is full.  The count limit only applies when ntoreturn is
                // non-zero; with ntoreturn == 0 the batch is bounded by size alone (the same
                // rule processGetMore() uses).  Testing numResults here instead would end
                // every unlimited batch after a single document.
                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            if (Runner::RUNNER_DEAD == state || Runner::RUNNER_EOF == state) {
                log() << "getMore(): runner with id " << cursorid << " EOF/DEAD, state = "
                      << static_cast<int>(state) << endl;
                // TODO: If the cursor is tailable we don't kill it if it's eof.
                ccPin.free();
                // cc is now invalid, as is the runner
                cursorid = 0;
                cc = NULL;
            }
            else {
                // Continue caching the ClientCursor.
                cc->incPos(numResults);
                runner->saveState();

                // Possibly note slave's position in the oplog.
                if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                    cc->slaveReadTill(slaveReadTill);
                }

                exhaust = (queryOptions & QueryOption_Exhaust);

                // If the getmore had a time limit, remaining time is "rolled over" back to the
                // cursor (for use by future getmore ops).
                cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
            }
        }

        // Fill in the reply header that was skipped at the start of the buffer.
        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        return qr;
    }
Ejemplo n.º 14
0
    /**
     * Handles a getMore request: pulls the next batch of documents from the ClientCursor
     * identified by cursorid and packs them into a QueryResult reply.
     *
     * @param ns namespace named in the request; must equal the cursor's namespace
     *        (uassert 14833)
     * @param ntoreturn maximum number of documents in this batch; 0 disables the count limit
     * @param cursorid id of the cursor to read from
     * @param curop current operation; its debug info receives the cursor's query
     * @param pass attempt counter supplied by the caller; the slave location is only updated
     *        on pass 0, and the await-data early return is only taken while pass < 1000
     * @param exhaust out: set from the cursor's QueryOption_Exhaust flag when the cursor is
     *        still open after the batch, false otherwise
     * @param isCursorAuthorized out: set to true once the namespace, oplog-age and
     *        transaction checks have passed
     * @return the reply buffer, decoupled from the local BufBuilder, or 0 when a tailable
     *         cursor with QueryOption_AwaitData has produced nothing yet and pass < 1000
     */
    QueryResult* processGetMore(const char* ns,
                                int ntoreturn,
                                long long cursorid,
                                CurOp& curop,
                                int pass,
                                bool& exhaust,
                                bool* isCursorAuthorized ) {
        exhaust = false;
        ClientCursor::Pin p(cursorid);
        ClientCursor *client_cursor = p.c();

        int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

        // Reply layout: QueryResult header (filled in at the end) followed by the documents.
        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));
        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        if ( unlikely(!client_cursor) ) {
            LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // check for spoofing of the ns such that it does not match the one originally there for the cursor
            uassert(14833, "auth error", str::equals(ns, client_cursor->ns().c_str()));
            uassert(16784, "oplog cursor reading data that is too old", !client_cursor->lastOpForSlaveTooOld());

            // Install per-operation settings on the current client before reading.
            int queryOptions = client_cursor->queryOptions();
            OpSettings settings;
            settings.setBulkFetch(true);
            settings.setQueryCursorMode(DEFAULT_LOCK_CURSOR);
            settings.setCappedAppendPK(queryOptions & QueryOption_AddHiddenPK);
            cc().setOpSettings(settings);

            // Check if the cursor is part of a multi-statement transaction. If it is
            // and this is not the right client (meaning the current transaction stack
            // does not match that in the cursor), it will uassert. If the cursor is
            // not part of a multi-statement transaction, then we need to use the stack
            // in the cursor for this scope.
            const bool cursorPartOfMultiStatementTxn = client_cursor->checkMultiStatementTxn();
            scoped_ptr<Client::WithTxnStack> wts;
            if (!cursorPartOfMultiStatementTxn) {
                // For simplicity, prevent multi-statement transactions from
                // reading cursors it didn't create.
                uassert(16813, "Cannot getMore() on a cursor not created by this multi-statement transaction",
                           !cc().hasTxn());
                wts.reset(new Client::WithTxnStack(client_cursor->transactions)); 
            }

            *isCursorAuthorized = true;

            if (pass == 0) {
                client_cursor->updateSlaveLocation( curop );
            }
            
            curop.debug().query = client_cursor->query();

            // Position of the first document of this batch; reported as startingFrom.
            start = client_cursor->pos();
            Cursor *c = client_cursor->c();

            // This manager may be stale, but it's the state of chunking when the cursor was created.
            ShardChunkManagerPtr manager = client_cursor->getChunkManager();

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        /* when a tailable cursor hits "EOF", ok() goes false, and current() is null.  however
                           advance() can still be retried as a reactivation attempt.  when there is new data, it will
                           return true.  that's what we are doing here.
                           */
                        if ( c->advance() )
                            continue;

                        // Nothing to send yet on an await-data cursor: return 0 instead of
                        // an empty reply while pass is still below 1000.
                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            return 0;
                        }

                        break;
                    }
                    // Non-tailable cursor is finished: drop the pin before erasing it.
                    p.release();

                    // Done with this cursor, steal transaction stack back to commit or abort it here.
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    client_cursor = 0;
                    break;
                }

                MatchDetails details;
                if ( client_cursor->fields && client_cursor->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                    // field projection specified, and contains an array operator
                    details.requestElemMatchKey();
                }

                // in some cases (clone collection) there won't be a matcher
                if ( !c->currentMatches( &details ) ) {
                    // no match: nothing to add, fall through to advance()
                }
                else if ( manager && ! manager->belongsToMe( client_cursor ) ){
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
                }
                else {
                    if( c->getsetdup(c->currPK()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        // save this so that at the end of the loop,
                        // we can update the location for write concern
                        // in replication. Note that if this cursor is not
                        // doing replication, this is pointless
                        if ( client_cursor->queryOptions() & QueryOption_OplogReplay ) {
                            client_cursor->storeOpForSlave( c->current() );
                        }
                        n++;

                        client_cursor->fillQueryResultFromObj( b, &details );

                        // Batch is full (count limit reached, or reply larger than the byte
                        // limit): step past the document just added, record the new
                        // position and stop.
                        // NOTE(review): incPos() is only reached on this path; a batch that
                        // ends at a tailable EOF leaves pos unchanged - confirm intended.
                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            client_cursor->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();
            }
            
            if ( client_cursor ) {
                // Cursor stays open for further getMores.
                client_cursor->resetIdleAge();
                exhaust = client_cursor->queryOptions() & QueryOption_Exhaust;
            } else if (!cursorPartOfMultiStatementTxn) {
                // This cursor is done and it wasn't part of a multi-statement
                // transaction. We can commit the transaction now.
                cc().commitTopTxn();
                wts->release();
            }
        }

        // Fill in the reply header that was skipped at the start of the buffer.
        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }
Ejemplo n.º 15
0
    /**
     * Also called by db/ops/query.cpp.  This is the new getMore entry point.
     *
     * Pulls the next batch of documents from the Runner attached to the ClientCursor
     * identified by cursorid and packs them into a QueryResult reply.
     *
     * @param ns namespace named in the request; must equal the cursor's namespace
     *        (uassert 17011)
     * @param ntoreturn maximum number of documents in this batch; 0 means no count limit, so
     *        only MaxBytesToReturnToClientAtOnce bounds the batch
     * @param cursorid id of the cursor to read from
     * @param curop current operation; receives the cursor's leftover time limit
     * @param pass attempt counter supplied by the caller; the slave location is only updated
     *        on pass 0
     * @param exhaust out: set from the parsed query's QueryOption_Exhaust option when the
     *        cursor is found, false otherwise
     * @param isCursorAuthorized out: set to true once the namespace check has passed
     * @return the reply buffer, decoupled from the local BufBuilder
     */
    QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized) {
        exhaust = false;
        int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

        BufBuilder bb(bufSize);
        bb.skip(sizeof(QueryResult));

        // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
        Client::ReadContext ctx(ns);

        // TODO: Document.
        replVerifyReadsOk();

        ClientCursorPin ccPin(cursorid);
        ClientCursor* cc = ccPin.c();

        // These are set in the QueryResult msg we return.
        int resultFlags = ResultFlag_AwaitCapable;

        int numResults = 0;
        int startingResult = 0;

        if (NULL == cc) {
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // Quote: check for spoofing of the ns such that it does not match the one originally
            // there for the cursor
            uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
            *isCursorAuthorized = true;

            // TODO: fail point?

            // If the operation that spawned this cursor had a time limit set, apply leftover
            // time to this getmore.
            curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
            // TODO:
            // curop.debug().query = BSONForQuery
            // curop.setQuery(curop.debug().query);

            // TODO: What is pass?
            if (0 == pass) { cc->updateSlaveLocation(curop); }

            CollectionMetadataPtr collMetadata = cc->getCollMetadata();

            // If we're replaying the oplog, we save the last time that we read.
            OpTime slaveReadTill;

            // Position of the first document of this batch; reported as startingFrom.
            startingResult = cc->pos();

            Runner* runner = cc->getRunner();
            const ParsedQuery& pq = runner->getQuery().getParsed();

            // Get results out of the runner.
            // TODO: There may be special handling required for tailable cursors?
            runner->restoreState();
            BSONObj obj;
            // TODO: Differentiate EOF from error.
            while (runner->getNext(&obj)) {
                // If we're sharded make sure that we don't return any data that hasn't been
                // migrated off of our shard yet.
                if (collMetadata) {
                    KeyPattern kp(collMetadata->getKeyPattern());
                    if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
                }

                // Add result to output buffer.
                bb.appendBuf((void*)obj.objdata(), obj.objsize());

                // Count the result.
                ++numResults;

                // Possibly note slave's position in the oplog.
                if (pq.hasOption(QueryOption_OplogReplay)) {
                    BSONElement e = obj["ts"];
                    if (Date == e.type() || Timestamp == e.type()) {
                        slaveReadTill = e._opTime();
                    }
                }

                // Stop once the batch is full.  The count limit only applies when ntoreturn is
                // non-zero; with ntoreturn == 0 the batch is bounded by size alone.  Testing
                // numResults here instead would end every unlimited batch after a single
                // document.
                if ((ntoreturn && numResults >= ntoreturn)
                    || bb.len() > MaxBytesToReturnToClientAtOnce) {
                    break;
                }
            }

            cc->incPos(numResults);
            runner->saveState();

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = pq.hasOption(QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }

        // Fill in the reply header that was skipped at the start of the buffer.
        QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
        qr->len = bb.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = startingResult;
        qr->nReturned = numResults;
        bb.decouple();
        return qr;
    }
Ejemplo n.º 16
0
    /**
     * Handles a getMore request: pulls the next batch of documents from the ClientCursor
     * identified by cursorid and packs them into a QueryResult reply.
     *
     * @param ns namespace named in the request; must equal the cursor's namespace
     *        (uassert 14833)
     * @param ntoreturn maximum number of documents in this batch; 0 disables the count limit
     * @param cursorid id of the cursor to read from
     * @param curop current operation; its debug info receives the cursor's query
     * @param pass attempt counter supplied by the caller; the slave location is only updated
     *        on pass 0, and the await-data early return is only taken while pass < 1000
     * @param exhaust out: set from the cursor's QueryOption_Exhaust flag when the cursor is
     *        still open after the batch, false otherwise
     * @return the reply buffer, decoupled from the local BufBuilder, or 0 when a tailable
     *         cursor with QueryOption_AwaitData has produced nothing yet and pass < 1000
     */
    QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) {
        exhaust = false;
        ClientCursor::Pointer p(cursorid);
        ClientCursor *cc = p.c();

        int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

        // Reply layout: QueryResult header (filled in at the end) followed by the documents.
        BufBuilder b( bufSize );
        b.skip(sizeof(QueryResult));
        int resultFlags = ResultFlag_AwaitCapable;
        int start = 0;
        int n = 0;

        if ( unlikely(!cc) ) {
            LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
            cursorid = 0;
            resultFlags = ResultFlag_CursorNotFound;
        }
        else {
            // check for spoofing of the ns such that it does not match the one originally there for the cursor
            uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

            if ( pass == 0 )
                cc->updateSlaveLocation( curop );

            int queryOptions = cc->queryOptions();
            
            curop.debug().query = cc->query();

            // Position of the first document of this batch; reported as startingFrom.
            start = cc->pos();
            Cursor *c = cc->c();
            c->recoverFromYield();
            // Location of the last document added to the reply; passed to storeOpForSlave()
            // after the loop.
            DiskLoc last;

            // Only set when the keys are unmodified, the cursor is not multikey and a
            // projection exists; checkKey() decides whether a key-only projection applies.
            scoped_ptr<Projection::KeyOnly> keyFieldsOnly;
            if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields )
                keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) );

            // This manager may be stale, but it's the state of chunking when the cursor was created.
            ShardChunkManagerPtr manager = cc->getChunkManager();

            while ( 1 ) {
                if ( !c->ok() ) {
                    if ( c->tailable() ) {
                        /* when a tailable cursor hits "EOF", ok() goes false, and current() is null.  however
                           advance() can still be retried as a reactivation attempt.  when there is new data, it will
                           return true.  that's what we are doing here.
                           */
                        if ( c->advance() )
                            continue;

                        // Nothing to send yet on an await-data cursor: return 0 instead of
                        // an empty reply while pass is still below 1000.
                        if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                            return 0;
                        }

                        break;
                    }
                    // Non-tailable cursor is finished: release the pointer, then erase it.
                    p.release();
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    cc = 0;
                    break;
                }

                // in some cases (clone collection) there won't be a matcher
                if ( !c->currentMatches() ) {
                    // no match: nothing to add, fall through to advance()
                }
                else if ( manager && ! manager->belongsToMe( cc ) ){
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl;
                }
                else {
                    if( c->getsetdup(c->currLoc()) ) {
                        //out() << "  but it's a dup \n";
                    }
                    else {
                        last = c->currLoc();
                        n++;

                        if ( keyFieldsOnly ) {
                            // Build the result from the index key alone.
                            fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) );
                        }
                        else {
                            BSONObj js = c->current();
                            // show disk loc should be part of the main query, not in an $or clause, so this should be ok
                            fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0));
                        }

                        // Batch is full (count limit reached, or reply larger than the byte
                        // limit): step past the document just added, record the new
                        // position and stop.
                        // NOTE(review): incPos() is only reached on this path; a batch that
                        // ends at a tailable EOF leaves pos unchanged - confirm intended.
                        if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                            c->advance();
                            cc->incPos( n );
                            break;
                        }
                    }
                }
                c->advance();

                // yieldSometimes() returning false is treated as "cursor is gone": erase it
                // and tell the pointer it was deleted.
                if ( ! cc->yieldSometimes( ClientCursor::MaybeCovered ) ) {
                    ClientCursor::erase(cursorid);
                    cursorid = 0;
                    cc = 0;
                    p.deleted();
                    break;
                }
            }
            
            if ( cc ) {
                // Cursor stays open for further getMores.
                if ( c->supportYields() ) {
                    ClientCursor::YieldData data;
                    verify( cc->prepareToYield( data ) );
                }
                else {
                    cc->c()->noteLocation();
                }
                cc->mayUpgradeStorage();
                cc->storeOpForSlave( last );
                exhaust = cc->queryOptions() & QueryOption_Exhaust;
            }
        }

        // Fill in the reply header that was skipped at the start of the buffer.
        QueryResult *qr = (QueryResult *) b.buf();
        qr->len = b.len();
        qr->setOperation(opReply);
        qr->_resultFlags() = resultFlags;
        qr->cursorId = cursorid;
        qr->startingFrom = start;
        qr->nReturned = n;
        b.decouple();

        return qr;
    }