bool handlePossibleShardedMessage( Message &m, DbResponse &dbresponse ){ if ( shardConfigServer.empty() ){ return false; } int op = m.data->operation(); if ( op < 2000 || op >= 3000 ) return false; const char *ns = m.data->_data + 4; string errmsg; if ( shardVersionOk( ns , errmsg ) ){ return false; } log() << "shardVersionOk failed ns:" << ns << " " << errmsg << endl; if ( doesOpGetAResponse( op ) ){ BufBuilder b( 32768 ); b.skip( sizeof( QueryResult ) ); { BSONObj obj = BSON( "$err" << errmsg ); b.append( obj.objdata() , obj.objsize() ); } QueryResult *qr = (QueryResult*)b.buf(); qr->_resultFlags() = QueryResult::ResultFlag_ErrSet | QueryResult::ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation( opReply ); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; b.decouple(); Message * resp = new Message(); resp->setData( qr , true ); dbresponse.response = resp; dbresponse.responseTo = m.data->id; return true; } OID * clientID = clientServerIds.get(); massert( 10422 , "write with bad shard config and no server id!" , clientID ); log() << "got write with an old config - writing back" << endl; BSONObjBuilder b; b.appendBool( "writeBack" , true ); b.append( "ns" , ns ); b.appendBinData( "msg" , m.data->len , bdtCustom , (char*)(m.data) ); log() << "writing back msg with len: " << m.data->len << " op: " << m.data->_operation << endl; clientQueues[clientID->str()]->push( b.obj() ); return true; }
static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ){ bool ok = true; MSGID responseTo = m.header()->id; DbMessage d(m); QueryMessage q(d); auto_ptr< Message > resp( new Message() ); CurOp& op = *(c.curop()); try { dbresponse.exhaust = runQuery(m, q, op, *resp); assert( !resp->empty() ); } catch ( AssertionException& e ) { ok = false; op.debug().str << " exception "; LOGSOME { log() << "assertion " << e.toString() << " ns:" << q.ns << " query:" << (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl; if( q.ntoskip || q.ntoreturn ) log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl; } BSONObjBuilder err; e.getInfo().append( err ); BSONObj errObj = err.done(); BufBuilder b; b.skip(sizeof(QueryResult)); b.appendBuf((void*) errObj.objdata(), errObj.objsize()); // todo: call replyToQuery() from here instead of this!!! see dbmessage.h QueryResult * msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->_resultFlags() = ResultFlag_ErrSet; if ( e.getCode() == StaleConfigInContextCode ) qr->_resultFlags() |= ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; resp.reset( new Message() ); resp->setData( msgdata, true ); } if ( op.shouldDBProfile( 0 ) ){ op.debug().str << " bytes:" << resp->header()->dataLen(); } dbresponse.response = resp.release(); dbresponse.responseTo = responseTo; return ok; }
/* empty result for error conditions */ QueryResult* emptyMoreResult(long long cursorid) { BufBuilder b(32768); b.skip(sizeof(QueryResult)); QueryResult *qr = (QueryResult *) b.buf(); qr->cursorId = 0; // 0 indicates no more data to retrieve. qr->startingFrom = 0; qr->len = b.len(); qr->setOperation(opReply); qr->nReturned = 0; b.decouple(); return qr; }
void replyToQuery( int queryResultFlags, Message& response, const BSONObj& resultObj ) { BufBuilder bufBuilder; bufBuilder.skip( sizeof( QueryResult )); bufBuilder.appendBuf( reinterpret_cast< void *>( const_cast< char* >( resultObj.objdata() )), resultObj.objsize() ); QueryResult* queryResult = reinterpret_cast< QueryResult* >( bufBuilder.buf() ); bufBuilder.decouple(); queryResult->_resultFlags() = queryResultFlags; queryResult->len = bufBuilder.len(); queryResult->setOperation( opReply ); queryResult->cursorId = 0; queryResult->startingFrom = 0; queryResult->nReturned = 1; response.setData( queryResult, true ); // transport will free }
void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) { BufBuilder b; b.skip(sizeof(QueryResult)); b.appendBuf((void*) obj.objdata(), obj.objsize()); QueryResult* msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->_resultFlags() = queryResultFlags; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; Message *resp = new Message(); resp->setData(msgdata, true); // transport will free dbresponse.response = resp; dbresponse.responseTo = m.header()->id; }
void replyToQuery(int queryResultFlags, AbstractMessagingPort* p, Message& requestMsg, void *data, int size, int nReturned, int startingFrom, long long cursorId ) { BufBuilder b(32768); b.skip(sizeof(QueryResult)); b.appendBuf(data, size); QueryResult *qr = (QueryResult *) b.buf(); qr->_resultFlags() = queryResultFlags; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = cursorId; qr->startingFrom = startingFrom; qr->nReturned = nReturned; b.decouple(); Message resp(qr, true); p->reply(requestMsg, resp, requestMsg.header()->id); }
/** * Also called by db/ops/query.cpp. This is the new getMore entry point. */ QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized) { exhaust = false; int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(bufSize); bb.skip(sizeof(QueryResult)); // This is a read lock. TODO: There is a cursor flag for not needing this. Do we care? Client::ReadContext ctx(ns); QLOG() << "running getMore in new system, cursorid " << cursorid << endl; // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), the only state where // reads are allowed is PRIMARY (or master in master/slave). This function uasserts if // reads are not okay. replVerifyReadsOk(); // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. ClientCursorPin ccPin(cursorid); ClientCursor* cc = ccPin.c(); // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Quote: check for spoofing of the ns such that it does not match the one originally // there for the cursor uassert(17011, "auth error", str::equals(ns, cc->ns().c_str())); *isCursorAuthorized = true; // TODO: fail point? // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // TODO: // curop.debug().query = BSONForQuery // curop.setQuery(curop.debug().query); // TODO: What is pass? if (0 == pass) { cc->updateSlaveLocation(curop); } CollectionMetadataPtr collMetadata = cc->getCollMetadata(); // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); // What gives us results. Runner* runner = cc->getRunner(); const int queryOptions = cc->queryOptions(); // Get results out of the runner. runner->restoreState(); BSONObj obj; Runner::RunnerState state; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (queryOptions & QueryOption_OplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } if ((ntoreturn && numResults >= ntoreturn) || bb.len() > MaxBytesToReturnToClientAtOnce) { break; } } if (Runner::RUNNER_EOF == state && 0 == numResults && (queryOptions & QueryOption_CursorTailable) && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) { // If the cursor is tailable we don't kill it if it's eof. We let it try to get // data some # of times first. return 0; } bool saveClientCursor = false; if (Runner::RUNNER_DEAD == state || Runner::RUNNER_ERROR == state) { // If we're dead there's no way to get more results. saveClientCursor = false; // In the old system tailable capped cursors would be killed off at the // cursorid level. If a tailable capped cursor is nuked the cursorid // would vanish. // // In the new system they die and are cleaned up later (or time out). // So this is where we get to remove the cursorid. if (0 == numResults) { resultFlags = ResultFlag_CursorNotFound; } } else if (Runner::RUNNER_EOF == state) { // EOF is also end of the line unless it's tailable. saveClientCursor = queryOptions & QueryOption_CursorTailable; } else { verify(Runner::RUNNER_ADVANCED == state); saveClientCursor = true; } if (!saveClientCursor) { ccPin.deleteUnderlying(); // cc is now invalid, as is the runner cursorid = 0; cc = NULL; QLOG() << "getMore NOT saving client cursor, ended w/state " << Runner::statestr(state) << endl; } else { // Continue caching the ClientCursor. cc->incPos(numResults); runner->saveState(); QLOG() << "getMore saving client cursor ended w/state " << Runner::statestr(state) << endl; // Possibly note slave's position in the oplog. if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = (queryOptions & QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } } QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); qr->len = bb.len(); qr->setOperation(opReply); qr->_resultFlags() = resultFlags; qr->cursorId = cursorid; qr->startingFrom = startingResult; qr->nReturned = numResults; bb.decouple(); QLOG() << "getMore returned " << numResults << " results\n"; return qr; }
QueryResult* processGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized ) { bool hasRunner = false; // Scoped to kill the pin after seeing if the runner's there. { // See if there's a runner. We do this until agg. is behind a Runner instead of a CC. ClientCursorPin p(cursorid); ClientCursor *cc = p.c(); if (NULL != cc && NULL != cc->getRunner()) { hasRunner = true; } } if (hasRunner) { return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, isCursorAuthorized); } exhaust = false; int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce; BufBuilder b( bufSize ); b.skip(sizeof(QueryResult)); int resultFlags = ResultFlag_AwaitCapable; int start = 0; int n = 0; scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns)); // call this readlocked so state can't change replVerifyReadsOk(); ClientCursorPin p(cursorid); ClientCursor *cc = p.c(); if ( unlikely(!cc) ) { LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl; cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Some internal users create a ClientCursor with a Runner. Don't crash if this // happens. Instead, hand them off to the new framework. if (NULL != cc->getRunner()) { p.release(); return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, isCursorAuthorized); } // check for spoofing of the ns such that it does not match the one originally there for the cursor uassert(14833, "auth error", str::equals(ns, cc->ns().c_str())); *isCursorAuthorized = true; // This must be done after auth check to ensure proper cleanup. uassert(16951, "failing getmore due to set failpoint", !MONGO_FAIL_POINT(getMoreError)); // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros( cc->getLeftoverMaxTimeMicros() ); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if ( pass == 0 ) cc->updateSlaveLocation( curop ); int queryOptions = cc->queryOptions(); curop.debug().query = cc->query(); curop.setQuery( cc->query() ); start = cc->pos(); Cursor *c = cc->c(); if (!c->requiresLock()) { // make sure it won't be destroyed under us fassert(16952, !c->shouldDestroyOnNSDeletion()); fassert(16953, !c->supportYields()); ctx.reset(); // unlocks } c->recoverFromYield(); DiskLoc last; // This metadata may be stale, but it's the state of chunking when the cursor was // created. CollectionMetadataPtr metadata = cc->getCollMetadata(); KeyPattern keyPattern( metadata ? metadata->getKeyPattern() : BSONObj() ); while ( 1 ) { if ( !c->ok() ) { if ( c->tailable() ) { // when a tailable cursor hits "EOF", ok() goes false, and current() is // null. however advance() can still be retries as a reactivation attempt. // when there is new data, it will return true. that's what we are doing // here. if ( c->advance() ) continue; if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) { return 0; } break; } p.release(); bool ok = ClientCursor::erase(cursorid); verify(ok); cursorid = 0; cc = 0; break; } MatchDetails details; if ( cc->fields && cc->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) { // field projection specified, and contains an array operator details.requestElemMatchKey(); } // in some cases (clone collection) there won't be a matcher if ( !c->currentMatches( &details ) ) { } else if ( metadata && !metadata->keyBelongsToMe( extractKey(c, keyPattern ) ) ) { LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl; } else { if( c->getsetdup(c->currLoc()) ) { //out() << " but it's a dup \n"; } else { last = c->currLoc(); n++; // Fill out the fields requested by the query. const Projection::KeyOnly *keyFieldsOnly = c->keyFieldsOnly(); if ( keyFieldsOnly ) { fillQueryResultFromObj( b, 0, keyFieldsOnly->hydrate( c->currKey() ), &details ); } else { DiskLoc loc = c->currLoc(); fillQueryResultFromObj( b, cc->fields.get(), c->current(), &details, ( ( cc->pq.get() && cc->pq->showDiskLoc() ) ? &loc : 0 ) ); } if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) { c->advance(); cc->incPos( n ); break; } } } c->advance(); if ( ! cc->yieldSometimes( ( c->ok() && c->keyFieldsOnly() ) ? ClientCursor::DontNeed : ClientCursor::WillNeed ) ) { ClientCursor::erase(cursorid); cursorid = 0; cc = 0; break; } } if ( cc ) { if ( c->supportYields() ) { ClientCursor::YieldData data; verify( cc->prepareToYield( data ) ); } else { cc->c()->noteLocation(); } cc->storeOpForSlave( last ); exhaust = cc->queryOptions() & QueryOption_Exhaust; // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } } QueryResult *qr = (QueryResult *) b.buf(); qr->len = b.len(); qr->setOperation(opReply); qr->_resultFlags() = resultFlags; qr->cursorId = cursorid; qr->startingFrom = start; qr->nReturned = n; b.decouple(); return qr; }
static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) { bool ok = true; MSGID responseTo = m.header()->id; DbMessage d(m); QueryMessage q(d); auto_ptr< Message > resp( new Message() ); CurOp& op = *(c.curop()); shared_ptr<AssertionException> ex; try { if (!NamespaceString(d.getns()).isCommand()) { // Auth checking for Commands happens later. Status status = cc().getAuthorizationManager()->checkAuthForQuery(d.getns()); uassert(16550, status.reason(), status.isOK()); } dbresponse.exhaustNS = runQuery(m, q, op, *resp); verify( !resp->empty() ); } catch ( SendStaleConfigException& e ){ ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg, e.getVersionReceived(), e.getVersionWanted() ) ); ok = false; } catch ( AssertionException& e ) { ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; } if( ex ){ op.debug().exceptionInfo = ex->getInfo(); LOGWITHRATELIMIT { log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" << (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl; if( q.ntoskip || q.ntoreturn ) log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl; } SendStaleConfigException* scex = NULL; if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() ); BSONObjBuilder err; ex->getInfo().append( err ); if( scex ){ err.append( "ns", scex->getns() ); scex->getVersionReceived().addToBSON( err, "vReceived" ); scex->getVersionWanted().addToBSON( err, "vWanted" ); } BSONObj errObj = err.done(); if( scex ){ log() << "stale version detected during query over " << q.ns << " : " << errObj << endl; } else{ log() << "problem detected during query over " << q.ns << " : " << errObj << endl; } BufBuilder b; b.skip(sizeof(QueryResult)); b.appendBuf((void*) errObj.objdata(), errObj.objsize()); // todo: call replyToQuery() from here instead of this!!! see dbmessage.h QueryResult * msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->_resultFlags() = ResultFlag_ErrSet; if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; resp.reset( new Message() ); resp->setData( msgdata, true ); } op.debug().responseLength = resp->header()->dataLen(); dbresponse.response = resp.release(); dbresponse.responseTo = responseTo; return ok; }
/** * Also called by db/ops/query.cpp. This is the new getMore entry point. */ QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized) { exhaust = false; int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(bufSize); bb.skip(sizeof(QueryResult)); // This is a read lock. TODO: There is a cursor flag for not needing this. Do we care? Client::ReadContext ctx(ns); // TODO: Document. replVerifyReadsOk(); ClientCursorPin ccPin(cursorid); ClientCursor* cc = ccPin.c(); // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Quote: check for spoofing of the ns such that it does not match the one originally // there for the cursor uassert(17011, "auth error", str::equals(ns, cc->ns().c_str())); *isCursorAuthorized = true; // TODO: fail point? // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); // TODO: // curop.debug().query = BSONForQuery // curop.setQuery(curop.debug().query); // TODO: What is pass? if (0 == pass) { cc->updateSlaveLocation(curop); } CollectionMetadataPtr collMetadata = cc->getCollMetadata(); // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; startingResult = cc->pos(); Runner* runner = cc->getRunner(); const ParsedQuery& pq = runner->getQuery().getParsed(); // Get results out of the runner. // TODO: There may be special handling required for tailable cursors? runner->restoreState(); BSONObj obj; // TODO: Differentiate EOF from error. while (runner->getNext(&obj)) { // If we're sharded make sure that we don't return any data that hasn't been // migrated off of our shard yet. if (collMetadata) { KeyPattern kp(collMetadata->getKeyPattern()); if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; } } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } if ((numResults && numResults >= ntoreturn) || bb.len() > MaxBytesToReturnToClientAtOnce) { break; } } cc->incPos(numResults); runner->saveState(); // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = pq.hasOption(QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); qr->len = bb.len(); qr->setOperation(opReply); qr->_resultFlags() = resultFlags; qr->cursorId = cursorid; qr->startingFrom = startingResult; qr->nReturned = numResults; bb.decouple(); return qr; }
bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) { DEV assert( shardingState.enabled() ); int op = m.operation(); if ( op < 2000 || op >= 3000 || op == dbGetMore // cursors are weird ) return false; DbMessage d(m); const char *ns = d.getns(); string errmsg; // We don't care about the version here, since we're returning it later in the writeback ConfigVersion received, wanted; if ( shardVersionOk( ns , errmsg, received, wanted ) ) { return false; } LOG(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl; if ( doesOpGetAResponse( op ) ) { assert( dbresponse ); BufBuilder b( 32768 ); b.skip( sizeof( QueryResult ) ); { BSONObj obj = BSON( "$err" << errmsg << "ns" << ns ); b.appendBuf( obj.objdata() , obj.objsize() ); } QueryResult *qr = (QueryResult*)b.buf(); qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation( opReply ); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; b.decouple(); Message * resp = new Message(); resp->setData( qr , true ); dbresponse->response = resp; dbresponse->responseTo = m.header()->id; return true; } uassert( 9517 , "writeback" , ( d.reservedField() & DbMessage::Reserved_FromWriteback ) == 0 ); OID writebackID; writebackID.init(); lastError.getSafe()->writeback( writebackID ); const OID& clientID = ShardedConnectionInfo::get(false)->getID(); massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() ); LOG(1) << "got write with an old config - writing back ns: " << ns << endl; LOG(1) << m.toString() << endl; BSONObjBuilder b; b.appendBool( "writeBack" , true ); b.append( "ns" , ns ); b.append( "id" , writebackID ); b.append( "connectionId" , cc().getConnectionId() ); b.append( "instanceIdent" , prettyHostName() ); b.appendTimestamp( "version" , shardingState.getVersion( ns ) ); ShardedConnectionInfo* info = ShardedConnectionInfo::get( false ); b.appendTimestamp( "yourVersion" , info ? info->getVersion(ns) : (ConfigVersion)0 ); b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) ); LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl; writeBackManager.queueWriteBack( clientID.str() , b.obj() ); return true; }
void receivedQuery(DbResponse& dbresponse, /*AbstractMessagingPort& dbMsgPort, */Message& m, stringstream& ss, bool logit) { MSGID responseTo = m.data->id; DbMessage d(m); QueryMessage q(d); QueryResult* msgdata; try { if (q.fields.get() && q.fields->errmsg) uassert(q.fields->errmsg, false); /* note these are logged BEFORE authentication -- which is sort of ok */ if ( _diaglog.level && logit ) { if ( strstr(q.ns, ".$cmd") ) { /* $cmd queries are "commands" and usually best treated as write operations */ OPWRITE; } else { OPREAD; } } setClient( q.ns ); Client& client = cc(); client.top.setRead(); strncpy(client.curop()->ns, q.ns, Namespace::MaxNsLen); msgdata = runQuery(m, ss ).release(); } catch ( AssertionException& e ) { ss << " exception "; LOGSOME problem() << " Caught Assertion in runQuery ns:" << q.ns << ' ' << e.toString() << '\n'; log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << '\n'; if ( q.query.valid() ) log() << " query:" << q.query.toString() << endl; else log() << " query object is not valid!" << endl; BSONObjBuilder err; err.append("$err", e.msg.empty() ? "assertion during query" : e.msg); BSONObj errObj = err.done(); BufBuilder b; b.skip(sizeof(QueryResult)); b.append((void*) errObj.objdata(), errObj.objsize()); // todo: call replyToQuery() from here instead of this!!! see dbmessage.h msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->resultFlags() = QueryResult::ResultFlag_ErrSet; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; } Message *resp = new Message(); resp->setData(msgdata, true); // transport will free dbresponse.response = resp; dbresponse.responseTo = responseTo; Database *database = cc().database(); if ( database ) { if ( database->profile ) ss << " bytes:" << resp->data->dataLen(); } else { if ( strstr(q.ns, "$cmd") == 0 ) // (this condition is normal for $cmd dropDatabase) log() << "ERROR: receiveQuery: database is null; ns=" << q.ns << endl; } }
QueryResult* processGetMore(const char *ns, int ntoreturn, long long cursorid , CurOp& curop, int pass, bool& exhaust ) { exhaust = false; ClientCursor::Pointer p(cursorid); ClientCursor *cc = p.c(); int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce; BufBuilder b( bufSize ); b.skip(sizeof(QueryResult)); int resultFlags = ResultFlag_AwaitCapable; int start = 0; int n = 0; if ( unlikely(!cc) ) { LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl; cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // check for spoofing of the ns such that it does not match the one originally there for the cursor uassert(14833, "auth error", str::equals(ns, cc->ns().c_str())); if ( pass == 0 ) cc->updateSlaveLocation( curop ); int queryOptions = cc->queryOptions(); curop.debug().query = cc->query(); start = cc->pos(); Cursor *c = cc->c(); c->recoverFromYield(); DiskLoc last; scoped_ptr<Projection::KeyOnly> keyFieldsOnly; if ( cc->modifiedKeys() == false && cc->isMultiKey() == false && cc->fields ) keyFieldsOnly.reset( cc->fields->checkKey( cc->indexKeyPattern() ) ); // This manager may be stale, but it's the state of chunking when the cursor was created. ShardChunkManagerPtr manager = cc->getChunkManager(); while ( 1 ) { if ( !c->ok() ) { if ( c->tailable() ) { /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however advance() can still be retries as a reactivation attempt. when there is new data, it will return true. that's what we are doing here. */ if ( c->advance() ) continue; if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) { return 0; } break; } p.release(); bool ok = ClientCursor::erase(cursorid); verify(ok); cursorid = 0; cc = 0; break; } // in some cases (clone collection) there won't be a matcher if ( !c->currentMatches() ) { } else if ( manager && ! manager->belongsToMe( cc ) ){ LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl; } else { if( c->getsetdup(c->currLoc()) ) { //out() << " but it's a dup \n"; } else { last = c->currLoc(); n++; if ( keyFieldsOnly ) { fillQueryResultFromObj(b, 0, keyFieldsOnly->hydrate( c->currKey() ) ); } else { BSONObj js = c->current(); // show disk loc should be part of the main query, not in an $or clause, so this should be ok fillQueryResultFromObj(b, cc->fields.get(), js, ( cc->pq.get() && cc->pq->showDiskLoc() ? &last : 0)); } if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) { c->advance(); cc->incPos( n ); break; } } } c->advance(); if ( ! cc->yieldSometimes( ClientCursor::MaybeCovered ) ) { ClientCursor::erase(cursorid); cursorid = 0; cc = 0; p.deleted(); break; } } if ( cc ) { if ( c->supportYields() ) { ClientCursor::YieldData data; verify( cc->prepareToYield( data ) ); } else { cc->c()->noteLocation(); } cc->mayUpgradeStorage(); cc->storeOpForSlave( last ); exhaust = cc->queryOptions() & QueryOption_Exhaust; } } QueryResult *qr = (QueryResult *) b.buf(); qr->len = b.len(); qr->setOperation(opReply); qr->_resultFlags() = resultFlags; qr->cursorId = cursorid; qr->startingFrom = start; qr->nReturned = n; b.decouple(); return qr; }
/** * Also called by db/ops/query.cpp. This is the new getMore entry point. */ QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized) { exhaust = false; int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(bufSize); bb.skip(sizeof(QueryResult)); // This is a read lock. TODO: There is a cursor flag for not needing this. Do we care? Client::ReadContext ctx(ns); log() << "running getMore in new system, cursorid " << cursorid << endl; // TODO: Document. // TODO: do this when we can pass in our own parsed query //replVerifyReadsOk(); // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. ClientCursorPin ccPin(cursorid); ClientCursor* cc = ccPin.c(); // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Quote: check for spoofing of the ns such that it does not match the one originally // there for the cursor uassert(17011, "auth error", str::equals(ns, cc->ns().c_str())); *isCursorAuthorized = true; // TODO: fail point? // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); // TODO: // curop.debug().query = BSONForQuery // curop.setQuery(curop.debug().query); // TODO: What is pass? if (0 == pass) { cc->updateSlaveLocation(curop); } CollectionMetadataPtr collMetadata = cc->getCollMetadata(); // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); // What gives us results. Runner* runner = cc->getRunner(); const int queryOptions = cc->queryOptions(); // Get results out of the runner. // TODO: There may be special handling required for tailable cursors? runner->restoreState(); BSONObj obj; Runner::RunnerState state; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // If we're sharded make sure that we don't return any data that hasn't been // migrated off of our shard yet. if (collMetadata) { KeyPattern kp(collMetadata->getKeyPattern()); if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; } } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (queryOptions & QueryOption_OplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } if ((numResults && numResults >= ntoreturn) || bb.len() > MaxBytesToReturnToClientAtOnce) { break; } } if (Runner::RUNNER_DEAD == state || Runner::RUNNER_EOF == state) { log() << "getMore(): runner with id " << cursorid << " EOF/DEAD, state = " << static_cast<int>(state) << endl; // TODO: If the cursor is tailable we don't kill it if it's eof. ccPin.free(); // cc is now invalid, as is the runner cursorid = 0; cc = NULL; } else { // Continue caching the ClientCursor. cc->incPos(numResults); runner->saveState(); // Possibly note slave's position in the oplog. if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = (queryOptions & QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } } QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); qr->len = bb.len(); qr->setOperation(opReply); qr->_resultFlags() = resultFlags; qr->cursorId = cursorid; qr->startingFrom = startingResult; qr->nReturned = numResults; bb.decouple(); return qr; }
/** * This is called by db/ops/query.cpp. This is the entry point for answering a query. */ string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { log() << "Running query on new system: " << q.query.toString() << endl; // This is a read lock. Client::ReadContext ctx(q.ns, dbpath); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner; CanonicalQuery* cq; Status status = getRunner(q, &rawRunner, &cq); if (!status.isOK()) { uasserted(17007, "Couldn't process query " + q.query.toString() + " why: " + status.reason()); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns); // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // TODO: Document why we do this. // TODO: do this when we can pass in our own parsed query //replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here, so we must register it with the active // runners list in ClientCursor. ClientCursor::registerRunner(runner.get()); runner->setYieldPolicy(Runner::YIELD_AUTO); BSONObj obj; Runner::RunnerState state; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // If we're sharded make sure that we don't return any data that hasn't been migrated // off of our shared yet. if (collMetadata) { // This information can change if we yield and as such we must make sure to re-fetch // it if we yield. KeyPattern kp(collMetadata->getKeyPattern()); // This performs excessive BSONObj creation but that's OK for now. if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; } } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; const bool isExplain = pq.isExplain(); if (isExplain && enoughForExplain(pq, numResults)) { break; } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { saveClientCursor = true; } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. ClientCursor::deregisterRunner(runner.get()); // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } // TODO: Stage creation can set tailable depending on what's in the parsed query. We have // the full parsed query available during planning...set it there. // // TODO: If we're tailable we want to save the client cursor. Make sure we do this later. //if (pq.hasOption(QueryOption_CursorTailable) && pq.getNumToReturn() != 1) { ... } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); log() << "caching runner with cursorid " << ccId << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; // TODO: nscanned is bogus. // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL ); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
/** * Run a query with a cursor provided by the query optimizer, or FindingStartCursor. * @returns true if client cursor was saved, false if the query has completed. */ bool queryWithQueryOptimizer( int queryOptions, const string& ns, const BSONObj &jsobj, CurOp& curop, const BSONObj &query, const BSONObj &order, const shared_ptr<ParsedQuery> &pq_shared, const ConfigVersion &shardingVersionAtStart, const bool getCachedExplainPlan, const bool inMultiStatementTxn, Message &result ) { const ParsedQuery &pq( *pq_shared ); shared_ptr<Cursor> cursor; QueryPlanSummary queryPlan; const bool tailable = pq.hasOption( QueryOption_CursorTailable ) && pq.getNumToReturn() != 1; LOG(1) << "query beginning read-only transaction. tailable: " << tailable << endl; BSONObj oldPlan; if (getCachedExplainPlan) { scoped_ptr<MultiPlanScanner> mps( MultiPlanScanner::make( ns.c_str(), query, order ) ); oldPlan = mps->cachedPlanExplainSummary(); } cursor = getOptimizedCursor( ns.c_str(), query, order, QueryPlanSelectionPolicy::any(), pq_shared, false, &queryPlan ); verify( cursor ); // Tailable cursors must be marked as such before any use. This is so that // the implementation knows that uncommitted data cannot be returned. if ( tailable ) { cursor->setTailable(); } scoped_ptr<QueryResponseBuilder> queryResponseBuilder ( QueryResponseBuilder::make( pq, cursor, queryPlan, oldPlan ) ); bool saveClientCursor = false; int options = QueryOption_NoCursorTimeout; if (pq.hasOption( QueryOption_OplogReplay )) { options |= QueryOption_OplogReplay; } // create a client cursor that does not create a cursorID. // The cursor ID will be created if and only if we save // the client cursor further below ClientCursor::Holder ccPointer( new ClientCursor( options, cursor, ns, BSONObj(), false, false ) ); // for oplog cursors, we check if we are reading data that is too old and might // be stale. bool opChecked = false; bool slaveLocationUpdated = false; BSONObj last; bool lastBSONObjSet = false; for ( ; cursor->ok(); cursor->advance() ) { if ( pq.getMaxScan() && cursor->nscanned() > pq.getMaxScan() ) { break; } if ( !queryResponseBuilder->addMatch() ) { continue; } // Note slave's position in the oplog. if ( pq.hasOption( QueryOption_OplogReplay ) ) { BSONObj current = cursor->current(); last = current.copy(); lastBSONObjSet = true; // the first row returned is equal to the last element that // the slave has synced up to, so we might as well update // the slave location if (!slaveLocationUpdated) { ccPointer->updateSlaveLocation(curop); slaveLocationUpdated = true; } // check if data we are about to return may be too stale if (!opChecked) { ccPointer->storeOpForSlave(current); uassert(16785, "oplog cursor reading data that is too old", !ccPointer->lastOpForSlaveTooOld()); opChecked = true; } } if ( pq.isExplain() ) { if ( queryResponseBuilder->enoughTotalResults() ) { break; } } else if ( queryResponseBuilder->enoughForFirstBatch() ) { // if only 1 requested, no cursor saved for efficiency...we assume it is findOne() if ( pq.wantMore() && pq.getNumToReturn() != 1 ) { queryResponseBuilder->finishedFirstBatch(); if ( cursor->advance() ) { saveClientCursor = true; } } break; } } // If the tailing request succeeded if ( cursor->tailable() ) { saveClientCursor = true; } if ( ! shardingState.getVersion( ns ).isWriteCompatibleWith( shardingVersionAtStart ) ) { // if the version changed during the query // we might be missing some data // and its safe to send this as mongos can resend // at this point throw SendStaleConfigException( ns , "version changed during initial query", shardingVersionAtStart, shardingState.getVersion( ns ) ); } int nReturned = queryResponseBuilder->handoff( result ); ccPointer.reset(); long long cursorid = 0; if ( saveClientCursor ) { // Create a new ClientCursor, with a default timeout. ccPointer.reset( new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned(), inMultiStatementTxn ) ); cursorid = ccPointer->cursorid(); DEV tlog(2) << "query has more, cursorid: " << cursorid << endl; if ( !ccPointer->ok() && ccPointer->c()->tailable() ) { DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl; } if( queryOptions & QueryOption_Exhaust ) { curop.debug().exhaust = true; } // Set attributes for getMore. ccPointer->setChunkManager( queryResponseBuilder->chunkManager() ); ccPointer->setPos( nReturned ); ccPointer->pq = pq_shared; ccPointer->fields = pq.getFieldPtr(); if (pq.hasOption( QueryOption_OplogReplay ) && lastBSONObjSet) { ccPointer->storeOpForSlave(last); } if (!inMultiStatementTxn) { // This cursor is not part of a multi-statement transaction, so // we pass off the current client's transaction stack to the // cursor so that it may be live as long as the cursor. cc().swapTransactionStack(ccPointer->transactions); verify(!cc().hasTxn()); } ccPointer.release(); } QueryResult *qr = (QueryResult *) result.header(); qr->cursorId = cursorid; curop.debug().cursorid = ( cursorid == 0 ? -1 : qr->cursorId ); qr->setResultFlagsToOk(); // qr->len is updated automatically by appendData() curop.debug().responseLength = qr->len; qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = nReturned; curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL ); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = nReturned; return saveClientCursor; }
QueryResult* processGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized ) { exhaust = false; ClientCursor::Pin p(cursorid); ClientCursor *client_cursor = p.c(); int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce; BufBuilder b( bufSize ); b.skip(sizeof(QueryResult)); int resultFlags = ResultFlag_AwaitCapable; int start = 0; int n = 0; if ( unlikely(!client_cursor) ) { LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl; cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // check for spoofing of the ns such that it does not match the one originally there for the cursor uassert(14833, "auth error", str::equals(ns, client_cursor->ns().c_str())); uassert(16784, "oplog cursor reading data that is too old", !client_cursor->lastOpForSlaveTooOld()); int queryOptions = client_cursor->queryOptions(); OpSettings settings; settings.setBulkFetch(true); settings.setQueryCursorMode(DEFAULT_LOCK_CURSOR); settings.setCappedAppendPK(queryOptions & QueryOption_AddHiddenPK); cc().setOpSettings(settings); // Check if the cursor is part of a multi-statement transaction. If it is // and this is not the right client (meaning the current transaction stack // does not match that in the cursor), it will uassert. If the cursor is // not part of a multi-statement transaction, then we need to use the stack // in the cursor for this scope. const bool cursorPartOfMultiStatementTxn = client_cursor->checkMultiStatementTxn(); scoped_ptr<Client::WithTxnStack> wts; if (!cursorPartOfMultiStatementTxn) { // For simplicity, prevent multi-statement transactions from // reading cursors it didn't create. uassert(16813, "Cannot getMore() on a cursor not created by this multi-statement transaction", !cc().hasTxn()); wts.reset(new Client::WithTxnStack(client_cursor->transactions)); } *isCursorAuthorized = true; if (pass == 0) { client_cursor->updateSlaveLocation( curop ); } curop.debug().query = client_cursor->query(); start = client_cursor->pos(); Cursor *c = client_cursor->c(); // This manager may be stale, but it's the state of chunking when the cursor was created. ShardChunkManagerPtr manager = client_cursor->getChunkManager(); while ( 1 ) { if ( !c->ok() ) { if ( c->tailable() ) { /* when a tailable cursor hits "EOF", ok() goes false, and current() is null. however advance() can still be retries as a reactivation attempt. when there is new data, it will return true. that's what we are doing here. */ if ( c->advance() ) continue; if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) { return 0; } break; } p.release(); // Done with this cursor, steal transaction stack back to commit or abort it here. bool ok = ClientCursor::erase(cursorid); verify(ok); cursorid = 0; client_cursor = 0; break; } MatchDetails details; if ( client_cursor->fields && client_cursor->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) { // field projection specified, and contains an array operator details.requestElemMatchKey(); } // in some cases (clone collection) there won't be a matcher if ( !c->currentMatches( &details ) ) { } else if ( manager && ! manager->belongsToMe( client_cursor ) ){ LOG(2) << "cursor skipping document in un-owned chunk: " << c->current() << endl; } else { if( c->getsetdup(c->currPK()) ) { //out() << " but it's a dup \n"; } else { // save this so that at the end of the loop, // we can update the location for write concern // in replication. Note that if this cursor is not // doing replication, this is pointless if ( client_cursor->queryOptions() & QueryOption_OplogReplay ) { client_cursor->storeOpForSlave( c->current() ); } n++; client_cursor->fillQueryResultFromObj( b, &details ); if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) { c->advance(); client_cursor->incPos( n ); break; } } } c->advance(); } if ( client_cursor ) { client_cursor->resetIdleAge(); exhaust = client_cursor->queryOptions() & QueryOption_Exhaust; } else if (!cursorPartOfMultiStatementTxn) { // This cursor is done and it wasn't part of a multi-statement // transaction. We can commit the transaction now. cc().commitTopTxn(); wts->release(); } } QueryResult *qr = (QueryResult *) b.buf(); qr->len = b.len(); qr->setOperation(opReply); qr->_resultFlags() = resultFlags; qr->cursorId = cursorid; qr->startingFrom = start; qr->nReturned = n; b.decouple(); return qr; }
std::string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult)); BSONObjBuilder cmdResBuf; if (!runCommands(ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; result.setData(qr, true); return ""; } // This is a read lock. We require this because if we're parsing a $where, the // where-specific parsing code assumes we have a lock and creates execution machinery that // requires it. Client::ReadContext ctx(q.ns); Collection* collection = ctx.ctx().db()->getCollection( ns ); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize(q, &cq); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } verify(cq); QLOG() << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that's is so (a runner) which doesn't return results, the EOFRunner. // // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (collection == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(collection, cq, &rawRunner); } else { // Takes ownership of cq. size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { // NOTE: Do not access cq as getRunner has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. auto_ptr<ScopedRunnerRegistration> safety(new ScopedRunnerRegistration(runner.get())); runner->setYieldPolicy(Runner::YIELD_AUTO); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); // Have we retrieved info about which plan the runner will // use to execute the query yet? bool gotPlanInfo = false; PlanInfo* rawInfo; boost::scoped_ptr<PlanInfo> planInfo; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // In the case of the multi plan runner, we may not be able to // successfully retrieve plan info until after the query starts // to run. This is because the multi plan runner doesn't know what // plan it will end up using until it runs candidates and selects // the best. // // TODO: Do we ever want to output what the MPR is comparing? if (!gotPlanInfo) { Status infoStatus = runner->getInfo(NULL, &rawInfo); if (infoStatus.isOK()) { gotPlanInfo = true; planInfo.reset(rawInfo); // planSummary is really a ThreadSafeString which copies the data from // the provided pointer. curop.debug().planSummary = planInfo->planSummary.c_str(); } } // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // Try to get information about the plan which the runner // will use to execute the query, it we don't have it already. if (!gotPlanInfo) { Status infoStatus = runner->getInfo(NULL, &rawInfo); if (infoStatus.isOK()) { gotPlanInfo = true; planInfo.reset(rawInfo); // planSummary is really a ThreadSafeString which copies the data from // the provided pointer. curop.debug().planSummary = planInfo->planSummary.c_str(); } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases. if (Runner::RUNNER_ERROR == state) { TypeExplain* bareExplain; Status res = runner->getInfo(&bareExplain, NULL); if (res.isOK()) { boost::scoped_ptr<TypeExplain> errorExplain(bareExplain); error() << "Runner error, stats:\n" << errorExplain->stats.jsonString(Strict, true); } uasserted(17144, "Runner error: " + WorkingSetCommon::toStatusString(obj)); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Used to fill in explain and to determine if the query is slow enough to be logged. int elapsedMillis = curop.elapsedMillis(); // Get explain information if: // 1) it is needed by an explain query; // 2) profiling is enabled; or // 3) profiling is disabled but we still need explain details to log a "slow" query. // Producing explain information is expensive and should be done only if we are certain // the information will be used. boost::scoped_ptr<TypeExplain> explain(NULL); if (isExplain || ctx.ctx().db()->getProfilingLevel() > 0 || elapsedMillis > serverGlobalParams.slowMS) { // Ask the runner to produce explain information. TypeExplain* bareExplain; Status res = runner->getInfo(&bareExplain, NULL); if (res.isOK()) { explain.reset(bareExplain); } else if (isExplain) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } } // Fill in the missing run-time fields in explain, starting with propeties of // the process running the query. if (isExplain && NULL != explain.get()) { std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(elapsedMillis); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection, runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "Not caching runner but returning " << numResults << " results.\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; // Set debug information for consumption by the profiler. curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; if (NULL != explain.get()) { if (explain->isScanAndOrderSet()) { curop.debug().scanAndOrder = explain->getScanAndOrder(); } else { curop.debug().scanAndOrder = false; } if (explain->isNScannedSet()) { curop.debug().nscanned = explain->getNScanned(); } if (explain->isNScannedObjectsSet()) { curop.debug().nscannedObjects = explain->getNScannedObjects(); } if (explain->isIDHackSet()) { curop.debug().idhack = explain->getIDHack(); } if (!explain->stats.isEmpty()) { // execStats is a CachedBSONObj because it lives in the race-prone // curop. curop.debug().execStats.set(explain->stats); // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj. if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) { BSONObjBuilder bob; bob.append("summary", curop.debug().planSummary.toString()); curop.debug().execStats.set(bob.done()); } } } // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
/** * This is called by db/ops/query.cpp. This is the entry point for answering a query. */ std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) { QLOG() << "Running query on new system: " << cq->toString(); // This is a read lock. Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // Need to call cq->toString() now, since upon error getRunner doesn't guarantee // cq is in a consistent state. string cqStr = cq->toString(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that's is so (a runner) which doesn't return results, the EOFRunner. // // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(cq, &rawRunner); } else { // Takes ownership of cq. size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { uasserted(17007, "Couldn't get runner for query because: " + status.reason() + " query is " + cqStr); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. ClientCursor::registerRunner(runner.get()); runner->setYieldPolicy(Runner::YIELD_AUTO); auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety( new DeregisterEvenIfUnderlyingCodeThrows(runner.get())); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases: // * in-memory sort using too much RAM. if (Runner::RUNNER_ERROR == state) { uasserted(17144, "Runner error, memory limit for sort probably exceeded"); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Append explain information to query results by asking the runner to produce them. if (isExplain) { TypeExplain* bareExplain; Status res = runner->getExplainPlan(&bareExplain); if (!res.isOK()) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } else { boost::scoped_ptr<TypeExplain> explain(bareExplain); // Fill in the missing run-time fields in explain, starting with propeties of // the process running the query. std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(curop.elapsedMillis()); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "not caching runner but returning " << numResults << " results\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) { bool ok = true; MSGID responseTo = m.header()->id; DbMessage d(m); QueryMessage q(d); auto_ptr< Message > resp( new Message() ); CurOp& op = *(c.curop()); shared_ptr<AssertionException> ex; try { dbresponse.exhaust = runQuery(m, q, op, *resp); assert( !resp->empty() ); } catch ( SendStaleConfigException& e ){ ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg ) ); ok = false; } catch ( AssertionException& e ) { ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; } if( ex ){ op.debug().exceptionInfo = ex->getInfo(); LOGWITHRATELIMIT { log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" << (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl; if( q.ntoskip || q.ntoreturn ) log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl; } SendStaleConfigException* scex = NULL; if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() ); BSONObjBuilder err; ex->getInfo().append( err ); if( scex ) err.append( "ns", scex->getns() ); BSONObj errObj = err.done(); log() << errObj << endl; BufBuilder b; b.skip(sizeof(QueryResult)); b.appendBuf((void*) errObj.objdata(), errObj.objsize()); // todo: call replyToQuery() from here instead of this!!! see dbmessage.h QueryResult * msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->_resultFlags() = ResultFlag_ErrSet; if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; resp.reset( new Message() ); resp->setData( msgdata, true ); } op.debug().responseLength = resp->header()->dataLen(); dbresponse.response = resp.release(); dbresponse.responseTo = responseTo; return ok; }
bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) { DEV verify( shardingState.enabled() ); int op = m.operation(); if ( op < 2000 || op >= 3000 || op == dbGetMore // cursors are weird ) return false; DbMessage d(m); const char *ns = d.getns(); string errmsg; // We don't care about the version here, since we're returning it later in the writeback ChunkVersion received, wanted; if ( shardVersionOk( ns , errmsg, received, wanted ) ) { return false; } bool getsAResponse = doesOpGetAResponse( op ); LOG(1) << "connection sharding metadata does not match for collection " << ns << ", will retry (wanted : " << wanted << ", received : " << received << ")" << ( getsAResponse ? "" : " (queuing writeback)" ) << endl; if( getsAResponse ){ verify( dbresponse ); BufBuilder b( 32768 ); b.skip( sizeof( QueryResult ) ); { BSONObjBuilder bob; bob.append( "$err", errmsg ); bob.append( "ns", ns ); wanted.addToBSON( bob, "vWanted" ); received.addToBSON( bob, "vReceived" ); BSONObj obj = bob.obj(); b.appendBuf( obj.objdata() , obj.objsize() ); } QueryResult *qr = (QueryResult*)b.buf(); qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation( opReply ); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; b.decouple(); Message * resp = new Message(); resp->setData( qr , true ); dbresponse->response = resp; dbresponse->responseTo = m.header()->id; return true; } uassert(9517, "cannot queue a writeback operation to the writeback queue", (d.reservedField() & Reserved_FromWriteback) == 0); const OID& clientID = ShardedConnectionInfo::get(false)->getID(); massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() ); // We need to check this here, since otherwise we'll get errors wrapping the writeback - // not just here, but also when returning as a command result. // We choose 1/2 the overhead of the internal maximum so that we can still handle ops of // 16MB exactly. massert( 16437, "data size of operation is too large to queue for writeback", m.dataSize() < BSONObjMaxInternalSize - (8 * 1024)); LOG(1) << "writeback queued for " << m.toString() << endl; BSONObjBuilder b; b.appendBool( "writeBack" , true ); b.append( "ns" , ns ); b.append( "connectionId" , cc().getConnectionId() ); b.append( "instanceIdent" , prettyHostName() ); wanted.addToBSON( b ); received.addToBSON( b, "yourVersion" ); b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) ); LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl; // we pass the builder to queueWriteBack so that it can select the writebackId // this is important so that the id is guaranteed to be ascending // that is important since mongos assumes if its seen a greater writeback // that all former have been processed OID writebackID = writeBackManager.queueWriteBack( clientID.str() , b ); lastError.getSafe()->writeback( writebackID ); return true; }
QueryResult* getMore(const char *ns, int ntoreturn, long long cursorid , stringstream& ss) { ClientCursor *cc = ClientCursor::find(cursorid); int bufSize = 512; if ( cc ){ bufSize += sizeof( QueryResult ); bufSize += ( ntoreturn ? 4 : 1 ) * 1024 * 1024; } BufBuilder b( bufSize ); b.skip(sizeof(QueryResult)); int resultFlags = 0; int start = 0; int n = 0; if ( !cc ) { log() << "getMore: cursorid not found " << ns << " " << cursorid << endl; cursorid = 0; resultFlags = QueryResult::ResultFlag_CursorNotFound; } else { ss << " query: " << cc->query << " "; start = cc->pos; Cursor *c = cc->c.get(); c->checkLocation(); while ( 1 ) { if ( !c->ok() ) { if ( c->tailable() ) { if ( c->advance() ) { continue; } break; } bool ok = ClientCursor::erase(cursorid); assert(ok); cursorid = 0; cc = 0; break; } if ( !cc->matcher->matches(c->currKey(), c->currLoc() ) ) { } else { //out() << "matches " << c->currLoc().toString() << '\n'; if( c->getsetdup(c->currLoc()) ) { //out() << " but it's a dup \n"; } else { BSONObj js = c->current(); fillQueryResultFromObj(b, cc->filter.get(), js); n++; if ( (ntoreturn>0 && (n >= ntoreturn || b.len() > MaxBytesToReturnToClientAtOnce)) || (ntoreturn==0 && b.len()>1*1024*1024) ) { c->advance(); cc->pos += n; //cc->updateLocation(); break; } } } c->advance(); } if ( cc ) { cc->updateLocation(); cc->mayUpgradeStorage(); } } QueryResult *qr = (QueryResult *) b.buf(); qr->len = b.len(); qr->setOperation(opReply); qr->resultFlags() = resultFlags; qr->cursorId = cursorid; qr->startingFrom = start; qr->nReturned = n; b.decouple(); return qr; }
bool handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) { if ( ! shardingState.enabled() ) return false; int op = m.operation(); if ( op < 2000 || op >= 3000 || op == dbGetMore // cursors are weird ) return false; DbMessage d(m); const char *ns = d.getns(); string errmsg; if ( shardVersionOk( ns , opIsWrite( op ) , errmsg ) ) { return false; } log(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl; if ( doesOpGetAResponse( op ) ) { assert( dbresponse ); BufBuilder b( 32768 ); b.skip( sizeof( QueryResult ) ); { BSONObj obj = BSON( "$err" << errmsg ); b.appendBuf( obj.objdata() , obj.objsize() ); } QueryResult *qr = (QueryResult*)b.buf(); qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation( opReply ); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; b.decouple(); Message * resp = new Message(); resp->setData( qr , true ); dbresponse->response = resp; dbresponse->responseTo = m.header()->id; return true; } OID writebackID; writebackID.init(); lastError.getSafe()->writeback( writebackID ); const OID& clientID = ShardedConnectionInfo::get(false)->getID(); massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() ); log(1) << "got write with an old config - writing back ns: " << ns << endl; if ( logLevel ) log(1) << debugString( m ) << endl; BSONObjBuilder b; b.appendBool( "writeBack" , true ); b.append( "ns" , ns ); b.append( "id" , writebackID ); b.append( "connectionId" , cc().getConnectionId() ); b.appendTimestamp( "version" , shardingState.getVersion( ns ) ); b.appendTimestamp( "yourVersion" , ShardedConnectionInfo::get( true )->getVersion( ns ) ); b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) ); log(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl; writeBackManager.queueWriteBack( clientID.str() , b.obj() ); return true; }
/** * Run a query with a cursor provided by the query optimizer, or FindingStartCursor. * @yields the db lock. */ const char *queryWithQueryOptimizer( Message &m, int queryOptions, const char *ns, const BSONObj &jsobj, CurOp& curop, const BSONObj &query, const BSONObj &order, const shared_ptr<ParsedQuery> &pq_shared, const BSONObj &oldPlan, const ConfigVersion &shardingVersionAtStart, Message &result ) { const ParsedQuery &pq( *pq_shared ); shared_ptr<Cursor> cursor; QueryPlanSummary queryPlan; if ( pq.hasOption( QueryOption_OplogReplay ) ) { cursor = FindingStartCursor::getCursor( ns, query, order ); } else { cursor = NamespaceDetailsTransient::getCursor( ns, query, order, QueryPlanSelectionPolicy::any(), 0, pq_shared, &queryPlan ); } verify( cursor ); QueryResponseBuilder queryResponseBuilder( pq, cursor, queryPlan, oldPlan ); bool saveClientCursor = false; const char *exhaust = 0; OpTime slaveReadTill; ClientCursor::CleanupPointer ccPointer; ccPointer.reset( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) ); for( ; cursor->ok(); cursor->advance() ) { bool yielded = false; if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered, &yielded ) || !cursor->ok() ) { cursor.reset(); queryResponseBuilder.noteYield(); // !!! TODO The queryResponseBuilder still holds cursor. Currently it will not do // anything unsafe with the cursor in handoff(), but this is very fragile. // // We don't fail the query since we're fine with returning partial data if the // collection was dropped. // NOTE see SERVER-2454. // TODO This is wrong. The cursor could be gone if the closeAllDatabases command // just ran. break; } if ( yielded ) { queryResponseBuilder.noteYield(); } if ( pq.getMaxScan() && cursor->nscanned() > pq.getMaxScan() ) { break; } if ( !queryResponseBuilder.addMatch() ) { continue; } // Note slave's position in the oplog. if ( pq.hasOption( QueryOption_OplogReplay ) ) { BSONObj current = cursor->current(); BSONElement e = current["ts"]; if ( e.type() == Date || e.type() == Timestamp ) { slaveReadTill = e._opTime(); } } if ( !cursor->supportGetMore() || pq.isExplain() ) { if ( queryResponseBuilder.enoughTotalResults() ) { break; } } else if ( queryResponseBuilder.enoughForFirstBatch() ) { // if only 1 requested, no cursor saved for efficiency...we assume it is findOne() if ( pq.wantMore() && pq.getNumToReturn() != 1 ) { queryResponseBuilder.finishedFirstBatch(); if ( cursor->advance() ) { saveClientCursor = true; } } break; } } if ( cursor ) { if ( pq.hasOption( QueryOption_CursorTailable ) && pq.getNumToReturn() != 1 ) { cursor->setTailable(); } // If the tailing request succeeded. if ( cursor->tailable() ) { saveClientCursor = true; } } if ( shardingState.getVersion( ns ) != shardingVersionAtStart ) { // if the version changed during the query // we might be missing some data // and its safe to send this as mongos can resend // at this point throw SendStaleConfigException( ns , "version changed during initial query", shardingVersionAtStart, shardingState.getVersion( ns ) ); } int nReturned = queryResponseBuilder.handoff( result ); ccPointer.reset(); long long cursorid = 0; if ( saveClientCursor ) { // Create a new ClientCursor, with a default timeout. ccPointer.reset( new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() ) ); cursorid = ccPointer->cursorid(); DEV tlog(2) << "query has more, cursorid: " << cursorid << endl; if ( cursor->supportYields() ) { ClientCursor::YieldData data; ccPointer->prepareToYield( data ); } else { ccPointer->c()->noteLocation(); } // !!! Save the original message buffer, so it can be referenced in getMore. ccPointer->originalMessage = m; // Save slave's position in the oplog. if ( pq.hasOption( QueryOption_OplogReplay ) && !slaveReadTill.isNull() ) { ccPointer->slaveReadTill( slaveReadTill ); } if ( !ccPointer->ok() && ccPointer->c()->tailable() ) { DEV tlog() << "query has no more but tailable, cursorid: " << cursorid << endl; } if( queryOptions & QueryOption_Exhaust ) { exhaust = ns; curop.debug().exhaust = true; } // Set attributes for getMore. ccPointer->setChunkManager( queryResponseBuilder.chunkManager() ); ccPointer->setPos( nReturned ); ccPointer->pq = pq_shared; ccPointer->fields = pq.getFieldPtr(); ccPointer.release(); } QueryResult *qr = (QueryResult *) result.header(); qr->cursorId = cursorid; curop.debug().cursorid = ( cursorid == 0 ? -1 : qr->cursorId ); qr->setResultFlagsToOk(); // qr->len is updated automatically by appendData() curop.debug().responseLength = qr->len; qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = nReturned; int duration = curop.elapsedMillis(); bool dbprofile = curop.shouldDBProfile( duration ); if ( dbprofile || duration >= cmdLine.slowMS ) { curop.debug().nscanned = (int)( cursor ? cursor->nscanned() : 0 ); curop.debug().ntoskip = pq.getSkip(); } curop.debug().nreturned = nReturned; return exhaust; }