static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ){ bool ok = true; MSGID responseTo = m.header()->id; DbMessage d(m); QueryMessage q(d); auto_ptr< Message > resp( new Message() ); CurOp& op = *(c.curop()); try { dbresponse.exhaust = runQuery(m, q, op, *resp); assert( !resp->empty() ); } catch ( AssertionException& e ) { ok = false; op.debug().str << " exception "; LOGSOME { log() << "assertion " << e.toString() << " ns:" << q.ns << " query:" << (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl; if( q.ntoskip || q.ntoreturn ) log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl; } BSONObjBuilder err; e.getInfo().append( err ); BSONObj errObj = err.done(); BufBuilder b; b.skip(sizeof(QueryResult)); b.appendBuf((void*) errObj.objdata(), errObj.objsize()); // todo: call replyToQuery() from here instead of this!!! see dbmessage.h QueryResult * msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->_resultFlags() = ResultFlag_ErrSet; if ( e.getCode() == StaleConfigInContextCode ) qr->_resultFlags() |= ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; resp.reset( new Message() ); resp->setData( msgdata, true ); } if ( op.shouldDBProfile( 0 ) ){ op.debug().str << " bytes:" << resp->header()->dataLen(); } dbresponse.response = resp.release(); dbresponse.responseTo = responseTo; return ok; }
/**
 * Intercepts a message whose shard version check fails.
 *
 * Returns false (message not handled here) when no shard config server is set, when
 * the opcode is outside [2000, 3000), or when shardVersionOk() accepts the namespace.
 * Otherwise the message is consumed: operations that get a response receive an error
 * reply flagged ErrSet | ShardConfigStale, and all other operations are pushed onto
 * the client's write-back queue.
 *
 * @param m          the incoming message.
 * @param dbresponse receives the error reply for operations that get a response.
 * @return true if the message was handled here, false if the caller should process it.
 */
bool handlePossibleShardedMessage( Message &m, DbResponse &dbresponse ) {
    if ( shardConfigServer.empty() )
        return false;

    const int opCode = m.data->operation();
    const bool opInRange = ( opCode >= 2000 && opCode < 3000 );
    if ( !opInRange )
        return false;

    // The namespace string is read from 4 bytes into the message payload.
    const char* ns = m.data->_data + 4;
    string errmsg;
    if ( shardVersionOk( ns , errmsg ) )
        return false;

    log() << "shardVersionOk failed ns:" << ns << " " << errmsg << endl;

    if ( !doesOpGetAResponse( opCode ) ) {
        // No reply channel for this operation: queue the raw message for write-back.
        OID* clientID = clientServerIds.get();
        massert( 10422 , "write with bad shard config and no server id!" , clientID );

        log() << "got write with an old config - writing back" << endl;

        BSONObjBuilder writeBack;
        writeBack.appendBool( "writeBack" , true );
        writeBack.append( "ns" , ns );
        writeBack.appendBinData( "msg" , m.data->len , bdtCustom , reinterpret_cast<char*>( m.data ) );

        log() << "writing back msg with len: " << m.data->len << " op: " << m.data->_operation << endl;

        clientQueues[clientID->str()]->push( writeBack.obj() );
        return true;
    }

    // Build a one-document "$err" reply directly behind a QueryResult header.
    BufBuilder out( 32768 );
    out.skip( sizeof( QueryResult ) );
    {
        BSONObj errDoc = BSON( "$err" << errmsg );
        out.append( errDoc.objdata() , errDoc.objsize() );
    }

    QueryResult* header = reinterpret_cast<QueryResult*>( out.buf() );
    header->_resultFlags() = QueryResult::ResultFlag_ErrSet | QueryResult::ResultFlag_ShardConfigStale;
    header->len = out.len();
    header->setOperation( opReply );
    header->cursorId = 0;
    header->startingFrom = 0;
    header->nReturned = 1;
    out.decouple();

    Message* reply = new Message();
    reply->setData( header , true );

    dbresponse.response = reply;
    dbresponse.responseTo = m.data->id;
    return true;
}
void replyToQuery(int queryResultFlags, Message &m, DbResponse &dbresponse, BSONObj obj) { BufBuilder b; b.skip(sizeof(QueryResult)); b.appendBuf((void*) obj.objdata(), obj.objsize()); QueryResult* msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->_resultFlags() = queryResultFlags; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; Message *resp = new Message(); resp->setData(msgdata, true); // transport will free dbresponse.response = resp; dbresponse.responseTo = m.header()->id; }
void replyToQuery( int queryResultFlags, Message& response, const BSONObj& resultObj ) { BufBuilder bufBuilder; bufBuilder.skip( sizeof( QueryResult )); bufBuilder.appendBuf( reinterpret_cast< void *>( const_cast< char* >( resultObj.objdata() )), resultObj.objsize() ); QueryResult* queryResult = reinterpret_cast< QueryResult* >( bufBuilder.buf() ); bufBuilder.decouple(); queryResult->_resultFlags() = queryResultFlags; queryResult->len = bufBuilder.len(); queryResult->setOperation( opReply ); queryResult->cursorId = 0; queryResult->startingFrom = 0; queryResult->nReturned = 1; response.setData( queryResult, true ); // transport will free }
void replyToQuery(int queryResultFlags, AbstractMessagingPort* p, Message& requestMsg, void *data, int size, int nReturned, int startingFrom, long long cursorId ) { BufBuilder b(32768); b.skip(sizeof(QueryResult)); b.appendBuf(data, size); QueryResult *qr = (QueryResult *) b.buf(); qr->_resultFlags() = queryResultFlags; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = cursorId; qr->startingFrom = startingFrom; qr->nReturned = nReturned; b.decouple(); Message resp(qr, true); p->reply(requestMsg, resp, requestMsg.header()->id); }
bool handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) { if ( ! shardingState.enabled() ) return false; int op = m.operation(); if ( op < 2000 || op >= 3000 || op == dbGetMore // cursors are weird ) return false; DbMessage d(m); const char *ns = d.getns(); string errmsg; if ( shardVersionOk( ns , opIsWrite( op ) , errmsg ) ) { return false; } log(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(op) << ") " << errmsg << endl; if ( doesOpGetAResponse( op ) ) { assert( dbresponse ); BufBuilder b( 32768 ); b.skip( sizeof( QueryResult ) ); { BSONObj obj = BSON( "$err" << errmsg ); b.appendBuf( obj.objdata() , obj.objsize() ); } QueryResult *qr = (QueryResult*)b.buf(); qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation( opReply ); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; b.decouple(); Message * resp = new Message(); resp->setData( qr , true ); dbresponse->response = resp; dbresponse->responseTo = m.header()->id; return true; } OID writebackID; writebackID.init(); lastError.getSafe()->writeback( writebackID ); const OID& clientID = ShardedConnectionInfo::get(false)->getID(); massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() ); log(1) << "got write with an old config - writing back ns: " << ns << endl; if ( logLevel ) log(1) << debugString( m ) << endl; BSONObjBuilder b; b.appendBool( "writeBack" , true ); b.append( "ns" , ns ); b.append( "id" , writebackID ); b.append( "connectionId" , cc().getConnectionId() ); b.appendTimestamp( "version" , shardingState.getVersion( ns ) ); b.appendTimestamp( "yourVersion" , ShardedConnectionInfo::get( true )->getVersion( ns ) ); b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) ); log(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl; writeBackManager.queueWriteBack( clientID.str() , b.obj() ); return true; }
/**
 * Also called by db/ops/query.cpp.  This is the new getMore entry point.
 *
 * Pulls the next batch of results for 'cursorid' out of the cursor's Runner and packs them
 * behind a QueryResult header in a newly built buffer.
 *
 * @param ns                 namespace the request names; uasserts (17011) if it differs from
 *                           the namespace stored on the cursor.
 * @param ntoreturn          maximum number of documents in this batch; 0 disables the count
 *                           limit, leaving only the MaxBytesToReturnToClientAtOnce size cap.
 * @param cursorid           id of the cursor to continue.
 * @param curop              current operation; given the cursor's leftover time limit, and its
 *                           remaining time is stored back on the cursor if the cursor is kept.
 * @param pass               caller-supplied attempt counter: the slave location is updated only
 *                           on pass 0, and the await-data early return applies while pass < 1000.
 * @param exhaust            out: false unless the cursor is kept and has QueryOption_Exhaust.
 * @param isCursorAuthorized out: set to true once the namespace check has passed.
 * @return the reply header (followed by the result documents), or 0 when a tailable
 *         await-data cursor reached EOF without producing any result.
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;

    // Reserve room for the reply header; result documents are appended right after it.
    int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;
    BufBuilder bb(bufSize);
    bb.skip(sizeof(QueryResult));

    // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
    Client::ReadContext ctx(ns);

    QLOG() << "running getMore in new system, cursorid " << cursorid << endl;

    // This checks to make sure the operation is allowed on a replicated node.  Since we are not
    // passing in a query object (necessary to check SlaveOK query option), the only state where
    // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
    // reads are not okay.
    replVerifyReadsOk();

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    ClientCursorPin ccPin(cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;
    int numResults = 0;
    int startingResult = 0;

    if (NULL == cc) {
        // Unknown cursor: reply with cursor id 0 and the CursorNotFound flag.
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Quote: check for spoofing of the ns such that it does not match the one originally
        // there for the cursor
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // TODO:
        // curop.debug().query = BSONForQuery
        // curop.setQuery(curop.debug().query);

        // TODO: What is pass?
        if (0 == pass) {
            cc->updateSlaveLocation(curop);
        }

        // NOTE(review): collMetadata is fetched but not read anywhere else in this function.
        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        Runner* runner = cc->getRunner();
        const int queryOptions = cc->queryOptions();

        // Get results out of the runner.
        runner->restoreState();

        BSONObj obj;
        Runner::RunnerState state;
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // Stop once the requested count (if any) or the byte budget is reached.  'state'
            // is still RUNNER_ADVANCED when leaving through this break.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        if (Runner::RUNNER_EOF == state && 0 == numResults
            && (queryOptions & QueryOption_CursorTailable)
            && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
            // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
            // data some # of times first.
            return 0;
        }

        bool saveClientCursor = false;

        if (Runner::RUNNER_DEAD == state || Runner::RUNNER_ERROR == state) {
            // If we're dead there's no way to get more results.
            saveClientCursor = false;
            // In the old system tailable capped cursors would be killed off at the
            // cursorid level.  If a tailable capped cursor is nuked the cursorid
            // would vanish.
            //
            // In the new system they die and are cleaned up later (or time out).
            // So this is where we get to remove the cursorid.
            if (0 == numResults) {
                resultFlags = ResultFlag_CursorNotFound;
            }
        }
        else if (Runner::RUNNER_EOF == state) {
            // EOF is also end of the line unless it's tailable.
            saveClientCursor = queryOptions & QueryOption_CursorTailable;
        }
        else {
            // Only reachable when the loop stopped on a batch limit.
            verify(Runner::RUNNER_ADVANCED == state);
            saveClientCursor = true;
        }

        if (!saveClientCursor) {
            ccPin.deleteUnderlying();
            // cc is now invalid, as is the runner
            cursorid = 0;
            cc = NULL;
            QLOG() << "getMore NOT saving client cursor, ended w/state "
                   << Runner::statestr(state)
                   << endl;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            runner->saveState();
            QLOG() << "getMore saving client cursor ended w/state "
                   << Runner::statestr(state)
                   << endl;

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the reply header that sits at the front of the buffer.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    // NOTE(review): decouple() presumably detaches the buffer from bb so that qr stays valid
    // after bb goes out of scope -- confirm against BufBuilder.
    bb.decouple();
    QLOG() << "getMore returned " << numResults << " results\n";
    return qr;
}
/**
 * Produces the next batch of results for an existing cursor.
 *
 * Cursors that are backed by a Runner are forwarded to newGetMore(); all others are
 * iterated here through their Cursor.
 *
 * @param ns                 namespace the request names; uasserts (14833) if it differs from
 *                           the namespace stored on the cursor.
 * @param ntoreturn          maximum number of documents in this batch; 0 disables the count
 *                           limit, leaving only the MaxBytesToReturnToClientAtOnce size cap.
 * @param cursorid           id of the cursor to continue.
 * @param curop              current operation; receives the cursor's query and leftover time
 *                           limit.
 * @param pass               caller-supplied attempt counter: the slave location is updated only
 *                           on pass 0, and the await-data early return applies while pass < 1000.
 * @param exhaust            out: false unless the cursor is kept and has QueryOption_Exhaust.
 * @param isCursorAuthorized out: set to true once the namespace check has passed.
 * @return the reply header (followed by the result documents), or 0 when a tailable
 *         await-data cursor has no new data yet.
 */
QueryResult* processGetMore(const char* ns,
                            int ntoreturn,
                            long long cursorid,
                            CurOp& curop,
                            int pass,
                            bool& exhaust,
                            bool* isCursorAuthorized ) {

    bool hasRunner = false;

    // Scoped to kill the pin after seeing if the runner's there.
    {
        // See if there's a runner.  We do this until agg. is behind a Runner instead of a CC.
        ClientCursorPin p(cursorid);
        ClientCursor *cc = p.c();
        if (NULL != cc && NULL != cc->getRunner()) {
            hasRunner = true;
        }
    }

    if (hasRunner) {
        return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust,
                          isCursorAuthorized);
    }

    exhaust = false;

    // Reserve room for the reply header; result documents are appended right after it.
    int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;

    BufBuilder b( bufSize );
    b.skip(sizeof(QueryResult));

    int resultFlags = ResultFlag_AwaitCapable;
    int start = 0;
    int n = 0;

    // The read context is heap-allocated so it can be dropped early (ctx.reset() below).
    scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns));
    // call this readlocked so state can't change
    replVerifyReadsOk();

    ClientCursorPin p(cursorid);
    ClientCursor *cc = p.c();

    if ( unlikely(!cc) ) {
        LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Some internal users create a ClientCursor with a Runner.  Don't crash if this
        // happens.  Instead, hand them off to the new framework.
        if (NULL != cc->getRunner()) {
            p.release();
            return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust,
                              isCursorAuthorized);
        }

        // check for spoofing of the ns such that it does not match the one originally there for the cursor
        uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

        *isCursorAuthorized = true;

        // This must be done after auth check to ensure proper cleanup.
        uassert(16951, "failing getmore due to set failpoint",
                !MONGO_FAIL_POINT(getMoreError));

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros( cc->getLeftoverMaxTimeMicros() );
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        if ( pass == 0 )
            cc->updateSlaveLocation( curop );

        int queryOptions = cc->queryOptions();

        curop.debug().query = cc->query();
        curop.setQuery( cc->query() );

        start = cc->pos();
        Cursor *c = cc->c();

        if (!c->requiresLock()) {
            // make sure it won't be destroyed under us
            fassert(16952, !c->shouldDestroyOnNSDeletion());
            fassert(16953, !c->supportYields());
            ctx.reset(); // unlocks
        }

        c->recoverFromYield();
        // Location of the last document added to the batch; passed to storeOpForSlave() below.
        DiskLoc last;

        // This metadata may be stale, but it's the state of chunking when the cursor was
        // created.
        CollectionMetadataPtr metadata = cc->getCollMetadata();
        KeyPattern keyPattern( metadata ? metadata->getKeyPattern() : BSONObj() );

        while ( 1 ) {
            if ( !c->ok() ) {
                if ( c->tailable() ) {
                    // when a tailable cursor hits "EOF", ok() goes false, and current() is
                    // null.  however advance() can still be retries as a reactivation attempt.
                    // when there is new data, it will return true.  that's what we are doing
                    // here.
                    if ( c->advance() )
                        continue;

                    // Nothing new yet: a 0 return lets an await-data caller try again.
                    if( n == 0 && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                        return 0;
                    }

                    break;
                }
                // Non-tailable cursor is finished: unpin it, erase it, and report cursor id 0.
                p.release();
                bool ok = ClientCursor::erase(cursorid);
                verify(ok);
                cursorid = 0;
                cc = 0;
                break;
            }

            MatchDetails details;
            if ( cc->fields && cc->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                // field projection specified, and contains an array operator
                details.requestElemMatchKey();
            }

            // in some cases (clone collection) there won't be a matcher
            if ( !c->currentMatches( &details ) ) {
                // Current document does not match; fall through to advance().
            }
            else if ( metadata && !metadata->keyBelongsToMe( extractKey(c, keyPattern ) ) ) {
                LOG(2) << "cursor skipping document in un-owned chunk: " << c->current()
                       << endl;
            }
            else {
                if( c->getsetdup(c->currLoc()) ) {
                    //out() << " but it's a dup \n";
                }
                else {
                    last = c->currLoc();
                    n++;

                    // Fill out the fields requested by the query.
                    const Projection::KeyOnly *keyFieldsOnly = c->keyFieldsOnly();
                    if ( keyFieldsOnly ) {
                        fillQueryResultFromObj( b, 0, keyFieldsOnly->hydrate(
                        c->currKey() ), &details );
                    }
                    else {
                        DiskLoc loc = c->currLoc();
                        fillQueryResultFromObj( b, cc->fields.get(), c->current(), &details,
                                ( ( cc->pq.get() && cc->pq->showDiskLoc() ) ? &loc : 0 ) );
                    }

                    // Batch is full (count or bytes): step past this document, record the
                    // new position on the cursor, and stop.
                    if ( ( ntoreturn && n >= ntoreturn ) || b.len() > MaxBytesToReturnToClientAtOnce ) {
                        c->advance();
                        cc->incPos( n );
                        break;
                    }
                }
            }
            c->advance();

            // A false return from yieldSometimes() ends the batch and the cursor is erased.
            if ( ! cc->yieldSometimes( ( c->ok() && c->keyFieldsOnly() ) ?
                                      ClientCursor::DontNeed : ClientCursor::WillNeed ) ) {
                ClientCursor::erase(cursorid);
                cursorid = 0;
                cc = 0;
                break;
            }
        }

        if ( cc ) {
            // The cursor stays alive for a later getMore.
            if ( c->supportYields() ) {
                ClientCursor::YieldData data;
                verify( cc->prepareToYield( data ) );
            }
            else {
                cc->c()->noteLocation();
            }
            cc->storeOpForSlave( last );
            exhaust = cc->queryOptions() & QueryOption_Exhaust;

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the reply header that sits at the front of the buffer.
    QueryResult *qr = (QueryResult *) b.buf();
    qr->len = b.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = start;
    qr->nReturned = n;
    // NOTE(review): decouple() presumably detaches the buffer from b so that qr stays valid
    // after b goes out of scope -- confirm against BufBuilder.
    b.decouple();

    return qr;
}
static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) { bool ok = true; MSGID responseTo = m.header()->id; DbMessage d(m); QueryMessage q(d); auto_ptr< Message > resp( new Message() ); CurOp& op = *(c.curop()); shared_ptr<AssertionException> ex; try { if (!NamespaceString(d.getns()).isCommand()) { // Auth checking for Commands happens later. Status status = cc().getAuthorizationManager()->checkAuthForQuery(d.getns()); uassert(16550, status.reason(), status.isOK()); } dbresponse.exhaustNS = runQuery(m, q, op, *resp); verify( !resp->empty() ); } catch ( SendStaleConfigException& e ){ ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg, e.getVersionReceived(), e.getVersionWanted() ) ); ok = false; } catch ( AssertionException& e ) { ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; } if( ex ){ op.debug().exceptionInfo = ex->getInfo(); LOGWITHRATELIMIT { log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" << (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl; if( q.ntoskip || q.ntoreturn ) log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl; } SendStaleConfigException* scex = NULL; if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() ); BSONObjBuilder err; ex->getInfo().append( err ); if( scex ){ err.append( "ns", scex->getns() ); scex->getVersionReceived().addToBSON( err, "vReceived" ); scex->getVersionWanted().addToBSON( err, "vWanted" ); } BSONObj errObj = err.done(); if( scex ){ log() << "stale version detected during query over " << q.ns << " : " << errObj << endl; } else{ log() << "problem detected during query over " << q.ns << " : " << errObj << endl; } BufBuilder b; b.skip(sizeof(QueryResult)); b.appendBuf((void*) errObj.objdata(), errObj.objsize()); // todo: call replyToQuery() from here instead of this!!! 
see dbmessage.h QueryResult * msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->_resultFlags() = ResultFlag_ErrSet; if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; resp.reset( new Message() ); resp->setData( msgdata, true ); } op.debug().responseLength = resp->header()->dataLen(); dbresponse.response = resp.release(); dbresponse.responseTo = responseTo; return ok; }
/**
 * Intercepts a message whose shard version check fails.
 *
 * Returns false (message not handled here) when the opcode is outside [2000, 3000) or
 * is dbGetMore, or when shardVersionOk() accepts the namespace. Otherwise the message
 * is consumed: operations that get a response receive an error reply flagged
 * ErrSet | ShardConfigStale, and all other operations are queued with the write-back
 * manager.
 *
 * @param m          the incoming message.
 * @param dbresponse receives the error reply; asserted non-null when a reply is built.
 * @return true if the message was handled here, false if the caller should process it.
 */
bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) {
    DEV assert( shardingState.enabled() );

    const int opCode = m.operation();
    if ( opCode == dbGetMore ) // cursors are weird
        return false;
    if ( opCode < 2000 || opCode >= 3000 )
        return false;

    DbMessage dbMsg(m);
    const char* ns = dbMsg.getns();
    string errmsg;
    // We don't care about the version here, since we're returning it later in the writeback
    ConfigVersion received, wanted;
    if ( shardVersionOk( ns , errmsg, received, wanted ) )
        return false;

    LOG(1) << "connection meta data too old - will retry ns:(" << ns << ") op:(" << opToString(opCode) << ") " << errmsg << endl;

    if ( !doesOpGetAResponse( opCode ) ) {
        // Refuse to queue a message that is itself marked as coming from a writeback.
        uassert( 9517 , "writeback" , ( dbMsg.reservedField() & DbMessage::Reserved_FromWriteback ) == 0 );

        // The writeback id is recorded in lastError before the client id is validated.
        OID writebackID;
        writebackID.init();
        lastError.getSafe()->writeback( writebackID );

        const OID& clientID = ShardedConnectionInfo::get(false)->getID();
        massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() );

        LOG(1) << "got write with an old config - writing back ns: " << ns << endl;
        LOG(1) << m.toString() << endl;

        BSONObjBuilder writeBack;
        writeBack.appendBool( "writeBack" , true );
        writeBack.append( "ns" , ns );
        writeBack.append( "id" , writebackID );
        writeBack.append( "connectionId" , cc().getConnectionId() );
        writeBack.append( "instanceIdent" , prettyHostName() );
        writeBack.appendTimestamp( "version" , shardingState.getVersion( ns ) );

        ShardedConnectionInfo* connInfo = ShardedConnectionInfo::get( false );
        writeBack.appendTimestamp( "yourVersion" , connInfo ? connInfo->getVersion(ns) : (ConfigVersion)0 );
        writeBack.appendBinData( "msg" , m.header()->len , bdtCustom , reinterpret_cast<char*>( m.singleData() ) );

        LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl;

        writeBackManager.queueWriteBack( clientID.str() , writeBack.obj() );
        return true;
    }

    // Build a one-document error reply directly behind a QueryResult header.
    assert( dbresponse );
    BufBuilder out( 32768 );
    out.skip( sizeof( QueryResult ) );
    {
        BSONObj errDoc = BSON( "$err" << errmsg << "ns" << ns );
        out.appendBuf( errDoc.objdata() , errDoc.objsize() );
    }

    QueryResult* header = reinterpret_cast<QueryResult*>( out.buf() );
    header->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale;
    header->len = out.len();
    header->setOperation( opReply );
    header->cursorId = 0;
    header->startingFrom = 0;
    header->nReturned = 1;
    out.decouple();

    Message* reply = new Message();
    reply->setData( header , true );

    dbresponse->response = reply;
    dbresponse->responseTo = m.header()->id;
    return true;
}
static bool receivedQuery(Client& c, DbResponse& dbresponse, Message& m ) { bool ok = true; MSGID responseTo = m.header()->id; DbMessage d(m); QueryMessage q(d); auto_ptr< Message > resp( new Message() ); CurOp& op = *(c.curop()); shared_ptr<AssertionException> ex; try { dbresponse.exhaust = runQuery(m, q, op, *resp); assert( !resp->empty() ); } catch ( SendStaleConfigException& e ){ ex.reset( new SendStaleConfigException( e.getns(), e.getInfo().msg ) ); ok = false; } catch ( AssertionException& e ) { ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; } if( ex ){ op.debug().exceptionInfo = ex->getInfo(); LOGWITHRATELIMIT { log() << "assertion " << ex->toString() << " ns:" << q.ns << " query:" << (q.query.valid() ? q.query.toString() : "query object is corrupt") << endl; if( q.ntoskip || q.ntoreturn ) log() << " ntoskip:" << q.ntoskip << " ntoreturn:" << q.ntoreturn << endl; } SendStaleConfigException* scex = NULL; if ( ex->getCode() == SendStaleConfigCode ) scex = static_cast<SendStaleConfigException*>( ex.get() ); BSONObjBuilder err; ex->getInfo().append( err ); if( scex ) err.append( "ns", scex->getns() ); BSONObj errObj = err.done(); log() << errObj << endl; BufBuilder b; b.skip(sizeof(QueryResult)); b.appendBuf((void*) errObj.objdata(), errObj.objsize()); // todo: call replyToQuery() from here instead of this!!! see dbmessage.h QueryResult * msgdata = (QueryResult *) b.buf(); b.decouple(); QueryResult *qr = msgdata; qr->_resultFlags() = ResultFlag_ErrSet; if( scex ) qr->_resultFlags() |= ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; resp.reset( new Message() ); resp->setData( msgdata, true ); } op.debug().responseLength = resp->header()->dataLen(); dbresponse.response = resp.release(); dbresponse.responseTo = responseTo; return ok; }
bool _handlePossibleShardedMessage( Message &m, DbResponse* dbresponse ) { DEV verify( shardingState.enabled() ); int op = m.operation(); if ( op < 2000 || op >= 3000 || op == dbGetMore // cursors are weird ) return false; DbMessage d(m); const char *ns = d.getns(); string errmsg; // We don't care about the version here, since we're returning it later in the writeback ChunkVersion received, wanted; if ( shardVersionOk( ns , errmsg, received, wanted ) ) { return false; } bool getsAResponse = doesOpGetAResponse( op ); LOG(1) << "connection sharding metadata does not match for collection " << ns << ", will retry (wanted : " << wanted << ", received : " << received << ")" << ( getsAResponse ? "" : " (queuing writeback)" ) << endl; if( getsAResponse ){ verify( dbresponse ); BufBuilder b( 32768 ); b.skip( sizeof( QueryResult ) ); { BSONObjBuilder bob; bob.append( "$err", errmsg ); bob.append( "ns", ns ); wanted.addToBSON( bob, "vWanted" ); received.addToBSON( bob, "vReceived" ); BSONObj obj = bob.obj(); b.appendBuf( obj.objdata() , obj.objsize() ); } QueryResult *qr = (QueryResult*)b.buf(); qr->_resultFlags() = ResultFlag_ErrSet | ResultFlag_ShardConfigStale; qr->len = b.len(); qr->setOperation( opReply ); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; b.decouple(); Message * resp = new Message(); resp->setData( qr , true ); dbresponse->response = resp; dbresponse->responseTo = m.header()->id; return true; } uassert(9517, "cannot queue a writeback operation to the writeback queue", (d.reservedField() & Reserved_FromWriteback) == 0); const OID& clientID = ShardedConnectionInfo::get(false)->getID(); massert( 10422 , "write with bad shard config and no server id!" , clientID.isSet() ); // We need to check this here, since otherwise we'll get errors wrapping the writeback - // not just here, but also when returning as a command result. // We choose 1/2 the overhead of the internal maximum so that we can still handle ops of // 16MB exactly. 
massert( 16437, "data size of operation is too large to queue for writeback", m.dataSize() < BSONObjMaxInternalSize - (8 * 1024)); LOG(1) << "writeback queued for " << m.toString() << endl; BSONObjBuilder b; b.appendBool( "writeBack" , true ); b.append( "ns" , ns ); b.append( "connectionId" , cc().getConnectionId() ); b.append( "instanceIdent" , prettyHostName() ); wanted.addToBSON( b ); received.addToBSON( b, "yourVersion" ); b.appendBinData( "msg" , m.header()->len , bdtCustom , (char*)(m.singleData()) ); LOG(2) << "writing back msg with len: " << m.header()->len << " op: " << m.operation() << endl; // we pass the builder to queueWriteBack so that it can select the writebackId // this is important so that the id is guaranteed to be ascending // that is important since mongos assumes if its seen a greater writeback // that all former have been processed OID writebackID = writeBackManager.queueWriteBack( clientID.str() , b ); lastError.getSafe()->writeback( writebackID ); return true; }
/**
 * Also called by db/ops/query.cpp.  This is the new getMore entry point.
 *
 * Pulls the next batch of results for 'cursorid' out of the cursor's Runner and packs them
 * behind a QueryResult header in a newly built buffer.
 *
 * @param ns                 namespace the request names; uasserts (17011) if it differs from
 *                           the namespace stored on the cursor.
 * @param ntoreturn          maximum number of documents in this batch; 0 disables the count
 *                           limit, leaving only the MaxBytesToReturnToClientAtOnce size cap.
 * @param cursorid           id of the cursor to continue.
 * @param curop              current operation; given the cursor's leftover time limit, and its
 *                           remaining time is stored back on the cursor if the cursor is kept.
 * @param pass               caller-supplied attempt counter; the slave location is updated only
 *                           on pass 0.
 * @param exhaust            out: false unless the cursor is kept and has QueryOption_Exhaust.
 * @param isCursorAuthorized out: set to true once the namespace check has passed.
 * @return the reply header, followed in the same buffer by the result documents.
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;

    // Reserve room for the reply header; result documents are appended right after it.
    int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;
    BufBuilder bb(bufSize);
    bb.skip(sizeof(QueryResult));

    // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
    Client::ReadContext ctx(ns);

    log() << "running getMore in new system, cursorid " << cursorid << endl;

    // TODO: Document.
    // TODO: do this when we can pass in our own parsed query
    //replVerifyReadsOk();

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    ClientCursorPin ccPin(cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;
    int numResults = 0;
    int startingResult = 0;

    if (NULL == cc) {
        // Unknown cursor: reply with cursor id 0 and the CursorNotFound flag.
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Quote: check for spoofing of the ns such that it does not match the one originally
        // there for the cursor
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());

        // TODO:
        // curop.debug().query = BSONForQuery
        // curop.setQuery(curop.debug().query);

        // TODO: What is pass?
        if (0 == pass) {
            cc->updateSlaveLocation(curop);
        }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        Runner* runner = cc->getRunner();
        const int queryOptions = cc->queryOptions();

        // Get results out of the runner.
        // TODO: There may be special handling required for tailable cursors?
        runner->restoreState();

        BSONObj obj;
        Runner::RunnerState state;
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // If we're sharded make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                KeyPattern kp(collMetadata->getKeyPattern());
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) {
                    continue;
                }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // Stop once the requested count (if any) or the byte budget is reached.  The
            // count limit only applies when ntoreturn is non-zero; guarding on numResults
            // instead would end every unlimited (ntoreturn == 0) batch after one document.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        if (Runner::RUNNER_DEAD == state || Runner::RUNNER_EOF == state) {
            log() << "getMore(): runner with id " << cursorid << " EOF/DEAD, state = "
                  << static_cast<int>(state) << endl;
            // TODO: If the cursor is tailable we don't kill it if it's eof.
            ccPin.free();
            // cc is now invalid, as is the runner
            cursorid = 0;
            cc = NULL;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            runner->saveState();

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the reply header that sits at the front of the buffer.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    // NOTE(review): decouple() presumably detaches the buffer from bb so that qr stays valid
    // after bb goes out of scope -- confirm against BufBuilder.
    bb.decouple();
    return qr;
}
/**
 * Continues an existing client cursor for a getMore request and packs the
 * next batch of documents into a newly built opReply message.
 *
 * @param ns                 namespace named by the request; it must equal the
 *                           cursor's own namespace or uassert 14833 fires
 * @param ntoreturn          maximum number of documents for this batch; 0
 *                           applies no count limit (the byte limit still holds)
 * @param cursorid           id of the cursor to continue
 * @param curop              current operation; receives the cursor's query
 *                           for debug output
 * @param pass               attempt counter: 0 triggers updateSlaveLocation(),
 *                           and values below 1000 allow the await-data early
 *                           return (presumably bumped by the caller per retry)
 * @param exhaust            out: reset to false, then set from
 *                           QueryOption_Exhaust if the cursor is still open
 * @param isCursorAuthorized out: set to true once the namespace, oplog-age
 *                           and transaction checks have passed
 * @return the reply message, decoupled from its builder, or 0 when a tailable
 *         await-data cursor produced nothing in this pass.
 */
QueryResult* processGetMore(const char* ns,
                            int ntoreturn,
                            long long cursorid,
                            CurOp& curop,
                            int pass,
                            bool& exhaust,
                            bool* isCursorAuthorized ) {
    exhaust = false;

    ClientCursor::Pin pin( cursorid );
    ClientCursor *clientCursor = pin.c();

    // Leave room for the reply header; documents are appended after it.
    const int initialCapacity = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;
    BufBuilder reply( initialCapacity );
    reply.skip( sizeof( QueryResult ) );

    int resultFlags = ResultFlag_AwaitCapable;
    int startingFrom = 0;
    int numReturned = 0;

    if ( unlikely( !clientCursor ) ) {
        LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Guard against a spoofed ns that differs from the one the cursor was opened on.
        uassert(14833, "auth error", str::equals(ns, clientCursor->ns().c_str()));
        uassert(16784, "oplog cursor reading data that is too old", !clientCursor->lastOpForSlaveTooOld());

        int opts = clientCursor->queryOptions();

        OpSettings settings;
        settings.setBulkFetch(true);
        settings.setQueryCursorMode(DEFAULT_LOCK_CURSOR);
        settings.setCappedAppendPK(opts & QueryOption_AddHiddenPK);
        cc().setOpSettings(settings);

        // checkMultiStatementTxn() uasserts when the cursor belongs to a
        // multi-statement transaction that is not the current client's.
        // When the cursor is not part of such a transaction, the
        // transaction stack stored in the cursor is used for this scope.
        const bool cursorPartOfMultiStatementTxn = clientCursor->checkMultiStatementTxn();
        scoped_ptr<Client::WithTxnStack> txnStack;
        if ( !cursorPartOfMultiStatementTxn ) {
            // Keep things simple: a multi-statement transaction may not
            // read from a cursor it did not create.
            uassert(16813, "Cannot getMore() on a cursor not created by this multi-statement transaction", !cc().hasTxn());
            txnStack.reset( new Client::WithTxnStack( clientCursor->transactions ) );
        }

        *isCursorAuthorized = true;

        if ( pass == 0 ) {
            clientCursor->updateSlaveLocation( curop );
        }

        curop.debug().query = clientCursor->query();
        startingFrom = clientCursor->pos();

        Cursor *cursor = clientCursor->c();

        // Possibly stale, but it reflects the chunk layout at cursor creation time.
        ShardChunkManagerPtr chunkManager = clientCursor->getChunkManager();

        for ( ;; ) {
            if ( !cursor->ok() ) {
                if ( !cursor->tailable() ) {
                    // Cursor is finished: unpin and erase it. Its transaction
                    // stack is committed after the loop.
                    pin.release();
                    bool erased = ClientCursor::erase( cursorid );
                    verify( erased );
                    cursorid = 0;
                    clientCursor = 0;
                    break;
                }

                // A tailable cursor at "EOF" reports !ok() with a null
                // current(), yet advance() may be retried to reactivate it
                // and returns true once new data is available.
                if ( cursor->advance() ) {
                    continue;
                }

                if ( numReturned == 0 && ( opts & QueryOption_AwaitData ) && pass < 1000 ) {
                    return 0;
                }
                break;
            }

            MatchDetails matchDetails;
            if ( clientCursor->fields &&
                 clientCursor->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                // The projection uses a positional array operator.
                matchDetails.requestElemMatchKey();
            }

            // Some callers (clone collection) run without a matcher.
            if ( cursor->currentMatches( &matchDetails ) ) {
                if ( chunkManager && !chunkManager->belongsToMe( clientCursor ) ) {
                    LOG(2) << "cursor skipping document in un-owned chunk: " << cursor->current() << endl;
                }
                else if ( !cursor->getsetdup( cursor->currPK() ) ) {
                    // Record the op for replication write concern; this only
                    // matters when the cursor is replaying the oplog.
                    if ( clientCursor->queryOptions() & QueryOption_OplogReplay ) {
                        clientCursor->storeOpForSlave( cursor->current() );
                    }

                    n_plus_one:
                    ++numReturned;
                    clientCursor->fillQueryResultFromObj( reply, &matchDetails );

                    const bool batchFull = ( ntoreturn && numReturned >= ntoreturn ) ||
                                           reply.len() > MaxBytesToReturnToClientAtOnce;
                    if ( batchFull ) {
                        cursor->advance();
                        clientCursor->incPos( numReturned );
                        break;
                    }
                }
            }

            cursor->advance();
        }

        if ( clientCursor ) {
            clientCursor->resetIdleAge();
            exhaust = clientCursor->queryOptions() & QueryOption_Exhaust;
        }
        else if ( !cursorPartOfMultiStatementTxn ) {
            // The cursor is gone and it was not part of a multi-statement
            // transaction, so its transaction can be committed now.
            cc().commitTopTxn();
            txnStack->release();
        }
    }

    // Fill in the reply header that was reserved at the front of the buffer.
    QueryResult *header = (QueryResult *) reply.buf();
    header->len = reply.len();
    header->setOperation(opReply);
    header->_resultFlags() = resultFlags;
    header->cursorId = cursorid;
    header->startingFrom = startingFrom;
    header->nReturned = numReturned;
    reply.decouple();
    return header;
}
/**
 * Also called by db/ops/query.cpp.  This is the new getMore entry point.
 *
 * Pulls the next batch of documents from the runner attached to an existing
 * client cursor and packs them into a newly built opReply message.
 *
 * @param ns                 namespace named by the request; it must equal the
 *                           cursor's own namespace or uassert 17011 fires
 * @param ntoreturn          maximum number of documents for this batch; 0
 *                           applies no count limit (the byte limit still
 *                           holds), matching processGetMore()
 * @param cursorid           id of the cursor to continue
 * @param curop              current operation; its time limit is seeded from,
 *                           and rolled back into, the cursor
 * @param pass               when 0, the cursor's slave location is updated
 * @param exhaust            out: reset to false, then set from the query's
 *                           QueryOption_Exhaust flag when the cursor exists
 * @param isCursorAuthorized out: set to true once the namespace check passes
 * @return the reply message, decoupled from its builder.  If the cursor id is
 *         unknown the reply carries ResultFlag_CursorNotFound, cursor id 0
 *         and no documents.
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;

    // Leave room for the reply header; documents are appended after it.
    int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;
    BufBuilder bb(bufSize);
    bb.skip(sizeof(QueryResult));

    // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
    Client::ReadContext ctx(ns);

    // TODO: Document.
    replVerifyReadsOk();

    ClientCursorPin ccPin(cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;
    int numResults = 0;
    int startingResult = 0;

    if (NULL == cc) {
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Check for spoofing of the ns such that it does not match the one originally
        // there for the cursor.
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());

        // TODO:
        // curop.debug().query = BSONForQuery
        // curop.setQuery(curop.debug().query);

        // TODO: What is pass?
        if (0 == pass) {
            cc->updateSlaveLocation(curop);
        }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        startingResult = cc->pos();

        Runner* runner = cc->getRunner();
        const ParsedQuery& pq = runner->getQuery().getParsed();

        // Get results out of the runner.
        // TODO: There may be special handling required for tailable cursors?
        runner->restoreState();
        BSONObj obj;
        // TODO: Differentiate EOF from error.
        while (runner->getNext(&obj)) {
            // If we're sharded make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                KeyPattern kp(collMetadata->getKeyPattern());
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) {
                    continue;
                }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // Stop once the requested count or the byte budget is reached.  The count
            // limit applies only when ntoreturn is non-zero; testing numResults instead
            // would end every unlimited (ntoreturn == 0) batch after a single document.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        cc->incPos(numResults);
        runner->saveState();

        // Possibly note slave's position in the oplog.
        if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
            cc->slaveReadTill(slaveReadTill);
        }

        exhaust = pq.hasOption(QueryOption_Exhaust);

        // If the getmore had a time limit, remaining time is "rolled over" back to the
        // cursor (for use by future getmore ops).
        cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
    }

    // Fill in the reply header that was reserved at the front of the buffer.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    bb.decouple();
    return qr;
}
/**
 * Continues an existing client cursor for a getMore request and packs the
 * next batch of documents into a newly built opReply message.
 *
 * @param ns        namespace named by the request; it must equal the cursor's
 *                  own namespace or uassert 14833 fires
 * @param ntoreturn maximum number of documents for this batch; 0 applies no
 *                  count limit (the byte limit still holds)
 * @param cursorid  id of the cursor to continue
 * @param curop     current operation; receives the cursor's query for debug
 *                  output
 * @param pass      attempt counter: 0 triggers updateSlaveLocation(), and
 *                  values below 1000 allow the await-data early return
 *                  (presumably bumped by the caller per retry)
 * @param exhaust   out: reset to false, then set from QueryOption_Exhaust if
 *                  the cursor is still open
 * @return the reply message, decoupled from its builder, or 0 when a tailable
 *         await-data cursor produced nothing in this pass.
 */
QueryResult* processGetMore(const char *ns,
                            int ntoreturn,
                            long long cursorid,
                            CurOp& curop,
                            int pass,
                            bool& exhaust ) {
    exhaust = false;

    ClientCursor::Pointer holder( cursorid );
    ClientCursor *clientCursor = holder.c();

    // Leave room for the reply header; documents are appended after it.
    const int initialCapacity = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;
    BufBuilder reply( initialCapacity );
    reply.skip( sizeof( QueryResult ) );

    int resultFlags = ResultFlag_AwaitCapable;
    int startingFrom = 0;
    int numReturned = 0;

    if ( unlikely( !clientCursor ) ) {
        LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Guard against a spoofed ns that differs from the one the cursor was opened on.
        uassert(14833, "auth error", str::equals(ns, clientCursor->ns().c_str()));

        if ( pass == 0 ) {
            clientCursor->updateSlaveLocation( curop );
        }

        int opts = clientCursor->queryOptions();

        curop.debug().query = clientCursor->query();
        startingFrom = clientCursor->pos();

        Cursor *cursor = clientCursor->c();
        cursor->recoverFromYield();

        DiskLoc lastLoc;

        // Set only when the keys are unmodified, the index is not multikey
        // and a projection is present; checkKey() decides whether the
        // projection can be served from the index key.
        scoped_ptr<Projection::KeyOnly> keyOnlyProjection;
        if ( !clientCursor->modifiedKeys() && !clientCursor->isMultiKey() && clientCursor->fields ) {
            keyOnlyProjection.reset( clientCursor->fields->checkKey( clientCursor->indexKeyPattern() ) );
        }

        // Possibly stale, but it reflects the chunk layout at cursor creation time.
        ShardChunkManagerPtr chunkManager = clientCursor->getChunkManager();

        for ( ;; ) {
            if ( !cursor->ok() ) {
                if ( !cursor->tailable() ) {
                    // Cursor is finished: release our hold and erase it.
                    holder.release();
                    bool erased = ClientCursor::erase( cursorid );
                    verify( erased );
                    cursorid = 0;
                    clientCursor = 0;
                    break;
                }

                // A tailable cursor at "EOF" reports !ok() with a null
                // current(), yet advance() may be retried to reactivate it
                // and returns true once new data is available.
                if ( cursor->advance() ) {
                    continue;
                }

                if ( numReturned == 0 && ( opts & QueryOption_AwaitData ) && pass < 1000 ) {
                    return 0;
                }
                break;
            }

            // Some callers (clone collection) run without a matcher.
            if ( cursor->currentMatches() ) {
                if ( chunkManager && !chunkManager->belongsToMe( clientCursor ) ) {
                    LOG(2) << "cursor skipping document in un-owned chunk: " << cursor->current() << endl;
                }
                else if ( !cursor->getsetdup( cursor->currLoc() ) ) {
                    lastLoc = cursor->currLoc();
                    ++numReturned;

                    if ( keyOnlyProjection ) {
                        fillQueryResultFromObj( reply, 0, keyOnlyProjection->hydrate( cursor->currKey() ) );
                    }
                    else {
                        BSONObj doc = cursor->current();
                        // Show-disk-loc belongs to the main query rather than
                        // an $or clause, so reading it from pq should be fine.
                        DiskLoc *locToShow =
                            ( clientCursor->pq.get() && clientCursor->pq->showDiskLoc() ) ? &lastLoc : 0;
                        fillQueryResultFromObj( reply, clientCursor->fields.get(), doc, locToShow );
                    }

                    const bool batchFull = ( ntoreturn && numReturned >= ntoreturn ) ||
                                           reply.len() > MaxBytesToReturnToClientAtOnce;
                    if ( batchFull ) {
                        cursor->advance();
                        clientCursor->incPos( numReturned );
                        break;
                    }
                }
            }

            cursor->advance();

            // A false return from yieldSometimes() ends the scan: the cursor
            // is erased and the holder is told it is gone.
            if ( !clientCursor->yieldSometimes( ClientCursor::MaybeCovered ) ) {
                ClientCursor::erase( cursorid );
                cursorid = 0;
                clientCursor = 0;
                holder.deleted();
                break;
            }
        }

        if ( clientCursor ) {
            // The cursor stays open: record where it stands before returning.
            if ( cursor->supportYields() ) {
                ClientCursor::YieldData yieldData;
                verify( clientCursor->prepareToYield( yieldData ) );
            }
            else {
                clientCursor->c()->noteLocation();
            }
            clientCursor->mayUpgradeStorage();
            clientCursor->storeOpForSlave( lastLoc );
            exhaust = clientCursor->queryOptions() & QueryOption_Exhaust;
        }
    }

    // Fill in the reply header that was reserved at the front of the buffer.
    QueryResult *header = (QueryResult *) reply.buf();
    header->len = reply.len();
    header->setOperation(opReply);
    header->_resultFlags() = resultFlags;
    header->cursorId = cursorid;
    header->startingFrom = startingFrom;
    header->nReturned = numReturned;
    reply.decouple();
    return header;
}