/**
 * Handles a delete request carried in wire message `m`.
 *
 * Reads the namespace, the flag word and the delete pattern from the message,
 * records them on `op` for diagnostics, then removes the matching documents
 * under a write lock and reports the count through lastError.
 *
 * @param m   incoming message; must contain a namespace and one BSON pattern
 * @param op  current-operation record that receives the debug string and query
 *
 * Raises user assertion 10056 ("not master") when isMasterNs(ns) is false.
 */
void receivedDelete(Message& m, CurOp& op) {
    DbMessage d(m);
    const char *ns = d.getns();
    assert(*ns);
    uassert( 10056 , "not master", isMasterNs( ns ) );
    op.debug().str << ns << ' ';

    int flags = d.pullInt();
    bool justOne = flags & RemoveOption_JustOne;
    bool broadcast = flags & RemoveOption_Broadcast;

    assert( d.moreJSObjs() );
    BSONObj pattern = d.nextJsObj();
    {
        string s = pattern.toString();
        op.debug().str << " query: " << s;
        op.setQuery(pattern);
    }

    writelock lk(ns);

    // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit
    //
    // Logical && (not bitwise &): a broadcast delete must skip the sharding
    // check entirely, so handlePossibleShardedMessage() is only invoked when
    // the broadcast flag is clear. This matches the update handlers.
    if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
        return;

    Client::Context ctx(ns);

    long long n = deleteObjects(ns, pattern, justOne, true);
    lastError.getSafe()->recordDelete( n );
}
/**
 * Handles an update request carried in wire message `m`.
 *
 * The message supplies, in order: namespace, flag word, query document and
 * update document. Sizes are sanity-checked against the message length before
 * the update is applied under the global write lock.
 *
 * @param m   incoming message
 * @param op  current-operation record; receives namespace and query
 */
void receivedUpdate(Message& m, CurOp& op) {
    DbMessage request(m);
    const char *ns = request.getns();
    op.debug().ns = ns;

    const int options = request.pullInt();

    BSONObj query = request.nextJsObj();
    assert( request.moreJSObjs() );
    assert( query.objsize() < m.header()->dataLen() );

    BSONObj toupdate = request.nextJsObj();
    uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize);
    assert( toupdate.objsize() < m.header()->dataLen() );
    assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() );

    const bool upsert = options & UpdateOption_Upsert;
    const bool multi = options & UpdateOption_Multi;
    const bool broadcast = options & UpdateOption_Broadcast;

    op.debug().query = query;
    op.setQuery(query);

    // The write lock synchronizes stepdowns with writes, which is why the
    // master check happens only after it is held.
    writelock lk;
    uassert( 10054 , "not master", isMasterNs( ns ) );

    // If this check ever moves outside the lock, Client::Context::_finishInit
    // needs to be adjusted as well.
    if ( !broadcast && handlePossibleShardedMessage( m , 0 ) ) {
        return;
    }

    Client::Context ctx( ns );

    UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
    // Recorded so a later getlasterror can report the outcome.
    lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted );
}
/**
 * Handles an insert request carried in wire message `m`.
 *
 * A message with one document goes through checkAndInsert(); a message with
 * several documents goes through insertMulti(). The locked section is retried
 * whenever a PageFaultException is thrown.
 *
 * @param m   incoming message
 * @param op  current-operation record; receives namespace and insert count
 */
void receivedInsert(Message& m, CurOp& op) {
    DbMessage request(m);
    const char *ns = request.getns();
    op.debug().ns = ns;

    // Inserts into system.indexes are authorized later, not here.
    if (NamespaceString(ns).coll != "system.indexes") {
        Status status = cc().getAuthorizationManager()->checkAuthForInsert(ns);
        uassert(16544, status.reason(), status.isOK());
    }

    if ( !request.moreJSObjs() ) {
        // strange.  should we complain?
        return;
    }
    BSONObj first = request.nextJsObj();

    // `multi` stays empty for a single-document insert; otherwise it holds
    // every document, the first one included.
    vector<BSONObj> multi;
    if ( request.moreJSObjs() ) {
        multi.push_back(first);
        do {
            multi.push_back( request.nextJsObj() );
        } while ( request.moreJSObjs() );
    }

    PageFaultRetryableSection s;
    for (;;) {
        try {
            Lock::DBWrite lk(ns);

            // CONCURRENCY TODO: is being read locked in big log sufficient here?
            // The write lock synchronizes stepdowns with writes.
            uassert( 10058 , "not master", isMasterNs(ns) );

            if ( handlePossibleShardedMessage( m , 0 ) ) {
                return;
            }

            Client::Context ctx(ns);

            if ( multi.empty() ) {
                checkAndInsert(ns, first);
                globalOpCounters.incInsertInWriteLock(1);
                op.debug().ninserted = 1;
            }
            else {
                const bool keepGoing = request.reservedField() & InsertOption_ContinueOnError;
                insertMulti(keepGoing, ns, multi, op);
            }
            return;
        }
        catch ( PageFaultException& fault ) {
            // Touch the faulting page, then retry the whole locked section.
            fault.touch();
        }
    }
}
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); curop.debug().ns = ns; curop.debug().ntoreturn = ntoreturn; curop.debug().cursorid = cursorid; time_t start = 0; int pass = 0; bool exhaust = false; QueryResult* msgdata; while( 1 ) { try { readlock lk; Client::Context ctx(ns); msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust); } catch ( AssertionException& e ) { exhaust = false; curop.debug().exceptionInfo = e.getInfo(); msgdata = emptyMoreResult(cursorid); ok = false; } if (msgdata == 0) { exhaust = false; massert(13073, "shutting down", !inShutdown() ); if( pass == 0 ) { start = time(0); } else { if( time(0) - start >= 4 ) { // after about 4 seconds, return. this is a sanity check. pass stops at 1000 normally // for DEV this helps and also if sleep is highly inaccurate on a platform. we want to // return occasionally so slave can checkpoint. pass = 10000; } } pass++; DEV sleepmillis(20); else sleepmillis(2); continue; }
void profile( const Client& c , CurOp& currentOp ) { assertInWriteLock(); Database *db = c.database(); DEV assert( db ); const char *ns = db->profileName.c_str(); // build object profileBufBuilder.reset(); BSONObjBuilder b(profileBufBuilder); b.appendDate("ts", jsTime()); currentOp.debug().append( b ); b.append("client", c.clientAddress() ); if ( c.getAuthenticationInfo() ) b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) ); BSONObj p = b.done(); // write: not replicated NamespaceDetails *d = db->namespaceIndex.details(ns); if( d ) { int len = p.objsize(); Record *r = theDataFileMgr.fast_oplog_insert(d, ns, len); memcpy(getDur().writingPtr(r->data, len), p.objdata(), len); } else { static time_t last; if( time(0) > last+10 ) { log() << "profile: warning ns " << ns << " does not exist" << endl; last = time(0); } } }
void profile( const Client& c , CurOp& currentOp, int millis) { assertInWriteLock(); string info = currentOp.debug().str.str(); profileBufBuilder.reset(); BSONObjBuilder b(profileBufBuilder); b.appendDate("ts", jsTime()); b.append("info", info); b.append("millis", (double) millis); if ( currentOp.getNS() ) b.append( "ns" , currentOp.getNS() ); b.append("client", c.clientAddress() ); BSONObj p = b.done(); // write: not replicated Database *db = c.database(); const char *ns = db->profileName.c_str(); NamespaceDetails *d = db->namespaceIndex.details(ns); if( d ) { int len = p.objsize(); Record *r = theDataFileMgr.fast_oplog_insert(d, ns, len); memcpy(getDur().writingPtr(r->data, len), p.objdata(), len); } else { static time_t last; if( time(0) > last+10 ) { log() << "profile: warning ns " << ns << " does not exist" << endl; last = time(0); } } }
/**
 * Handles an update request carried in wire message `m`.
 *
 * Parses the flag word, query and update document, checks authorization, then
 * applies the update under a database write lock. The locked section is
 * retried whenever a PageFaultException is thrown.
 *
 * @param m   incoming message
 * @param op  current-operation record; receives namespace and query
 */
void receivedUpdate(Message& m, CurOp& op) {
    DbMessage request(m);
    const char *ns = request.getns();
    op.debug().ns = ns;

    const int options = request.pullInt();

    BSONObj query = request.nextJsObj();
    verify( request.moreJSObjs() );
    verify( query.objsize() < m.header()->dataLen() );

    BSONObj toupdate = request.nextJsObj();
    uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize);
    verify( toupdate.objsize() < m.header()->dataLen() );
    verify( query.objsize() + toupdate.objsize() < m.header()->dataLen() );

    const bool upsert = options & UpdateOption_Upsert;
    const bool multi = options & UpdateOption_Multi;
    const bool broadcast = options & UpdateOption_Broadcast;

    Status status = cc().getAuthorizationManager()->checkAuthForUpdate(ns, upsert);
    uassert(16538, status.reason(), status.isOK());

    op.debug().query = query;
    op.setQuery(query);

    PageFaultRetryableSection s;
    for (;;) {
        try {
            Lock::DBWrite lk(ns);

            // void ReplSetImpl::relinquish() uses big write lock so
            // this is thus synchronized given our lock above.
            uassert( 10054 , "not master", isMasterNs( ns ) );

            // If this check ever moves outside the lock, Client::Context::_finishInit
            // needs to be adjusted as well.
            if ( !broadcast && handlePossibleShardedMessage( m , 0 ) ) {
                return;
            }

            Client::Context ctx( ns );

            UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
            // Recorded so a later getlasterror can report the outcome.
            lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted );
            return;
        }
        catch ( PageFaultException& fault ) {
            // Touch the faulting page, then retry the whole locked section.
            fault.touch();
        }
    }
}
/**
 * Handles an update request carried in wire message `m`.
 *
 * After parsing and authorization, the update is first attempted under a
 * database read lock; if that attempt throws RetryWithWriteLock, it is run
 * again under a database write lock.
 *
 * @param m   incoming message
 * @param op  current-operation record; receives namespace, query and update object
 */
void receivedUpdate(Message& m, CurOp& op) {
    DbMessage request(m);
    const char *ns = request.getns();
    op.debug().ns = ns;

    const int options = request.pullInt();

    BSONObj query = request.nextJsObj();
    verify(request.moreJSObjs());
    verify(query.objsize() < m.header()->dataLen());

    const BSONObj updateobj = request.nextJsObj();
    uassert(10055, "update object too large", updateobj.objsize() <= BSONObjMaxUserSize);
    verify(updateobj.objsize() < m.header()->dataLen());
    verify(query.objsize() + updateobj.objsize() < m.header()->dataLen());

    op.debug().query = query;
    op.debug().updateobj = updateobj;
    op.setQuery(query);

    const bool upsert = options & UpdateOption_Upsert;
    const bool multi = options & UpdateOption_Multi;
    const bool broadcast = options & UpdateOption_Broadcast;

    Status status = cc().getAuthorizationManager()->checkAuthForUpdate(ns, upsert);
    uassert(16538, status.reason(), status.isOK());

    // Per-operation settings: write-lock cursors, and stop after one match
    // unless this is a multi-update.
    OpSettings settings;
    settings.setQueryCursorMode(WRITE_LOCK_CURSOR);
    settings.setJustOne(!multi);
    cc().setOpSettings(settings);

    Client::ShardedOperationScope sc;
    if (!broadcast && sc.handlePossibleShardedMessage(m, 0)) {
        return;
    }

    LOCK_REASON(lockReason, "update");
    try {
        Lock::DBRead lk(ns, lockReason);
        lockedReceivedUpdate(ns, m, op, updateobj, query, upsert, multi);
    }
    catch (RetryWithWriteLock &) {
        // The read-locked attempt asked for a stronger lock; run it again.
        Lock::DBWrite lk(ns, lockReason);
        lockedReceivedUpdate(ns, m, op, updateobj, query, upsert, multi);
    }
}
/**
 * Handles a delete request carried in wire message `m`.
 *
 * After authorization and parsing, the delete runs inside a DB_SERIALIZABLE
 * transaction under a database read lock. The number of removed documents is
 * reported through lastError and stored in the operation's debug info.
 *
 * @param m   incoming message
 * @param op  current-operation record; receives namespace, query and ndeleted
 */
void receivedDelete(Message& m, CurOp& op) {
    DbMessage request(m);
    const char *ns = request.getns();

    Status status = cc().getAuthorizationManager()->checkAuthForDelete(ns);
    uassert(16542, status.reason(), status.isOK());

    op.debug().ns = ns;
    const int options = request.pullInt();
    verify(request.moreJSObjs());
    BSONObj pattern = request.nextJsObj();

    op.debug().query = pattern;
    op.setQuery(pattern);

    const bool justOne = options & RemoveOption_JustOne;
    const bool broadcast = options & RemoveOption_Broadcast;

    OpSettings settings;
    settings.setQueryCursorMode(WRITE_LOCK_CURSOR);
    settings.setJustOne(justOne);
    cc().setOpSettings(settings);

    Client::ShardedOperationScope sc;
    if (!broadcast && sc.handlePossibleShardedMessage(m, 0)) {
        return;
    }

    LOCK_REASON(lockReason, "delete");
    Lock::DBRead lk(ns, lockReason);

    // NOTE(review): the historical comment here said a write lock is used to
    // synchronize stepdowns with writes, yet a DBRead lock is what is taken
    // above; confirm the intended locking.
    uassert(10056, "not master", isMasterNs(ns));

    Client::Context ctx(ns);

    // An alternate transaction stack is installed only when opNeedsAltTxn(ns)
    // says so; otherwise the pointer stays NULL.
    scoped_ptr<Client::AlternateTransactionStack> altStack(
        opNeedsAltTxn(ns) ? new Client::AlternateTransactionStack : NULL);
    Client::Transaction transaction(DB_SERIALIZABLE);
    const long long n = deleteObjects(ns, pattern, justOne, true);
    transaction.commit();

    lastError.getSafe()->recordDelete( n );
    op.debug().ndeleted = n;
}
/**
 * Handles a delete request carried in wire message `m`.
 *
 * After authorization and parsing, the delete runs under a database write
 * lock. The locked section is retried whenever a PageFaultException is thrown.
 *
 * @param m   incoming message
 * @param op  current-operation record; receives namespace, query and ndeleted
 */
void receivedDelete(Message& m, CurOp& op) {
    DbMessage request(m);
    const char *ns = request.getns();

    Status status = cc().getAuthorizationManager()->checkAuthForDelete(ns);
    uassert(16542, status.reason(), status.isOK());

    op.debug().ns = ns;
    const int options = request.pullInt();
    const bool justOne = options & RemoveOption_JustOne;
    const bool broadcast = options & RemoveOption_Broadcast;

    verify( request.moreJSObjs() );
    BSONObj pattern = request.nextJsObj();

    op.debug().query = pattern;
    op.setQuery(pattern);

    PageFaultRetryableSection s;
    for (;;) {
        try {
            Lock::DBWrite lk(ns);

            // The write lock synchronizes stepdowns with writes.
            uassert( 10056 , "not master", isMasterNs( ns ) );

            // If this check ever moves outside the lock, Client::Context::_finishInit
            // needs to be adjusted as well.
            if ( !broadcast && handlePossibleShardedMessage( m , 0 ) ) {
                return;
            }

            Client::Context ctx(ns);

            const long long n = deleteObjects(ns, pattern, justOne, true);
            lastError.getSafe()->recordDelete( n );
            op.debug().ndeleted = n;
            return;
        }
        catch ( PageFaultException& fault ) {
            LOG(2) << "recordDelete got a PageFaultException" << endl;
            // Touch the faulting page, then retry the whole locked section.
            fault.touch();
        }
    }
}
void profile( const Client& c , CurOp& currentOp ) { verify( Lock::somethingWriteLocked() ); Database *db = c.database(); DEV verify( db ); const char *ns = db->profileName.c_str(); // build object profileBufBuilder.reset(); BSONObjBuilder b(profileBufBuilder); const bool isQueryObjTooBig = !currentOp.debug().append(currentOp, b, MAX_PROFILE_DOC_SIZE_BYTES); b.appendDate("ts", jsTime()); b.append("client", c.clientAddress()); if (c.getAuthenticationInfo()) { b.append("user", c.getAuthenticationInfo()->getUser(nsToDatabase(ns))); } BSONObj p = b.done(); if (static_cast<size_t>(p.objsize()) > MAX_PROFILE_DOC_SIZE_BYTES || isQueryObjTooBig) { string small = p.toString(/*isArray*/false, /*full*/false); warning() << "can't add full line to system.profile: " << small << endl; // rebuild with limited info BSONObjBuilder b(profileBufBuilder); b.appendDate("ts", jsTime()); b.append("client", c.clientAddress() ); if ( c.getAuthenticationInfo() ) b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) ); b.append("err", "profile line too large (max is 100KB)"); // should be much smaller but if not don't break anything if (small.size() < MAX_PROFILE_DOC_SIZE_BYTES){ b.append("abbreviated", small); } p = b.done(); } // write: not replicated // get or create the profiling collection NamespaceDetails *details = getOrCreateProfileCollection(db); if (details) { int len = p.objsize(); Record *r = theDataFileMgr.fast_oplog_insert(details, ns, len); memcpy(getDur().writingPtr(r->data(), len), p.objdata(), len); } }
/** * @return if collection existed or was created */ static bool _profile(OperationContext* txn, const Client& c, Database* db, CurOp& currentOp, BufBuilder& profileBufBuilder) { dassert( db ); // build object BSONObjBuilder b(profileBufBuilder); const bool isQueryObjTooBig = !currentOp.debug().append(currentOp, b, MAX_PROFILE_DOC_SIZE_BYTES); b.appendDate("ts", jsTime()); b.append("client", c.clientAddress()); AuthorizationSession * authSession = c.getAuthorizationSession(); _appendUserInfo(currentOp, b, authSession); BSONObj p = b.done(); if (static_cast<size_t>(p.objsize()) > MAX_PROFILE_DOC_SIZE_BYTES || isQueryObjTooBig) { string small = p.toString(/*isArray*/false, /*full*/false); warning() << "can't add full line to system.profile: " << small << endl; // rebuild with limited info BSONObjBuilder b(profileBufBuilder); b.appendDate("ts", jsTime()); b.append("client", c.clientAddress() ); _appendUserInfo(currentOp, b, authSession); b.append("err", "profile line too large (max is 100KB)"); // should be much smaller but if not don't break anything if (small.size() < MAX_PROFILE_DOC_SIZE_BYTES){ b.append("abbreviated", small); } p = b.done(); } WriteUnitOfWork wunit(txn); // write: not replicated // get or create the profiling collection Collection* profileCollection = getOrCreateProfileCollection(txn, db); if ( !profileCollection ) { return false; } profileCollection->insertDocument( txn, p, false ); wunit.commit(); return true; }
/**
 * Handles an insert request carried in wire message `m`.
 *
 * Inserts into system.indexes that request a background, non-unique index are
 * handed to _buildHotIndex(). Everything else is first attempted under a
 * database read lock and, if that attempt throws RetryWithWriteLock, run again
 * under a database write lock.
 *
 * @param m   incoming message
 * @param op  current-operation record; receives the namespace
 */
void receivedInsert(Message& m, CurOp& op) {
    DbMessage request(m);
    const char *ns = request.getns();
    op.debug().ns = ns;

    StringData coll = nsToCollectionSubstring(ns);
    const bool isIndexInsert = (coll == "system.indexes");

    // Auth checking for index writes happens later.
    if (!isIndexInsert) {
        Status status = cc().getAuthorizationManager()->checkAuthForInsert(ns);
        uassert(16544, status.reason(), status.isOK());
    }

    if (!request.moreJSObjs()) {
        // strange.  should we complain?
        return;
    }

    vector<BSONObj> objs;
    do {
        objs.push_back(request.nextJsObj());
    } while (request.moreJSObjs());

    const bool keepGoing = request.reservedField() & InsertOption_ContinueOnError;

    OpSettings settings;
    settings.setQueryCursorMode(WRITE_LOCK_CURSOR);
    cc().setOpSettings(settings);

    // Can only build non-unique indexes in the background, because the
    // hot indexer does not know how to perform unique checks.
    if (isIndexInsert &&
        objs[0]["background"].trueValue() &&
        !objs[0]["unique"].trueValue()) {
        _buildHotIndex(ns, m, objs);
        return;
    }

    // The sharded-operation scope is only set up for non-index inserts.
    scoped_ptr<Client::ShardedOperationScope> scp;
    if (!isIndexInsert) {
        scp.reset(new Client::ShardedOperationScope);
        if (scp->handlePossibleShardedMessage(m, 0)) {
            return;
        }
    }

    LOCK_REASON(lockReason, "insert");
    try {
        Lock::DBRead lk(ns, lockReason);
        lockedReceivedInsert(ns, m, objs, op, keepGoing);
    }
    catch (RetryWithWriteLock &) {
        // The read-locked attempt asked for a stronger lock; run it again.
        Lock::DBWrite lk(ns, lockReason);
        lockedReceivedInsert(ns, m, objs, op, keepGoing);
    }
}
/**
 * Handles an update request carried in wire message `m`.
 *
 * The namespace and the query's string form are written to the operation's
 * debug stream, and the update is applied under the global write lock.
 *
 * @param m   incoming message
 * @param op  current-operation record; receives the debug text and query
 */
void receivedUpdate(Message& m, CurOp& op) {
    DbMessage request(m);
    const char *ns = request.getns();
    assert(*ns);
    op.debug().str << ns << ' ';

    const int options = request.pullInt();

    BSONObj query = request.nextJsObj();
    assert( request.moreJSObjs() );
    assert( query.objsize() < m.header()->dataLen() );

    BSONObj toupdate = request.nextJsObj();
    uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize);
    assert( toupdate.objsize() < m.header()->dataLen() );
    assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() );

    const bool upsert = options & UpdateOption_Upsert;
    const bool multi = options & UpdateOption_Multi;
    const bool broadcast = options & UpdateOption_Broadcast;

    /* todo: we shouldn't stringify the query when the text isn't needed, it
       will slow us down. instead, store the query BSON in the debug object and
       let it toString() lazily */
    op.debug().str << " query: " << query.toString();
    op.setQuery(query);

    // The write lock synchronizes stepdowns with writes, which is why the
    // master check happens only after it is held.
    writelock lk;
    uassert( 10054 , "not master", isMasterNs( ns ) );

    // If this check ever moves outside the lock, Client::Context::_finishInit
    // needs to be adjusted as well.
    if ( !broadcast && handlePossibleShardedMessage( m , 0 ) ) {
        return;
    }

    Client::Context ctx( ns );

    UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
    // Recorded so a later getlasterror can report the outcome.
    lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted );
}
static void lockedReceivedInsert(const char *ns, Message &m, const vector<BSONObj> &objs, CurOp &op, const bool keepGoing) { // writelock is used to synchronize stepdowns w/ writes uassert(10058, "not master", isMasterNs(ns)); Client::Context ctx(ns); scoped_ptr<Client::AlternateTransactionStack> altStack(opNeedsAltTxn(ns) ? new Client::AlternateTransactionStack : NULL); Client::Transaction transaction(DB_SERIALIZABLE); insertObjects(ns, objs, keepGoing, 0, true); transaction.commit(); size_t n = objs.size(); globalOpCounters.gotInsert(n); op.debug().ninserted = n; }
bool _tryQueryByPKHack(const char *ns, const BSONObj &query, const ParsedQuery &pq, CurOp &curop, Message &result) { BSONObj resObject; bool found = false; Collection *cl = getCollection(ns); if (cl == NULL) { return false; // ns doesn't exist, fall through to optimizer for legacy reasons } const BSONObj &pk = cl->getSimplePKFromQuery(query); if (pk.isEmpty()) { return false; // unable to query by PK - resort to using the optimizer } found = queryByPKHack(cl, pk, query, resObject); if ( shardingState.needShardChunkManager( ns ) ) { ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns ); if ( m && ! m->belongsToMe( resObject ) ) { // I have something for this _id // but it doesn't belong to me // so return nothing resObject = BSONObj(); found = false; } } BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32); bb.skip(sizeof(QueryResult)); if ( found ) { fillQueryResultFromObj( bb , pq.getFields() , resObject ); } auto_ptr< QueryResult > qr( (QueryResult *) bb.buf() ); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = found ? 1 : 0; result.setData( qr.release(), true ); return true; }
/**
 * Handles an insert request carried in wire message `m`.
 *
 * A message with one document goes through checkAndInsert(); a message with
 * several documents goes through insertMulti(). The locked section is retried
 * whenever a PageFaultException is thrown.
 *
 * @param m   incoming message
 * @param op  current-operation record; receives the namespace
 */
void receivedInsert(Message& m, CurOp& op) {
    DbMessage request(m);
    const char *ns = request.getns();
    op.debug().ns = ns;

    if ( !request.moreJSObjs() ) {
        // strange.  should we complain?
        return;
    }
    BSONObj first = request.nextJsObj();

    // `multi` stays empty for a single-document insert; otherwise it holds
    // every document, the first one included.
    vector<BSONObj> multi;
    if ( request.moreJSObjs() ) {
        multi.push_back(first);
        do {
            multi.push_back( request.nextJsObj() );
        } while ( request.moreJSObjs() );
    }

    PageFaultRetryableSection s;
    for (;;) {
        try {
            Lock::DBWrite lk(ns);

            // CONCURRENCY TODO: is being read locked in big log sufficient here?
            // The write lock synchronizes stepdowns with writes.
            uassert( 10058 , "not master", isMasterNs(ns) );

            if ( handlePossibleShardedMessage( m , 0 ) ) {
                return;
            }

            Client::Context ctx(ns);

            if ( multi.empty() ) {
                checkAndInsert(ns, first);
                globalOpCounters.incInsertInWriteLock(1);
            }
            else {
                const bool keepGoing = request.reservedField() & InsertOption_ContinueOnError;
                insertMulti(keepGoing, ns, multi);
            }
            return;
        }
        catch ( PageFaultException& fault ) {
            // Touch the faulting page, then retry the whole locked section.
            fault.touch();
        }
    }
}
// XXX clean up static bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) { try { return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions); } catch( SendStaleConfigException& ){ throw; } catch ( AssertionException& e ) { verify( e.getCode() != SendStaleConfigCode && e.getCode() != RecvStaleConfigCode ); Command::appendCommandStatus(anObjBuilder, e.toStatus()); curop.debug().exceptionInfo = e.getInfo(); } BSONObj x = anObjBuilder.done(); b.appendBuf((void*) x.objdata(), x.objsize()); return true; }
static void _profile(const Client& c, CurOp& currentOp, BufBuilder& profileBufBuilder) { Database *db = c.database(); DEV verify( db ); const char *ns = db->profileName.c_str(); // build object BSONObjBuilder b(profileBufBuilder); b.appendDate("ts", jsTime()); currentOp.debug().append( currentOp , b ); b.append("client", c.clientAddress() ); if ( c.getAuthenticationInfo() ) b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) ); BSONObj p = b.done(); if (p.objsize() > 100*1024){ string small = p.toString(/*isArray*/false, /*full*/false); warning() << "can't add full line to system.profile: " << small; // rebuild with limited info BSONObjBuilder b(profileBufBuilder); b.appendDate("ts", jsTime()); b.append("client", c.clientAddress() ); if ( c.getAuthenticationInfo() ) b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) ); b.append("err", "profile line too large (max is 100KB)"); if (small.size() < 100*1024){ // should be much smaller but if not don't break anything b.append("abbreviated", small); } p = b.done(); } // write: not replicated // get or create the profiling collection NamespaceDetails *details = getOrCreateProfileCollection(db); if (details) { int len = p.objsize(); Record *r = theDataFileMgr.fast_oplog_insert(details, ns, len); memcpy(getDur().writingPtr(r->data(), len), p.objdata(), len); } }
/**
 * Inserts each document of `objs` into `ns`, one at a time.
 *
 * A UserException thrown for a document is rethrown when `keepGoing` is false
 * or the document is the last one; otherwise it is swallowed and the loop
 * moves on. The global insert counter is bumped before rethrowing and again
 * on normal completion.
 *
 * NOTE(review): on normal completion the counter and ninserted receive the
 * number of documents visited, which includes any that were skipped after a
 * swallowed exception; confirm this is intended.
 *
 * @param keepGoing  continue past per-document user errors
 * @param ns         target namespace
 * @param objs       documents to insert
 * @param op         current-operation record; receives ninserted
 */
NOINLINE_DECL void insertMulti(bool keepGoing, const char *ns, vector<BSONObj>& objs, CurOp& op) {
    size_t pos = 0;
    while (pos < objs.size()) {
        try {
            checkAndInsert(ns, objs[pos]);
            getDur().commitIfNeeded();
        }
        catch (const UserException&) {
            const bool isLast = (pos + 1 == objs.size());
            if (isLast || !keepGoing) {
                globalOpCounters.incInsertInWriteLock(pos);
                throw;
            }
            // otherwise ignore and keep going
        }
        ++pos;
    }

    globalOpCounters.incInsertInWriteLock(pos);
    op.debug().ninserted = pos;
}
bool runCommands(const char *ns, BSONObj& jsobj, CurOp& curop, BufBuilder &b, BSONObjBuilder& anObjBuilder, bool fromRepl, int queryOptions) { try { return _runCommands(ns, jsobj, b, anObjBuilder, fromRepl, queryOptions); } catch( SendStaleConfigException& ){ throw; } catch ( AssertionException& e ) { verify( e.getCode() != SendStaleConfigCode && e.getCode() != RecvStaleConfigCode ); e.getInfo().append( anObjBuilder , "assertion" , "assertionCode" ); curop.debug().exceptionInfo = e.getInfo(); } anObjBuilder.append("errmsg", "db assertion failure"); anObjBuilder.append("ok", 0.0); BSONObj x = anObjBuilder.done(); b.appendBuf((void*) x.objdata(), x.objsize()); return true; }
std::string newRunQuery(OperationContext* txn, Message& m, QueryMessage& q, CurOp& curop, Message &result, bool fromDBDirectClient) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder cmdResBuf; if (!runCommands(txn, ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } const NamespaceString nss(q.ns); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize( q, &cq, WhereCallbackReal(txn, StringData(nss.db()))); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } QLOG() << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. PlanExecutor* rawExec = NULL; // We use this a lot below. 
const LiteParsedQuery& pq = cq->getParsed(); AutoGetCollectionForRead ctx(txn, nss); const int dbProfilingLevel = (ctx.getDb() != NULL) ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; Collection* collection = ctx.getCollection(); // We'll now try to get the query executor that will execute this query for us. There // are a few cases in which we know upfront which executor we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we use an EOFStage. // // (b) if the query is a replication's initial sync one, we use a specifically designed // stage that skips extents faster (see details in exec/oplogstart.h). // // Otherwise we go through the selection of which executor is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (NULL != collection && pq.getOptions().oplogReplay) { // Takes ownership of 'cq'. status = getOplogStartHack(txn, collection, cq, &rawExec); } else { size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } // Takes ownership of 'cq'. status = getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, options); } if (!status.isOK()) { // NOTE: Do not access cq as getExecutor has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawExec); auto_ptr<PlanExecutor> exec(rawExec); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. 
BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; // Set query result fields. QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.getOptions().slaveOk || pq.hasReadPref(); status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, NamespaceString(cq->ns()), slaveOK); uassertStatusOK(status); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? 
int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the PlanExecutor in a ClientCursor for getMore calls later? bool saveClientCursor = false; BSONObj obj; PlanExecutor::ExecState state; // uint64_t numMisplacedDocs = 0; // Get summary info about which plan the executor is using. curop.debug().planSummary = Explain::getPlanSummary(exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.getOptions().oplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " executor EOF=" << exec->isEOF() << endl; saveClientCursor = !exec->isEOF(); } break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. 
if (PlanExecutor::EXEC_ERROR == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // Why save a dead executor? if (PlanExecutor::DEAD == state) { saveClientCursor = false; } else if (pq.getOptions().tailable) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. if (collection && collection->numRecords(txn) != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery; const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1); PlanSummaryStats summaryStats; Explain::getSummaryStats(exec.get(), &summaryStats); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; curop.debug().scanAndOrder = summaryStats.hasSortStage; curop.debug().nscanned = summaryStats.totalKeysExamined; curop.debug().nscannedObjects = summaryStats.totalDocsExamined; curop.debug().idhack = summaryStats.isIdhack; // Set debug information for consumption by the profiler. if (dbProfilingLevel > 0 || curop.elapsedMillis() > serverGlobalParams.slowMS || logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) { // Get BSON stats. 
scoped_ptr<PlanStageStats> execStats(exec->getStats()); BSONObjBuilder statsBob; Explain::statsToBSON(*execStats, &statsBob); curop.debug().execStats.set(statsBob.obj()); // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj. if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) { BSONObjBuilder bob; bob.append("summary", curop.debug().planSummary.toString()); curop.debug().execStats.set(bob.done()); } } long long ccId = 0; if (saveClientCursor) { // We won't use the executor until it's getMore'd. exec->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection, exec.get(), cq->getParsed().getOptions().toInt(), cq->getParsed().getFilter()); ccId = cc->cursorid(); if (fromDBDirectClient) { cc->setUnownedRecoveryUnit(txn->recoveryUnit()); } else if (state == PlanExecutor::IS_EOF && pq.getOptions().tailable) { // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their // next getMore. } else { // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); txn->setRecoveryUnit(storageEngine->newRecoveryUnit(txn)); } QLOG() << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of executor. Release to make sure it's not deleted. exec.release(); // TODO document if (pq.getOptions().oplogReplay && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.getOptions().exhaust) { curop.debug().exhaust = true; } // Set attributes for getMore. 
cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "Not caching executor but returning " << numResults << " results.\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
/** * This is called by db/ops/query.cpp. This is the entry point for answering a query. */ std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) { QLOG() << "Running query on new system: " << cq->toString(); // This is a read lock. Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // Need to call cq->toString() now, since upon error getRunner doesn't guarantee // cq is in a consistent state. string cqStr = cq->toString(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that's is so (a runner) which doesn't return results, the EOFRunner. // // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(cq, &rawRunner); } else { // Takes ownership of cq. 
size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { uasserted(17007, "Couldn't get runner for query because: " + status.reason() + " query is " + cqStr); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. 
ClientCursor::registerRunner(runner.get()); runner->setYieldPolicy(Runner::YIELD_AUTO); auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety( new DeregisterEvenIfUnderlyingCodeThrows(runner.get())); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. 
safety.reset(); // Caller expects exceptions thrown in certain cases: // * in-memory sort using too much RAM. if (Runner::RUNNER_ERROR == state) { uasserted(17144, "Runner error, memory limit for sort probably exceeded"); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Append explain information to query results by asking the runner to produce them. if (isExplain) { TypeExplain* bareExplain; Status res = runner->getExplainPlan(&bareExplain); if (!res.isOK()) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } else { boost::scoped_ptr<TypeExplain> explain(bareExplain); // Fill in the missing run-time fields in explain, starting with propeties of // the process running the query. 
std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(curop.elapsedMillis()); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "not caching runner but returning " << numResults << " results\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? 
-1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
/** * Run a query with a cursor provided by the query optimizer, or FindingStartCursor. * @yields the db lock. */ string queryWithQueryOptimizer( int queryOptions, const string& ns, const BSONObj &jsobj, CurOp& curop, const BSONObj &query, const BSONObj &order, const shared_ptr<ParsedQuery> &pq_shared, const BSONObj &oldPlan, const ChunkVersion &shardingVersionAtStart, scoped_ptr<PageFaultRetryableSection>& parentPageFaultSection, scoped_ptr<NoPageFaultsAllowed>& noPageFault, Message &result ) { const ParsedQuery &pq( *pq_shared ); shared_ptr<Cursor> cursor; QueryPlanSummary queryPlan; if ( pq.hasOption( QueryOption_OplogReplay ) ) { cursor = FindingStartCursor::getCursor( ns.c_str(), query, order ); } else { cursor = getOptimizedCursor( ns.c_str(), query, order, QueryPlanSelectionPolicy::any(), pq_shared, false, &queryPlan ); } verify( cursor ); scoped_ptr<QueryResponseBuilder> queryResponseBuilder ( QueryResponseBuilder::make( pq, cursor, queryPlan, oldPlan ) ); bool saveClientCursor = false; OpTime slaveReadTill; ClientCursorHolder ccPointer( new ClientCursor( QueryOption_NoCursorTimeout, cursor, ns ) ); for( ; cursor->ok(); cursor->advance() ) { bool yielded = false; if ( !ccPointer->yieldSometimes( ClientCursor::MaybeCovered, &yielded ) || !cursor->ok() ) { cursor.reset(); queryResponseBuilder->noteYield(); // !!! TODO The queryResponseBuilder still holds cursor. Currently it will not do // anything unsafe with the cursor in handoff(), but this is very fragile. // // We don't fail the query since we're fine with returning partial data if the // collection was dropped. // NOTE see SERVER-2454. // TODO This is wrong. The cursor could be gone if the closeAllDatabases command // just ran. break; } if ( yielded ) { queryResponseBuilder->noteYield(); } if ( pq.getMaxScan() && cursor->nscanned() > pq.getMaxScan() ) { break; } if ( !queryResponseBuilder->addMatch() ) { continue; } // Note slave's position in the oplog. 
if ( pq.hasOption( QueryOption_OplogReplay ) ) { BSONObj current = cursor->current(); BSONElement e = current["ts"]; if ( e.type() == Date || e.type() == Timestamp ) { slaveReadTill = e._opTime(); } } if ( !cursor->supportGetMore() || pq.isExplain() ) { if ( queryResponseBuilder->enoughTotalResults() ) { break; } } else if ( queryResponseBuilder->enoughForFirstBatch() ) { // if only 1 requested, no cursor saved for efficiency...we assume it is findOne() if ( pq.wantMore() && pq.getNumToReturn() != 1 ) { queryResponseBuilder->finishedFirstBatch(); if ( cursor->advance() ) { saveClientCursor = true; } } break; } } if ( cursor ) { if ( pq.hasOption( QueryOption_CursorTailable ) && pq.getNumToReturn() != 1 ) { cursor->setTailable(); } // If the tailing request succeeded. if ( cursor->tailable() ) { saveClientCursor = true; } } if ( ! shardingState.getVersion( ns ).isWriteCompatibleWith( shardingVersionAtStart ) ) { // if the version changed during the query // we might be missing some data // and its safe to send this as mongos can resend // at this point throw SendStaleConfigException(ns, "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(ns)); } parentPageFaultSection.reset(0); noPageFault.reset( new NoPageFaultsAllowed() ); int nReturned = queryResponseBuilder->handoff( result ); ccPointer.reset(); long long cursorid = 0; if ( saveClientCursor ) { // Create a new ClientCursor, with a default timeout. ccPointer.reset( new ClientCursor( queryOptions, cursor, ns, jsobj.getOwned() ) ); cursorid = ccPointer->cursorid(); DEV { MONGO_TLOG(2) << "query has more, cursorid: " << cursorid << endl; } if ( cursor->supportYields() ) { ClientCursor::YieldData data; ccPointer->prepareToYield( data ); } else { ccPointer->c()->noteLocation(); } // Save slave's position in the oplog. 
if ( pq.hasOption( QueryOption_OplogReplay ) && !slaveReadTill.isNull() ) { ccPointer->slaveReadTill( slaveReadTill ); } if ( !ccPointer->ok() && ccPointer->c()->tailable() ) { DEV { MONGO_TLOG(0) << "query has no more but tailable, cursorid: " << cursorid << endl; } } if( queryOptions & QueryOption_Exhaust ) { curop.debug().exhaust = true; } // Set attributes for getMore. ccPointer->setCollMetadata( queryResponseBuilder->collMetadata() ); ccPointer->setPos( nReturned ); ccPointer->pq = pq_shared; ccPointer->fields = pq.getFieldPtr(); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). ccPointer->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); ccPointer.release(); }
std::string runQuery(OperationContext* txn, QueryMessage& q, const NamespaceString& nss, CurOp& curop, Message &result) { // Validate the namespace. uassert(16256, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid()); invariant(!nss.isCommand()); // Set curop information. beginQueryOp(nss, q.query, q.ntoreturn, q.ntoskip, &curop); // Parse the qm into a CanonicalQuery. std::auto_ptr<CanonicalQuery> cq; { CanonicalQuery* cqRaw; Status canonStatus = CanonicalQuery::canonicalize(q, &cqRaw, WhereCallbackReal(txn, nss.db())); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } cq.reset(cqRaw); } invariant(cq.get()); LOG(5) << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; // We have a parsed query. Time to get the execution plan for it. std::unique_ptr<PlanExecutor> exec; { PlanExecutor* rawExec; Status execStatus = getExecutorFind(txn, collection, nss, cq.release(), PlanExecutor::YIELD_AUTO, &rawExec); uassertStatusOK(execStatus); exec.reset(rawExec); } const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed(); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // TODO: Does this get overwritten/do we really need to set this twice? 
curop.debug().query = q.query; // Set query result fields. QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(nss.ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.isSlaveOk() || pq.hasReadPref(); Status serveReadsStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, nss, slaveOK); uassertStatusOK(serveReadsStatus); // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; BSONObj obj; PlanExecutor::ExecState state; // uint64_t numMisplacedDocs = 0; // Get summary info about which plan the executor is using. curop.debug().planSummary = Explain::getPlanSummary(exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. 
if (pq.isOplogReplay()) { BSONElement e = obj["ts"]; if (Date == e.type() || bsonTimestamp == e.type()) { slaveReadTill = e.timestamp(); } } if (enoughForFirstBatch(pq, numResults, bb.len())) { LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. if (PlanExecutor::FAILURE == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the // chunk belonged on this node are gone. Separating chunk lifetime management from // ClientCursor should allow this check to go away. if (!shardingState.getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(nss.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(nss.ns())); } // Fill out curop based on query results. If we have a cursorid, we will fill out curop with // this cursorid later. long long ccId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // We won't use the executor until it's getMore'd. exec->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. 
ClientCursor* cc = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), pq.getOptions(), pq.getFilter()); ccId = cc->cursorid(); if (txn->getClient()->isInDirectClient()) { cc->setUnownedRecoveryUnit(txn->recoveryUnit()); } else if (state == PlanExecutor::IS_EOF && pq.isTailable()) { // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their // next getMore. } else { // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. txn->recoveryUnit()->abandonSnapshot(); cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine(); invariant(txn->setRecoveryUnit(storageEngine->newRecoveryUnit(), OperationContext::kNotInUnitOfWork) == OperationContext::kNotInUnitOfWork); } LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // TODO document if (pq.isOplogReplay() && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.isExhaust()) { curop.debug().exhaust = true; } cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); endQueryOp(cc->getExecutor(), dbProfilingLevel, numResults, ccId, &curop); } else { LOG(5) << "Not caching executor but returning " << numResults << " results.\n"; endQueryOp(exec.get(), dbProfilingLevel, numResults, ccId, &curop); } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. 
return curop.debug().exhaust ? nss.ns() : ""; }
/**
 * Called by db/instance.cpp.  This is the getMore entry point.
 *
 * Looks up the cursor identified by 'cursorid', pulls the next batch of results out of its
 * PlanExecutor into a reply buffer, and then either keeps the cursor cached for a later
 * getMore or deletes it.
 *
 * @param txn        operation context; used for locking, interrupt checks, and RecoveryUnit
 *                   swapping with the cursor.
 * @param ns         namespace named by the request.  Must equal the namespace recorded on the
 *                   cursor, otherwise this uasserts with ErrorCodes::Unauthorized.
 * @param ntoreturn  requested batch size; handed to enoughForGetMore() together with the running
 *                   result count and buffer length to decide when to stop filling the batch.
 * @param cursorid   id of the cursor to advance.  The local copy is zeroed (and returned as 0 in
 *                   the reply) when the cursor is not found or is not saved.
 * @param curop      receives the cursor's leftover max-time budget and the cursorExhausted
 *                   debug flag.
 * @param pass       when QueryOption_AwaitData is in use, the caller will make repeated calls
 *                   when this method returns an empty result, incrementing pass on each call.
 *                   Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'.
 * @param exhaust    out-parameter.  Reset to false on entry; set from QueryOption_Exhaust only
 *                   on the path where the cursor is saved.
 * @param isCursorAuthorized  out-parameter, dereferenced unconditionally once a cursor has been
 *                   found and its namespace verified (set to true at that point).  It is never
 *                   written on any other path, so the caller must initialize it.
 *
 * @return a QueryResult::View over the (decoupled) reply buffer, or NULL when a tailable
 *         AwaitData cursor hit EOF with zero results and pass < 1000, which tells the caller to
 *         wait and call again.
 *
 * uasserts if the collection has been dropped (17356), if reads cannot be served by this node,
 * if 'ns' does not match the cursor's namespace, or if the executor reports FAILURE (17406).
 */
QueryResult::View getMore(OperationContext* txn,
                          const char* ns,
                          int ntoreturn,
                          long long cursorid,
                          CurOp& curop,
                          int pass,
                          bool& exhaust,
                          bool* isCursorAuthorized) {

    // For testing, we may want to fail if we receive a getmore.
    if (MONGO_FAIL_POINT(failReceivedGetmore)) {
        invariant(0);
    }

    exhaust = false;

    const NamespaceString nss(ns);

    // Depending on the type of cursor being operated on, we hold locks for the whole getMore,
    // or none of the getMore, or part of the getMore.  The three cases in detail:
    //
    // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore.
    // 2) Cursor owned by global cursor manager: we don't lock anything.  These cursors don't
    //    own any collection state.
    // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and
    //    "unpinCollLock".  This is because agg cursors handle locking internally (hence the
    //    release), but the pin and unpin of the cursor must occur under the collection lock.
    //    We don't use our AutoGetCollectionForRead "ctx" to relock, because
    //    AutoGetCollectionForRead checks the sharding version (and we want the relock for the
    //    unpin to succeed even if the sharding version has changed).
    //
    // Note that we declare our locks before our ClientCursorPin, in order to ensure that the
    // pin's destructor is called before the lock destructors (so that the unpin occurs under
    // the lock).
    boost::scoped_ptr<AutoGetCollectionForRead> ctx;
    boost::scoped_ptr<Lock::DBLock> unpinDBLock;
    boost::scoped_ptr<Lock::CollectionLock> unpinCollLock;

    // Pick the cursor manager: the global one if it owns this id, otherwise the manager of the
    // collection named by 'ns' (which requires taking the collection read lock via "ctx").
    CursorManager* cursorManager;
    CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager();
    if (globalCursorManager->ownsCursorId(cursorid)) {
        cursorManager = globalCursorManager;
    }
    else {
        ctx.reset(new AutoGetCollectionForRead(txn, nss));
        Collection* collection = ctx->getCollection();
        uassert( 17356, "collection dropped between getMore calls", collection );
        cursorManager = collection->getCursorManager();
    }

    LOG(5) << "Running getMore, cursorid: " << cursorid << endl;

    // This checks to make sure the operation is allowed on a replicated node.  Since we are not
    // passing in a query object (necessary to check SlaveOK query option), the only state where
    // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
    // reads are not okay.
    Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(
            txn,
            nss,
            true);
    uassertStatusOK(status);

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    ClientCursorPin ccPin(cursorManager, cursorid);
    ClientCursor* cc = ccPin.c();

    // If we're not being called from DBDirectClient we want to associate the RecoveryUnit
    // used to create the execution machinery inside the cursor with our OperationContext.
    // If we throw or otherwise exit this method in a disorderly fashion, we must ensure
    // that further calls to getMore won't fail, and that the provided OperationContext
    // has a valid RecoveryUnit.  As such, we use RAII to accomplish this.
    //
    // This must be destroyed before the ClientCursor is destroyed.
    std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper;

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;

    int numResults = 0;
    int startingResult = 0;

    // Reserve room for the reply header up front; results are appended after it.
    const int InitialBufSize =
        512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce;

    BufBuilder bb(InitialBufSize);
    bb.skip(sizeof(QueryResult::Value));

    if (NULL == cc) {
        // No such cursor: reply with cursor id 0 and the CursorNotFound flag, no results.
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Check for spoofing of the ns such that it does not match the one originally
        // there for the cursor.
        uassert(ErrorCodes::Unauthorized,
                str::stream() << "Requested getMore on namespace " << ns << ", but cursor "
                              << cursorid << " belongs to namespace " << cc->ns(),
                ns == cc->ns());
        *isCursorAuthorized = true;

        // Restore the RecoveryUnit if we need to.
        if (txn->getClient()->isInDirectClient()) {
            if (cc->hasRecoveryUnit())
                invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit());
        }
        else {
            if (!cc->hasRecoveryUnit()) {
                // Start using a new RecoveryUnit
                cc->setOwnedRecoveryUnit(
                    getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit());
            }
            // Swap RecoveryUnit(s) between the ClientCursor and OperationContext.
            ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn));
        }

        // Reset timeout timer on the cursor since the cursor is still in use.
        cc->setIdleTime(0);

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // Only on the first attempt (not on AwaitData retries).
        if (0 == pass) {
            cc->updateSlaveLocation(txn);
        }

        if (cc->isAggCursor()) {
            // Agg cursors handle their own locking internally.
            ctx.reset(); // unlocks
        }

        // If we're replaying the oplog, we save the last time that we read.
        Timestamp slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        PlanExecutor* exec = cc->getExecutor();
        const int queryOptions = cc->queryOptions();

        // Get results out of the executor.
        exec->restoreState(txn);

        BSONObj obj;
        PlanExecutor::ExecState state;
        while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || bsonTimestamp == e.type()) {
                    slaveReadTill = e.timestamp();
                }
            }

            if (enoughForGetMore(ntoreturn, numResults, bb.len())) {
                break;
            }
        }

        if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) {
            // Propagate this error to caller.
            if (PlanExecutor::FAILURE == state) {
                scoped_ptr<PlanStageStats> stats(exec->getStats());
                error() << "Plan executor error, stats: "
                        << Explain::statsToBSON(*stats);
                uasserted(17406, "getMore executor error: " +
                          WorkingSetCommon::toStatusString(obj));
            }

            // In the old system tailable capped cursors would be killed off at the
            // cursorid level.  If a tailable capped cursor is nuked the cursorid
            // would vanish.
            //
            // In the new system they die and are cleaned up later (or time out).
            // So this is where we get to remove the cursorid.
            if (0 == numResults) {
                resultFlags = ResultFlag_CursorNotFound;
            }
        }

        const bool shouldSaveCursor =
            shouldSaveCursorGetMore(state, exec, isCursorTailable(cc));

        // In order to deregister a cursor, we need to be holding the DB + collection lock and
        // if the cursor is aggregation, we release these locks.
        if (cc->isAggCursor()) {
            invariant(NULL == ctx.get());
            unpinDBLock.reset(new Lock::DBLock(txn->lockState(), nss.db(), MODE_IS));
            unpinCollLock.reset(new Lock::CollectionLock(txn->lockState(), nss.ns(), MODE_IS));
        }

        // Our two possible ClientCursorPin cleanup paths are:
        // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin.
        // 2) If the cursor is going to be saved, we simply let the pin go out of scope.  In
        //    this case, the pin's destructor will be invoked, which will call release() on the
        //    pin.  Because our ClientCursorPin is declared after our lock is declared, this
        //    will happen under the lock.
        if (!shouldSaveCursor) {
            // Drop the swapper first: it must be destroyed before the ClientCursor is.
            ruSwapper.reset();
            ccPin.deleteUnderlying();

            // cc is now invalid, as is the executor
            cursorid = 0;
            cc = NULL;
            curop.debug().cursorExhausted = true;

            LOG(5) << "getMore NOT saving client cursor, ended with state "
                   << PlanExecutor::statestr(state)
                   << endl;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            exec->saveState();
            LOG(5) << "getMore saving client cursor ended with state "
                   << PlanExecutor::statestr(state)
                   << endl;

            if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) {
                if (!txn->getClient()->isInDirectClient()) {
                    // Don't stash the RU. Get a new one on the next getMore.
                    ruSwapper->dismiss();
                }

                if ((queryOptions & QueryOption_AwaitData)
                        && (numResults == 0)
                        && (pass < 1000)) {
                    // Bubble up to the AwaitData handling code in receivedGetMore which will
                    // try again.
                    return NULL;
                }
            }

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the reply header that was skipped over when the buffer was created.
    QueryResult::View qr = bb.buf();
    qr.msgdata().setLen(bb.len());
    qr.msgdata().setOperation(opReply);
    qr.setResultFlags(resultFlags);
    qr.setCursorId(cursorid);
    qr.setStartingFrom(startingResult);
    qr.setNReturned(numResults);
    // Detach the buffer from the builder so the returned view stays usable after 'bb' goes away.
    // NOTE(review): presumably the caller takes ownership of the memory -- confirm.
    bb.decouple();
    LOG(5) << "getMore returned " << numResults << " results\n";
    return qr;
}
/**
 * Handles an insert wire message.
 *
 * Authorizes the write, collects every document carried by the message, and then performs the
 * insert under a database write lock.  The locked section is retried whenever a
 * PageFaultException is thrown from inside it.
 *
 * @param m   the raw insert message.
 * @param op  current-operation record; its debug info receives the namespace and, for a
 *            single-document insert, the inserted count.
 *
 * uasserts 16544 / 16548 on authorization failure and 10058 when this node is not master for
 * the namespace.
 */
void receivedInsert(Message& m, CurOp& op) {
    DbMessage dbMsg(m);
    const char* ns = dbMsg.getns();
    op.debug().ns = ns;

    const bool isIndexWrite = (NamespaceString(ns).coll == "system.indexes");

    if (!isIndexWrite) {
        // Ordinary inserts are authorized once against the target namespace.  Index writes are
        // authorized per document while the batch is gathered below.
        Status authStatus = cc().getAuthorizationManager()->checkAuthForInsert(ns);
        uassert(16544, authStatus.reason(), authStatus.isOK());
    }

    if (!dbMsg.moreJSObjs()) {
        // The message carried no documents at all; silently ignore it.
        return;
    }

    // Gather every document in the message.  The guard above guarantees at least one.
    vector<BSONObj> docs;
    while (dbMsg.moreJSObjs()) {
        docs.push_back(dbMsg.nextJsObj());
        if (!isIndexWrite)
            continue;

        // For an index write, the "ns" field of the spec names the collection being indexed.
        string targetNs = docs.back().getStringField("ns");
        uassert(16548,
                mongoutils::str::stream() << "not authorized to create index on " << targetNs,
                cc().getAuthorizationManager()->checkAuthorization(
                    targetNs, ActionType::ensureIndex));
    }

    PageFaultRetryableSection retrySection;
    for (;;) {
        try {
            Lock::DBWrite writeLock(ns);

            // CONCURRENCY TODO: is being read locked in big log sufficient here?
            // The write lock also serializes writes against replica-set stepdowns.
            uassert(10058, "not master", isMasterNs(ns));

            if (handlePossibleShardedMessage(m, 0)) {
                return;
            }

            Client::Context dbContext(ns);

            if (docs.size() == 1) {
                checkAndInsert(ns, docs[0]);
                globalOpCounters.incInsertInWriteLock(1);
                op.debug().ninserted = 1;
            }
            else {
                const bool continueOnError =
                    (dbMsg.reservedField() & InsertOption_ContinueOnError) != 0;
                insertMulti(continueOnError, ns, docs, op);
            }
            return;
        }
        catch (PageFaultException& fault) {
            // Touch the faulting page, then loop to retry the whole locked section.
            fault.touch();
        }
    }
}
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); curop.debug().ns = ns; curop.debug().ntoreturn = ntoreturn; curop.debug().cursorid = cursorid; shared_ptr<AssertionException> ex; scoped_ptr<Timer> timer; int pass = 0; bool exhaust = false; QueryResult* msgdata = 0; OpTime last; while( 1 ) { bool isCursorAuthorized = false; try { const NamespaceString nsString( ns ); uassert( 16258, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid() ); Status status = cc().getAuthorizationManager()->checkAuthForGetMore(ns); uassert(16543, status.reason(), status.isOK()); if (str::startsWith(ns, "local.oplog.")){ while (MONGO_FAIL_POINT(rsStopGetMore)) { sleepmillis(0); } if (pass == 0) { mutex::scoped_lock lk(OpTime::m); last = OpTime::getLast(lk); } else { last.waitForDifferent(1000/*ms*/); } } msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, &isCursorAuthorized); } catch ( AssertionException& e ) { if ( isCursorAuthorized ) { // If a cursor with id 'cursorid' was authorized, it may have been advanced // before an exception terminated processGetMore. Erase the ClientCursor // because it may now be out of sync with the client's iteration state. // SERVER-7952 // TODO Temporary code, see SERVER-4563 for a cleanup overview. ClientCursor::erase( cursorid ); } ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; break; } if (msgdata == 0) { // this should only happen with QueryOption_AwaitData exhaust = false; massert(13073, "shutting down", !inShutdown() ); if ( ! timer ) { timer.reset( new Timer() ); } else { if ( timer->seconds() >= 4 ) { // after about 4 seconds, return. pass stops at 1000 normally. // we want to return occasionally so slave can checkpoint. 
pass = 10000; } } pass++; if (debug) sleepmillis(20); else sleepmillis(2); // note: the 1100 is beacuse of the waitForDifferent above // should eventually clean this up a bit curop.setExpectedLatencyMs( 1100 + timer->millis() ); continue; } break; }; if (ex) { exhaust = false; BSONObjBuilder err; ex->getInfo().append( err ); BSONObj errObj = err.done(); log() << errObj << endl; curop.debug().exceptionInfo = ex->getInfo(); if (ex->getCode() == 13436) { replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj); curop.debug().responseLength = dbresponse.response->header()->dataLen(); curop.debug().nreturned = 1; return ok; } msgdata = emptyMoreResult(cursorid); } Message *resp = new Message(); resp->setData(msgdata, true); curop.debug().responseLength = resp->header()->dataLen(); curop.debug().nreturned = msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.header()->id; if( exhaust ) { curop.debug().exhaust = true; dbresponse.exhaustNS = ns; } return ok; }
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); curop.debug().ns = ns; curop.debug().ntoreturn = ntoreturn; curop.debug().cursorid = cursorid; time_t start = 0; int pass = 0; bool exhaust = false; QueryResult* msgdata; OpTime last; while( 1 ) { try { Client::ReadContext ctx(ns); if (str::startsWith(ns, "local.oplog.")){ if (pass == 0) last = OpTime::last_inlock(); else last.waitForDifferent(1000/*ms*/); } msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust); } catch ( AssertionException& e ) { exhaust = false; curop.debug().exceptionInfo = e.getInfo(); msgdata = emptyMoreResult(cursorid); ok = false; } if (msgdata == 0) { exhaust = false; massert(13073, "shutting down", !inShutdown() ); if( pass == 0 ) { start = time(0); } else { if( time(0) - start >= 4 ) { // after about 4 seconds, return. pass stops at 1000 normally. // we want to return occasionally so slave can checkpoint. pass = 10000; } } pass++; if (debug) sleepmillis(20); else sleepmillis(2); continue; } break; }; Message *resp = new Message(); resp->setData(msgdata, true); curop.debug().responseLength = resp->header()->dataLen(); curop.debug().nreturned = msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.header()->id; if( exhaust ) { curop.debug().exhaust = true; dbresponse.exhaust = ns; } return ok; }
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); curop.debug().ns = ns; curop.debug().ntoreturn = ntoreturn; curop.debug().cursorid = cursorid; shared_ptr<AssertionException> ex; time_t start = 0; int pass = 0; bool exhaust = false; QueryResult* msgdata = 0; OpTime last; while( 1 ) { try { if (str::startsWith(ns, "local.oplog.")){ if (pass == 0) { mutex::scoped_lock lk(OpTime::m); last = OpTime::getLast(lk); } else { last.waitForDifferent(1000/*ms*/); } } Client::ReadContext ctx(ns); // call this readlocked so state can't change replVerifyReadsOk(); msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust); } catch ( AssertionException& e ) { ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; break; } if (msgdata == 0) { exhaust = false; massert(13073, "shutting down", !inShutdown() ); if( pass == 0 ) { start = time(0); } else { if( time(0) - start >= 4 ) { // after about 4 seconds, return. pass stops at 1000 normally. // we want to return occasionally so slave can checkpoint. 
pass = 10000; } } pass++; if (debug) sleepmillis(20); else sleepmillis(2); continue; } break; }; if (ex) { exhaust = false; BSONObjBuilder err; ex->getInfo().append( err ); BSONObj errObj = err.done(); log() << errObj << endl; curop.debug().exceptionInfo = ex->getInfo(); if (ex->getCode() == 13436) { replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj); curop.debug().responseLength = dbresponse.response->header()->dataLen(); curop.debug().nreturned = 1; return ok; } msgdata = emptyMoreResult(cursorid); } Message *resp = new Message(); resp->setData(msgdata, true); curop.debug().responseLength = resp->header()->dataLen(); curop.debug().nreturned = msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.header()->id; if( exhaust ) { curop.debug().exhaust = true; dbresponse.exhaust = ns; } return ok; }