void receivedUpdate(Message& m, CurOp& op) { DbMessage d(m); const char *ns = d.getns(); op.debug().ns = ns; int flags = d.pullInt(); BSONObj query = d.nextJsObj(); assert( d.moreJSObjs() ); assert( query.objsize() < m.header()->dataLen() ); BSONObj toupdate = d.nextJsObj(); uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize); assert( toupdate.objsize() < m.header()->dataLen() ); assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() ); bool upsert = flags & UpdateOption_Upsert; bool multi = flags & UpdateOption_Multi; bool broadcast = flags & UpdateOption_Broadcast; op.debug().query = query; op.setQuery(query); writelock lk; // writelock is used to synchronize stepdowns w/ writes uassert( 10054 , "not master", isMasterNs( ns ) ); // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) ) return; Client::Context ctx( ns ); UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() ); lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted ); // for getlasterror }
void receivedDelete(Message& m, CurOp& op) { DbMessage d(m); const char *ns = d.getns(); assert(*ns); uassert( 10056 , "not master", isMasterNs( ns ) ); op.debug().str << ns << ' '; int flags = d.pullInt(); bool justOne = flags & RemoveOption_JustOne; bool broadcast = flags & RemoveOption_Broadcast; assert( d.moreJSObjs() ); BSONObj pattern = d.nextJsObj(); { string s = pattern.toString(); op.debug().str << " query: " << s; op.setQuery(pattern); } writelock lk(ns); // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit if ( ! broadcast & handlePossibleShardedMessage( m , 0 ) ) return; Client::Context ctx(ns); long long n = deleteObjects(ns, pattern, justOne, true); lastError.getSafe()->recordDelete( n ); }
void receivedUpdate(Message& m, CurOp& op) { DbMessage d(m); const char *ns = d.getns(); op.debug().ns = ns; int flags = d.pullInt(); BSONObj query = d.nextJsObj(); verify( d.moreJSObjs() ); verify( query.objsize() < m.header()->dataLen() ); BSONObj toupdate = d.nextJsObj(); uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize); verify( toupdate.objsize() < m.header()->dataLen() ); verify( query.objsize() + toupdate.objsize() < m.header()->dataLen() ); bool upsert = flags & UpdateOption_Upsert; bool multi = flags & UpdateOption_Multi; bool broadcast = flags & UpdateOption_Broadcast; Status status = cc().getAuthorizationManager()->checkAuthForUpdate(ns, upsert); uassert(16538, status.reason(), status.isOK()); op.debug().query = query; op.setQuery(query); PageFaultRetryableSection s; while ( 1 ) { try { Lock::DBWrite lk(ns); // void ReplSetImpl::relinquish() uses big write lock so // this is thus synchronized given our lock above. uassert( 10054 , "not master", isMasterNs( ns ) ); // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) ) return; Client::Context ctx( ns ); UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() ); lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted ); // for getlasterror break; } catch ( PageFaultException& e ) { e.touch(); } } }
void receivedUpdate(Message& m, CurOp& op) { DbMessage d(m); const char *ns = d.getns(); op.debug().ns = ns; int flags = d.pullInt(); BSONObj query = d.nextJsObj(); verify(d.moreJSObjs()); verify(query.objsize() < m.header()->dataLen()); const BSONObj updateobj = d.nextJsObj(); uassert(10055, "update object too large", updateobj.objsize() <= BSONObjMaxUserSize); verify(updateobj.objsize() < m.header()->dataLen()); verify(query.objsize() + updateobj.objsize() < m.header()->dataLen()); op.debug().query = query; op.debug().updateobj = updateobj; op.setQuery(query); const bool upsert = flags & UpdateOption_Upsert; const bool multi = flags & UpdateOption_Multi; const bool broadcast = flags & UpdateOption_Broadcast; Status status = cc().getAuthorizationManager()->checkAuthForUpdate(ns, upsert); uassert(16538, status.reason(), status.isOK()); OpSettings settings; settings.setQueryCursorMode(WRITE_LOCK_CURSOR); settings.setJustOne(!multi); cc().setOpSettings(settings); Client::ShardedOperationScope sc; if (!broadcast && sc.handlePossibleShardedMessage(m, 0)) { return; } LOCK_REASON(lockReason, "update"); try { Lock::DBRead lk(ns, lockReason); lockedReceivedUpdate(ns, m, op, updateobj, query, upsert, multi); } catch (RetryWithWriteLock &e) { Lock::DBWrite lk(ns, lockReason); lockedReceivedUpdate(ns, m, op, updateobj, query, upsert, multi); } }
void receivedDelete(Message& m, CurOp& op) { DbMessage d(m); const char *ns = d.getns(); Status status = cc().getAuthorizationManager()->checkAuthForDelete(ns); uassert(16542, status.reason(), status.isOK()); op.debug().ns = ns; int flags = d.pullInt(); verify(d.moreJSObjs()); BSONObj pattern = d.nextJsObj(); op.debug().query = pattern; op.setQuery(pattern); const bool justOne = flags & RemoveOption_JustOne; const bool broadcast = flags & RemoveOption_Broadcast; OpSettings settings; settings.setQueryCursorMode(WRITE_LOCK_CURSOR); settings.setJustOne(justOne); cc().setOpSettings(settings); Client::ShardedOperationScope sc; if (!broadcast && sc.handlePossibleShardedMessage(m, 0)) { return; } LOCK_REASON(lockReason, "delete"); Lock::DBRead lk(ns, lockReason); // writelock is used to synchronize stepdowns w/ writes uassert(10056, "not master", isMasterNs(ns)); Client::Context ctx(ns); long long n; scoped_ptr<Client::AlternateTransactionStack> altStack(opNeedsAltTxn(ns) ? new Client::AlternateTransactionStack : NULL); Client::Transaction transaction(DB_SERIALIZABLE); n = deleteObjects(ns, pattern, justOne, true); transaction.commit(); lastError.getSafe()->recordDelete( n ); op.debug().ndeleted = n; }
void receivedDelete(Message& m, CurOp& op) { DbMessage d(m); const char *ns = d.getns(); Status status = cc().getAuthorizationManager()->checkAuthForDelete(ns); uassert(16542, status.reason(), status.isOK()); op.debug().ns = ns; int flags = d.pullInt(); bool justOne = flags & RemoveOption_JustOne; bool broadcast = flags & RemoveOption_Broadcast; verify( d.moreJSObjs() ); BSONObj pattern = d.nextJsObj(); op.debug().query = pattern; op.setQuery(pattern); PageFaultRetryableSection s; while ( 1 ) { try { Lock::DBWrite lk(ns); // writelock is used to synchronize stepdowns w/ writes uassert( 10056 , "not master", isMasterNs( ns ) ); // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) ) return; Client::Context ctx(ns); long long n = deleteObjects(ns, pattern, justOne, true); lastError.getSafe()->recordDelete( n ); op.debug().ndeleted = n; break; } catch ( PageFaultException& e ) { LOG(2) << "recordDelete got a PageFaultException" << endl; e.touch(); } } }
void receivedUpdate(Message& m, CurOp& op) { DbMessage d(m); const char *ns = d.getns(); assert(*ns); op.debug().str << ns << ' '; int flags = d.pullInt(); BSONObj query = d.nextJsObj(); assert( d.moreJSObjs() ); assert( query.objsize() < m.header()->dataLen() ); BSONObj toupdate = d.nextJsObj(); uassert( 10055 , "update object too large", toupdate.objsize() <= BSONObjMaxUserSize); assert( toupdate.objsize() < m.header()->dataLen() ); assert( query.objsize() + toupdate.objsize() < m.header()->dataLen() ); bool upsert = flags & UpdateOption_Upsert; bool multi = flags & UpdateOption_Multi; bool broadcast = flags & UpdateOption_Broadcast; { string s = query.toString(); /* todo: we shouldn't do all this ss stuff when we don't need it, it will slow us down. instead, let's just story the query BSON in the debug object, and it can toString() lazily */ op.debug().str << " query: " << s; op.setQuery(query); } writelock lk; // writelock is used to synchronize stepdowns w/ writes uassert( 10054 , "not master", isMasterNs( ns ) ); // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) ) return; Client::Context ctx( ns ); UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() ); lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted ); // for getlasterror }
std::string newRunQuery(OperationContext* txn, Message& m, QueryMessage& q, CurOp& curop, Message &result, bool fromDBDirectClient) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder cmdResBuf; if (!runCommands(txn, ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } const NamespaceString nss(q.ns); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize( q, &cq, WhereCallbackReal(txn, StringData(nss.db()))); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } QLOG() << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. PlanExecutor* rawExec = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); AutoGetCollectionForRead ctx(txn, nss); const int dbProfilingLevel = (ctx.getDb() != NULL) ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; Collection* collection = ctx.getCollection(); // We'll now try to get the query executor that will execute this query for us. There // are a few cases in which we know upfront which executor we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we use an EOFStage. // // (b) if the query is a replication's initial sync one, we use a specifically designed // stage that skips extents faster (see details in exec/oplogstart.h). // // Otherwise we go through the selection of which executor is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (NULL != collection && pq.getOptions().oplogReplay) { // Takes ownership of 'cq'. status = getOplogStartHack(txn, collection, cq, &rawExec); } else { size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } // Takes ownership of 'cq'. status = getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, options); } if (!status.isOK()) { // NOTE: Do not access cq as getExecutor has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawExec); auto_ptr<PlanExecutor> exec(rawExec); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; // Set query result fields. QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.getOptions().slaveOk || pq.hasReadPref(); status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, NamespaceString(cq->ns()), slaveOK); uassertStatusOK(status); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the PlanExecutor in a ClientCursor for getMore calls later? bool saveClientCursor = false; BSONObj obj; PlanExecutor::ExecState state; // uint64_t numMisplacedDocs = 0; // Get summary info about which plan the executor is using. curop.debug().planSummary = Explain::getPlanSummary(exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.getOptions().oplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " executor EOF=" << exec->isEOF() << endl; saveClientCursor = !exec->isEOF(); } break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. if (PlanExecutor::EXEC_ERROR == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // Why save a dead executor? if (PlanExecutor::DEAD == state) { saveClientCursor = false; } else if (pq.getOptions().tailable) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. if (collection && collection->numRecords(txn) != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery; const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1); PlanSummaryStats summaryStats; Explain::getSummaryStats(exec.get(), &summaryStats); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; curop.debug().scanAndOrder = summaryStats.hasSortStage; curop.debug().nscanned = summaryStats.totalKeysExamined; curop.debug().nscannedObjects = summaryStats.totalDocsExamined; curop.debug().idhack = summaryStats.isIdhack; // Set debug information for consumption by the profiler. if (dbProfilingLevel > 0 || curop.elapsedMillis() > serverGlobalParams.slowMS || logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) { // Get BSON stats. scoped_ptr<PlanStageStats> execStats(exec->getStats()); BSONObjBuilder statsBob; Explain::statsToBSON(*execStats, &statsBob); curop.debug().execStats.set(statsBob.obj()); // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj. if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) { BSONObjBuilder bob; bob.append("summary", curop.debug().planSummary.toString()); curop.debug().execStats.set(bob.done()); } } long long ccId = 0; if (saveClientCursor) { // We won't use the executor until it's getMore'd. exec->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection, exec.get(), cq->getParsed().getOptions().toInt(), cq->getParsed().getFilter()); ccId = cc->cursorid(); if (fromDBDirectClient) { cc->setUnownedRecoveryUnit(txn->recoveryUnit()); } else if (state == PlanExecutor::IS_EOF && pq.getOptions().tailable) { // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their // next getMore. } else { // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); txn->setRecoveryUnit(storageEngine->newRecoveryUnit(txn)); } QLOG() << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of executor. Release to make sure it's not deleted. exec.release(); // TODO document if (pq.getOptions().oplogReplay && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.getOptions().exhaust) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "Not caching executor but returning " << numResults << " results.\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
/** * Run a query -- includes checking for and running a Command. * @return points to ns if exhaust mode. 0=normal mode * @locks the db mutex for reading (and potentially for writing temporarily to create a new db). * @asserts on scan and order memory exhaustion and other cases. */ string runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) ); ParsedQuery& pq( *pq_shared ); BSONObj jsobj = q.query; int queryOptions = q.queryOptions; const char *ns = q.ns; uassert( 16332 , "can't have an empty ns" , ns[0] ); if( logLevel >= 2 ) log() << "runQuery called " << ns << " " << jsobj << endl; curop.debug().ns = ns; curop.debug().ntoreturn = pq.getNumToReturn(); curop.debug().query = jsobj; curop.setQuery(jsobj); uassert( 16256, str::stream() << "Invalid ns [" << ns << "]", NamespaceString::isValid(ns) ); // Run a command. if ( pq.couldBeCommand() ) { curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult)); BSONObjBuilder cmdResBuf; if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) { curop.debug().iscommand = true; curop.debug().query = jsobj; auto_ptr< QueryResult > qr; qr.reset( (QueryResult *) bb.buf() ); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; result.setData( qr.release(), true ); } else { uasserted(13530, "bad or malformed command request?"); } return ""; } const bool explain = pq.isExplain(); const bool tailable = pq.hasOption(QueryOption_CursorTailable); BSONObj order = pq.getOrder(); BSONObj query = pq.getFilter(); /* The ElemIter will not be happy if this isn't really an object. So throw exception here when that is true. (Which may indicate bad data from client.) */ if ( query.objsize() == 0 ) { out() << "Bad query object?\n jsobj:"; out() << jsobj.toString() << "\n query:"; out() << query.toString() << endl; uassert( 10110 , "bad query object", false); } // Tailable cursors need to read newly written entries from the tail // of the collection. They manually arbitrate with the collection over // what data is readable and when, so we choose read uncommited isolation. OpSettings settings; settings.setQueryCursorMode(DEFAULT_LOCK_CURSOR); settings.setBulkFetch(true); settings.setCappedAppendPK(pq.hasOption(QueryOption_AddHiddenPK)); cc().setOpSettings(settings); // If our caller has a transaction, it's multi-statement. const bool inMultiStatementTxn = cc().hasTxn(); if (tailable) { // Because it's easier to disable this. It shouldn't be happening in a normal system. uassert(16812, "May not perform a tailable query in a multi-statement transaction.", !inMultiStatementTxn); } // Begin a read-only, snapshot transaction under normal circumstances. // If the cursor is tailable, we need to be able to read uncommitted data. const int txnFlags = (tailable ? DB_READ_UNCOMMITTED : DB_TXN_SNAPSHOT) | DB_TXN_READ_ONLY; LOCK_REASON(lockReason, "query"); Client::ReadContext ctx(ns, lockReason); scoped_ptr<Client::Transaction> transaction(!inMultiStatementTxn ? new Client::Transaction(txnFlags) : NULL); bool hasRetried = false; while ( 1 ) { try { replVerifyReadsOk(&pq); // Fast-path for primary key queries. if (!explain && !tailable) { replVerifyReadsOk(&pq); if (_tryQueryByPKHack(ns, query, pq, curop, result)) { if (transaction) { transaction->commit(); } return ""; } } // sanity check the query and projection if (pq.getFields() != NULL) { pq.getFields()->validateQuery( query ); } if (tailable) { Collection *cl = getCollection( ns ); if (cl != NULL && !(cl->isCapped() || str::equals(ns, rsoplog))) { uasserted( 13051, "tailable cursor requested on non-capped, non-oplog collection" ); } const BSONObj nat1 = BSON( "$natural" << 1 ); if ( order.isEmpty() ) { order = nat1; } else { uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 ); } } // Run a regular query. // these now may stored in a ClientCursor or somewhere else, // so make sure we use a real copy jsobj = jsobj.getOwned(); query = query.getOwned(); order = order.getOwned(); const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns ); const bool getCachedExplainPlan = ! hasRetried && explain && ! pq.hasIndexSpecifier(); const bool savedCursor = queryWithQueryOptimizer( queryOptions, ns, jsobj, curop, query, order, pq_shared, shardingVersionAtStart, getCachedExplainPlan, inMultiStatementTxn, result ); // Did not save the cursor, so we can commit the transaction now if it exists. if (transaction && !savedCursor) { transaction->commit(); } return curop.debug().exhaust ? ns : ""; } catch ( const QueryRetryException & ) { // In some cases the query may be retried if there is an in memory sort size assertion. verify( ! hasRetried ); hasRetried = true; } } }
std::string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult)); BSONObjBuilder cmdResBuf; if (!runCommands(ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; result.setData(qr, true); return ""; } // This is a read lock. We require this because if we're parsing a $where, the // where-specific parsing code assumes we have a lock and creates execution machinery that // requires it. Client::ReadContext ctx(q.ns); Collection* collection = ctx.ctx().db()->getCollection( ns ); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize(q, &cq); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } verify(cq); QLOG() << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that's is so (a runner) which doesn't return results, the EOFRunner. // // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (collection == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(collection, cq, &rawRunner); } else { // Takes ownership of cq. size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { // NOTE: Do not access cq as getRunner has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. auto_ptr<ScopedRunnerRegistration> safety(new ScopedRunnerRegistration(runner.get())); runner->setYieldPolicy(Runner::YIELD_AUTO); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); // Have we retrieved info about which plan the runner will // use to execute the query yet? bool gotPlanInfo = false; PlanInfo* rawInfo; boost::scoped_ptr<PlanInfo> planInfo; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // In the case of the multi plan runner, we may not be able to // successfully retrieve plan info until after the query starts // to run. This is because the multi plan runner doesn't know what // plan it will end up using until it runs candidates and selects // the best. // // TODO: Do we ever want to output what the MPR is comparing? if (!gotPlanInfo) { Status infoStatus = runner->getInfo(NULL, &rawInfo); if (infoStatus.isOK()) { gotPlanInfo = true; planInfo.reset(rawInfo); // planSummary is really a ThreadSafeString which copies the data from // the provided pointer. curop.debug().planSummary = planInfo->planSummary.c_str(); } } // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // Try to get information about the plan which the runner // will use to execute the query, it we don't have it already. if (!gotPlanInfo) { Status infoStatus = runner->getInfo(NULL, &rawInfo); if (infoStatus.isOK()) { gotPlanInfo = true; planInfo.reset(rawInfo); // planSummary is really a ThreadSafeString which copies the data from // the provided pointer. curop.debug().planSummary = planInfo->planSummary.c_str(); } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases. if (Runner::RUNNER_ERROR == state) { TypeExplain* bareExplain; Status res = runner->getInfo(&bareExplain, NULL); if (res.isOK()) { boost::scoped_ptr<TypeExplain> errorExplain(bareExplain); error() << "Runner error, stats:\n" << errorExplain->stats.jsonString(Strict, true); } uasserted(17144, "Runner error: " + WorkingSetCommon::toStatusString(obj)); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Used to fill in explain and to determine if the query is slow enough to be logged. int elapsedMillis = curop.elapsedMillis(); // Get explain information if: // 1) it is needed by an explain query; // 2) profiling is enabled; or // 3) profiling is disabled but we still need explain details to log a "slow" query. // Producing explain information is expensive and should be done only if we are certain // the information will be used. boost::scoped_ptr<TypeExplain> explain(NULL); if (isExplain || ctx.ctx().db()->getProfilingLevel() > 0 || elapsedMillis > serverGlobalParams.slowMS) { // Ask the runner to produce explain information. TypeExplain* bareExplain; Status res = runner->getInfo(&bareExplain, NULL); if (res.isOK()) { explain.reset(bareExplain); } else if (isExplain) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } } // Fill in the missing run-time fields in explain, starting with propeties of // the process running the query. if (isExplain && NULL != explain.get()) { std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(elapsedMillis); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection, runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "Not caching runner but returning " << numResults << " results.\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; // Set debug information for consumption by the profiler. curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; if (NULL != explain.get()) { if (explain->isScanAndOrderSet()) { curop.debug().scanAndOrder = explain->getScanAndOrder(); } else { curop.debug().scanAndOrder = false; } if (explain->isNScannedSet()) { curop.debug().nscanned = explain->getNScanned(); } if (explain->isNScannedObjectsSet()) { curop.debug().nscannedObjects = explain->getNScannedObjects(); } if (explain->isIDHackSet()) { curop.debug().idhack = explain->getIDHack(); } if (!explain->stats.isEmpty()) { // execStats is a CachedBSONObj because it lives in the race-prone // curop. curop.debug().execStats.set(explain->stats); // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj. if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) { BSONObjBuilder bob; bob.append("summary", curop.debug().planSummary.toString()); curop.debug().execStats.set(bob.done()); } } } // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }