void run() { Client::WriteContext ctx(&_txn, ns()); insert(BSON("a" << 1 << "b" << 1)); Collection* collection = ctx.ctx().db()->getCollection(&_txn, ns()); BSONObj filterObj = fromjson("{_id: {$gt: 0}, b: {$gt: 0}}"); SingleSolutionRunner* ssr = makeCollScanRunner(ctx.ctx(),filterObj); // Make a client cursor from the runner. ClientCursor* cc = new ClientCursor(collection, ssr, 0, BSONObj()); ClientCursorPin ccPin(collection,cc->cursorid()); // If the cursor is pinned, it sticks around, // even after invalidation. ASSERT_EQUALS(1U, numCursors()); collection->cursorCache()->invalidateAll(false); ASSERT_EQUALS(1U, numCursors()); // The invalidation should have killed the runner. BSONObj objOut; ASSERT_EQUALS(Runner::RUNNER_DEAD, ssr->getNext(&objOut, NULL)); // Deleting the underlying cursor should cause the // number of cursors to return to 0. ccPin.deleteUnderlying(); ASSERT_EQUALS(0U, numCursors()); }
void run() { OldClientWriteContext ctx(&_txn, nss.ns()); insert(BSON("a" << 1 << "b" << 1)); Collection* collection = ctx.getCollection(); BSONObj filterObj = fromjson("{_id: {$gt: 0}, b: {$gt: 0}}"); PlanExecutor* exec = makeCollScanExec(collection, filterObj); // Make a client cursor from the runner. ClientCursor* cc = new ClientCursor(collection->getCursorManager(), exec, nss.ns(), false, 0, BSONObj()); ClientCursorPin ccPin(collection->getCursorManager(), cc->cursorid()); // If the cursor is pinned, it sticks around, // even after invalidation. ASSERT_EQUALS(1U, numCursors()); const std::string invalidateReason("InvalidatePinned Test"); collection->getCursorManager()->invalidateAll(false, invalidateReason); ASSERT_EQUALS(1U, numCursors()); // The invalidation should have killed the runner. BSONObj objOut; ASSERT_EQUALS(PlanExecutor::DEAD, exec->getNext(&objOut, NULL)); ASSERT(WorkingSetCommon::isValidStatusMemberObject(objOut)); const Status status = WorkingSetCommon::getMemberObjectStatus(objOut); ASSERT(status.reason().find(invalidateReason) != string::npos); // Deleting the underlying cursor should cause the // number of cursors to return to 0. ccPin.deleteUnderlying(); ASSERT_EQUALS(0U, numCursors()); }
virtual bool run(const string &db, BSONObj &cmdObj, int options, string &errmsg, BSONObjBuilder &result, bool fromRepl) { string ns = parseNs(db, cmdObj); intrusive_ptr<ExpressionContext> pCtx = new ExpressionContext(InterruptStatusMongod::status, NamespaceString(ns)); /* try to parse the command; if this fails, then we didn't run */ intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx); if (!pPipeline.get()) return false; if (pPipeline->getSplitMongodPipeline()) { // This is only used in testing return executeSplitPipeline(result, errmsg, ns, db, pPipeline, pCtx); } #if _DEBUG // This is outside of the if block to keep the object alive until the pipeline is finished. BSONObj parsed; if (!pPipeline->isExplain() && !pCtx->inShard) { // Make sure all operations round-trip through Pipeline::toBson() // correctly by reparsing every command on DEBUG builds. This is // important because sharded aggregations rely on this ability. // Skipping when inShard because this has already been through the // transformation (and this unsets pCtx->inShard). parsed = pPipeline->serialize().toBson(); pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx); verify(pPipeline); } #endif // This does the mongod-specific stuff like creating a cursor PipelineD::prepareCursorSource(pPipeline, nsToDatabase(ns), pCtx); pPipeline->stitch(); if (isCursorCommand(cmdObj)) { CursorId id; { // Set up cursor Client::ReadContext ctx(ns); shared_ptr<Cursor> cursor(new PipelineCursor(pPipeline)); // cc will be owned by cursor manager ClientCursor* cc = new ClientCursor(0, cursor, ns, cmdObj.getOwned()); id = cc->cursorid(); } handleCursorCommand(id, cmdObj, result); } else { pPipeline->run(result); } if (DocumentSourceOut* out = dynamic_cast<DocumentSourceOut*>(pPipeline->output())) { result.append("outputNs", out->getOutputNs()); } return true; }
virtual bool run(const string &db, BSONObj &cmdObj, int options, string &errmsg, BSONObjBuilder &result, bool fromRepl) { string ns = parseNs(db, cmdObj); intrusive_ptr<ExpressionContext> pCtx = new ExpressionContext(InterruptStatusMongod::status, NamespaceString(ns)); pCtx->tempDir = storageGlobalParams.dbpath + "/_tmp"; /* try to parse the command; if this fails, then we didn't run */ intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx); if (!pPipeline.get()) return false; #if _DEBUG // This is outside of the if block to keep the object alive until the pipeline is finished. BSONObj parsed; if (!pPipeline->isExplain() && !pCtx->inShard) { // Make sure all operations round-trip through Pipeline::toBson() // correctly by reparsing every command on DEBUG builds. This is // important because sharded aggregations rely on this ability. // Skipping when inShard because this has already been through the // transformation (and this unsets pCtx->inShard). parsed = pPipeline->serialize().toBson(); pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx); verify(pPipeline); } #endif // This does the mongod-specific stuff like creating a cursor PipelineD::prepareCursorSource(pPipeline, pCtx); pPipeline->stitch(); if (pPipeline->isExplain()) { result << "stages" << Value(pPipeline->writeExplainOps()); return true; // don't do any actual execution } if (isCursorCommand(cmdObj)) { CursorId id; { // Set up cursor Client::ReadContext ctx(ns); ClientCursor* cc = new ClientCursor(new PipelineRunner(pPipeline)); cc->isAggCursor = true; // enable special locking and ns deletion behavior id = cc->cursorid(); } handleCursorCommand(id, cmdObj, result); } else { pPipeline->run(result); } return true; }
void ClientCursor::idleTimeReport(unsigned millis) { bool foundSomeToTimeout = false; // two passes so that we don't need to readlock unless we really do some timeouts // we assume here that incrementing _idleAgeMillis outside readlock is ok. { recursive_scoped_lock lock(ccmutex); { unsigned sz = clientCursorsById.size(); static time_t last; if( sz >= 100000 ) { if( time(0) - last > 300 ) { last = time(0); log() << "warning number of open cursors is very large: " << sz << endl; } } } for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) { CCById::iterator j = i; i++; if( j->second->shouldTimeout( millis ) ) { foundSomeToTimeout = true; } } } if( foundSomeToTimeout ) { Lock::GlobalRead lk; recursive_scoped_lock cclock(ccmutex); CCById::const_iterator it = clientCursorsById.begin(); while (it != clientCursorsById.end()) { ClientCursor* cc = it->second; if( cc->shouldTimeout(0) ) { numberTimedOut++; LOG(1) << "killing old cursor " << cc->_cursorid << ' ' << cc->_ns << " idle:" << cc->idleTime() << "ms\n"; ClientCursor* toDelete = it->second; CursorId id = toDelete->cursorid(); // This is what winds up removing it from the map. delete toDelete; it = clientCursorsById.upper_bound(id); } else { ++it; } } } }
virtual bool run(OperationContext* txn, const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result) { const std::string ns = parseNs(db, cmdObj); if (nsToCollectionSubstring(ns).empty()) { errmsg = "missing collection name"; return false; } NamespaceString nss(ns); // Parse the options for this request. auto request = AggregationRequest::parseFromBSON(nss, cmdObj); if (!request.isOK()) { return appendCommandStatus(result, request.getStatus()); } // Set up the ExpressionContext. intrusive_ptr<ExpressionContext> expCtx = new ExpressionContext(txn, request.getValue()); expCtx->tempDir = storageGlobalParams.dbpath + "/_tmp"; // Parse the pipeline. auto statusWithPipeline = Pipeline::parse(request.getValue().getPipeline(), expCtx); if (!statusWithPipeline.isOK()) { return appendCommandStatus(result, statusWithPipeline.getStatus()); } auto pipeline = std::move(statusWithPipeline.getValue()); auto resolvedNamespaces = resolveInvolvedNamespaces(txn, pipeline, expCtx); if (!resolvedNamespaces.isOK()) { return appendCommandStatus(result, resolvedNamespaces.getStatus()); } expCtx->resolvedNamespaces = std::move(resolvedNamespaces.getValue()); unique_ptr<ClientCursorPin> pin; // either this OR the exec will be non-null unique_ptr<PlanExecutor> exec; auto curOp = CurOp::get(txn); { // This will throw if the sharding version for this connection is out of date. If the // namespace is a view, the lock will be released before re-running the aggregation. // Otherwise, the lock must be held continuously from now until we have we created both // the output ClientCursor and the input executor. This ensures that both are using the // same sharding version that we synchronize on here. This is also why we always need to // create a ClientCursor even when we aren't outputting to a cursor. See the comment on // ShardFilterStage for more details. AutoGetCollectionOrViewForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); // If running $collStats on a view, we do not resolve the view since we want stats // on this view namespace. auto startsWithCollStats = [&pipeline]() { const Pipeline::SourceContainer& sources = pipeline->getSources(); return !sources.empty() && dynamic_cast<DocumentSourceCollStats*>(sources.front().get()); }; // If this is a view, resolve it by finding the underlying collection and stitching view // pipelines and this request's pipeline together. We then release our locks before // recursively calling run, which will re-acquire locks on the underlying collection. // (The lock must be released because recursively acquiring locks on the database will // prohibit yielding.) auto view = ctx.getView(); if (view && !startsWithCollStats()) { auto viewDefinition = ViewShardingCheck::getResolvedViewIfSharded(txn, ctx.getDb(), view); if (!viewDefinition.isOK()) { return appendCommandStatus(result, viewDefinition.getStatus()); } if (!viewDefinition.getValue().isEmpty()) { ViewShardingCheck::appendShardedViewStatus(viewDefinition.getValue(), &result); return false; } auto resolvedView = ctx.getDb()->getViewCatalog()->resolveView(txn, nss); if (!resolvedView.isOK()) { return appendCommandStatus(result, resolvedView.getStatus()); } // With the view resolved, we can relinquish locks. ctx.releaseLocksForView(); // Parse the resolved view into a new aggregation request. auto viewCmd = resolvedView.getValue().asExpandedViewAggregation(request.getValue()); if (!viewCmd.isOK()) { return appendCommandStatus(result, viewCmd.getStatus()); } bool status = this->run(txn, db, viewCmd.getValue(), options, errmsg, result); { // Set the namespace of the curop back to the view namespace so ctx records // stats on this view namespace on destruction. stdx::lock_guard<Client>(*txn->getClient()); curOp->setNS_inlock(nss.ns()); } return status; } // If the pipeline does not have a user-specified collation, set it from the collection // default. if (request.getValue().getCollation().isEmpty() && collection && collection->getDefaultCollator()) { invariant(!expCtx->getCollator()); expCtx->setCollator(collection->getDefaultCollator()->clone()); } // Propagate the ExpressionContext throughout all of the pipeline's stages and // expressions. pipeline->injectExpressionContext(expCtx); // The pipeline must be optimized after the correct collator has been set on it (by // injecting the ExpressionContext containing the collator). This is necessary because // optimization may make string comparisons, e.g. optimizing {$eq: [<str1>, <str2>]} to // a constant. pipeline->optimizePipeline(); if (kDebugBuild && !expCtx->isExplain && !expCtx->inShard) { // Make sure all operations round-trip through Pipeline::serialize() correctly by // re-parsing every command in debug builds. This is important because sharded // aggregations rely on this ability. Skipping when inShard because this has // already been through the transformation (and this un-sets expCtx->inShard). pipeline = reparsePipeline(pipeline, request.getValue(), expCtx); } // This does mongod-specific stuff like creating the input PlanExecutor and adding // it to the front of the pipeline if needed. PipelineD::prepareCursorSource(collection, pipeline); // Create the PlanExecutor which returns results from the pipeline. The WorkingSet // ('ws') and the PipelineProxyStage ('proxy') will be owned by the created // PlanExecutor. auto ws = make_unique<WorkingSet>(); auto proxy = make_unique<PipelineProxyStage>(txn, pipeline, ws.get()); auto statusWithPlanExecutor = (NULL == collection) ? PlanExecutor::make( txn, std::move(ws), std::move(proxy), nss.ns(), PlanExecutor::YIELD_MANUAL) : PlanExecutor::make( txn, std::move(ws), std::move(proxy), collection, PlanExecutor::YIELD_MANUAL); invariant(statusWithPlanExecutor.isOK()); exec = std::move(statusWithPlanExecutor.getValue()); { auto planSummary = Explain::getPlanSummary(exec.get()); stdx::lock_guard<Client>(*txn->getClient()); curOp->setPlanSummary_inlock(std::move(planSummary)); } if (collection) { PlanSummaryStats stats; Explain::getSummaryStats(*exec, &stats); collection->infoCache()->notifyOfQuery(txn, stats.indexesUsed); } if (collection) { const bool isAggCursor = true; // enable special locking behavior ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), 0, cmdObj.getOwned(), isAggCursor); pin.reset(new ClientCursorPin(collection->getCursorManager(), cursor->cursorid())); // Don't add any code between here and the start of the try block. } // At this point, it is safe to release the collection lock. // - In the case where we have a collection: we will need to reacquire the // collection lock later when cleaning up our ClientCursorPin. // - In the case where we don't have a collection: our PlanExecutor won't be // registered, so it will be safe to clean it up outside the lock. invariant(!exec || !collection); } try { // Unless set to true, the ClientCursor created above will be deleted on block exit. bool keepCursor = false; // Use of the aggregate command without specifying to use a cursor is deprecated. // Applications should migrate to using cursors. Cursors are strictly more useful than // outputting the results as a single document, since results that fit inside a single // BSONObj will also fit inside a single batch. // // We occasionally log a deprecation warning. if (!request.getValue().isCursorCommand()) { RARELY { warning() << "Use of the aggregate command without the 'cursor' " "option is deprecated. See " "http://dochub.mongodb.org/core/aggregate-without-cursor-deprecation."; } } // If both explain and cursor are specified, explain wins. if (expCtx->isExplain) { result << "stages" << Value(pipeline->writeExplainOps()); } else if (request.getValue().isCursorCommand()) { keepCursor = handleCursorCommand(txn, nss.ns(), pin.get(), pin ? pin->c()->getExecutor() : exec.get(), request.getValue(), result); } else { pipeline->run(result); } if (!expCtx->isExplain) { PlanSummaryStats stats; Explain::getSummaryStats(pin ? *pin->c()->getExecutor() : *exec.get(), &stats); curOp->debug().setPlanSummaryMetrics(stats); curOp->debug().nreturned = stats.nReturned; } // Clean up our ClientCursorPin, if needed. We must reacquire the collection lock // in order to do so. if (pin) { // We acquire locks here with DBLock and CollectionLock instead of using // AutoGetCollectionForRead. AutoGetCollectionForRead will throw if the // sharding version is out of date, and we don't care if the sharding version // has changed. Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS); if (keepCursor) { pin->release(); } else { pin->deleteUnderlying(); } } } catch (...) {
bool run(OperationContext* txn, const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool /*fromRepl*/) { boost::scoped_ptr<MatchExpression> matcher; BSONElement filterElt = jsobj["filter"]; if (!filterElt.eoo()) { if (filterElt.type() != mongo::Object) { return appendCommandStatus(result, Status(ErrorCodes::TypeMismatch, str::stream() << "\"filter\" must be of type Object, not " << typeName(filterElt.type()))); } StatusWithMatchExpression statusWithMatcher = MatchExpressionParser::parse(filterElt.Obj()); if (!statusWithMatcher.isOK()) { return appendCommandStatus(result, statusWithMatcher.getStatus()); } matcher.reset(statusWithMatcher.getValue()); } const long long defaultBatchSize = std::numeric_limits<long long>::max(); long long batchSize; Status parseCursorStatus = parseCommandCursorOptions(jsobj, defaultBatchSize, &batchSize); if (!parseCursorStatus.isOK()) { return appendCommandStatus(result, parseCursorStatus); } ScopedTransaction scopedXact(txn, MODE_IS); AutoGetDb autoDb(txn, dbname, MODE_S); const Database* d = autoDb.getDb(); const DatabaseCatalogEntry* dbEntry = NULL; list<string> names; if (d) { dbEntry = d->getDatabaseCatalogEntry(); dbEntry->getCollectionNamespaces(&names); names.sort(); } std::auto_ptr<WorkingSet> ws(new WorkingSet()); std::auto_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get())); for (std::list<std::string>::const_iterator i = names.begin(); i != names.end(); ++i) { const std::string& ns = *i; StringData collection = nsToCollectionSubstring(ns); if (collection == "system.namespaces") { continue; } BSONObjBuilder b; b.append("name", collection); CollectionOptions options = dbEntry->getCollectionCatalogEntry(ns)->getCollectionOptions(txn); b.append("options", options.toBSON()); BSONObj maybe = b.obj(); if (matcher && !matcher->matchesBSON(maybe)) { continue; } WorkingSetMember member; member.state = WorkingSetMember::OWNED_OBJ; member.keyData.clear(); member.loc = RecordId(); member.obj = Snapshotted<BSONObj>(SnapshotId(), maybe); root->pushBack(member); } std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name; dassert(NamespaceString(cursorNamespace).isValid()); dassert(NamespaceString(cursorNamespace).isListCollectionsGetMore()); PlanExecutor* rawExec; Status makeStatus = PlanExecutor::make(txn, ws.release(), root.release(), cursorNamespace, PlanExecutor::YIELD_MANUAL, &rawExec); std::auto_ptr<PlanExecutor> exec(rawExec); if (!makeStatus.isOK()) { return appendCommandStatus(result, makeStatus); } BSONArrayBuilder firstBatch; const int byteLimit = MaxBytesToReturnToClientAtOnce; for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit; objCount++) { BSONObj next; PlanExecutor::ExecState state = exec->getNext(&next, NULL); if (state == PlanExecutor::IS_EOF) { break; } invariant(state == PlanExecutor::ADVANCED); firstBatch.append(next); } CursorId cursorId = 0LL; if (!exec->isEOF()) { exec->saveState(); ClientCursor* cursor = new ClientCursor( CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace); cursorId = cursor->cursorid(); } Command::appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result); return true; }
/** * Runs a query using the following steps: * 1) Parsing. * 2) Acquire locks. * 3) Plan query, obtaining an executor that can run it. * 4) Setup a cursor for the query, which may be used on subsequent getMores. * 5) Generate the first batch. * 6) Save state for getMore. * 7) Generate response to send to the client. * * TODO: Rather than using the sharding version available in thread-local storage (i.e. the * call to ShardingState::needCollectionMetadata() below), shard version information * should be passed as part of the command parameter. */ bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { const std::string fullns = parseNs(dbname, cmdObj); const NamespaceString nss(fullns); if (!nss.isValid()) { return appendCommandStatus(result, {ErrorCodes::InvalidNamespace, str::stream() << "Invalid collection name: " << nss.ns()}); } // Although it is a command, a find command gets counted as a query. globalOpCounters.gotQuery(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus( result, Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()")); } // 1a) Parse the command BSON to a LiteParsedQuery. const bool isExplain = false; auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain); if (!lpqStatus.isOK()) { return appendCommandStatus(result, lpqStatus.getStatus()); } auto& lpq = lpqStatus.getValue(); // Validate term, if provided. if (auto term = lpq->getReplicationTerm()) { auto replCoord = repl::ReplicationCoordinator::get(txn); Status status = replCoord->updateTerm(*term); // Note: updateTerm returns ok if term stayed the same. if (!status.isOK()) { return appendCommandStatus(result, status); } } // Fill out curop information. long long ntoreturn = lpq->getBatchSize().value_or(0); beginQueryOp(txn, nss, cmdObj, ntoreturn, lpq->getSkip()); // 1b) Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery. WhereCallbackReal whereCallback(txn, nss.db()); auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), whereCallback); if (!statusWithCQ.isOK()) { return appendCommandStatus(result, statusWithCQ.getStatus()); } std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); // 2) Acquire locks. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; ShardingState* const shardingState = ShardingState::get(txn); // It is possible that the sharding version will change during yield while we are // retrieving a plan executor. If this happens we will throw an error and mongos will // retry. const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns()); // 3) Get the execution plan for the query. auto statusWithPlanExecutor = getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); // TODO: Currently, chunk ranges are kept around until all ClientCursors created while // the chunk belonged on this node are gone. Separating chunk lifetime management from // ClientCursor should allow this check to go away. if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // Version changed while retrieving a PlanExecutor. Terminate the operation, // signaling that mongos should retry. throw SendStaleConfigException(nss.ns(), "version changed during find command", shardingVersionAtStart, shardingState->getVersion(nss.ns())); } if (!collection) { // No collection. Just fill out curop indicating that there were zero results and // there is no ClientCursor id, and then return. const long long numResults = 0; const CursorId cursorId = 0; endQueryOp(txn, *exec, dbProfilingLevel, numResults, cursorId); appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result); return true; } const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed(); // 4) If possible, register the execution plan inside a ClientCursor, and pin that // cursor. In this case, ownership of the PlanExecutor is transferred to the // ClientCursor, and 'exec' becomes null. // // First unregister the PlanExecutor so it can be re-registered with ClientCursor. exec->deregisterExec(); // Create a ClientCursor containing this plan executor. We don't have to worry // about leaking it as it's inserted into a global map by its ctor. ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), pq.getOptions(), pq.getFilter()); CursorId cursorId = cursor->cursorid(); ClientCursorPin ccPin(collection->getCursorManager(), cursorId); // On early return, get rid of the the cursor. ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin); invariant(!exec); PlanExecutor* cursorExec = cursor->getExecutor(); // 5) Stream query results, adding them to a BSONArray as we go. BSONArrayBuilder firstBatch; BSONObj obj; PlanExecutor::ExecState state; long long numResults = 0; while (!enoughForFirstBatch(pq, numResults, firstBatch.len()) && PlanExecutor::ADVANCED == (state = cursorExec->getNext(&obj, NULL))) { // If adding this object will cause us to exceed the BSON size limit, then we stash // it for later. if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) { cursorExec->enqueue(obj); break; } // Add result to output buffer. firstBatch.append(obj); numResults++; } // Throw an assertion if query execution fails for any reason. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { const std::unique_ptr<PlanStageStats> stats(cursorExec->getStats()); error() << "Plan executor error during find command: " << PlanExecutor::statestr(state) << ", stats: " << Explain::statsToBSON(*stats); return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, str::stream() << "Executor error during find command: " << WorkingSetCommon::toStatusString(obj))); } // 6) Set up the cursor for getMore. if (shouldSaveCursor(txn, collection, state, cursorExec)) { // State will be restored on getMore. cursorExec->saveState(); cursorExec->detachFromOperationContext(); cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros()); cursor->setPos(numResults); } else { cursorId = 0; } // Fill out curop based on the results. endQueryOp(txn, *cursorExec, dbProfilingLevel, numResults, cursorId); // 7) Generate the response object to send to the client. appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result); if (cursorId) { cursorFreer.Dismiss(); } return true; }
virtual bool run( const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) { NamespaceString ns( dbname, cmdObj[name].String() ); Client::ReadContext ctx(ns.ns()); Database* db = ctx.ctx().db(); Collection* collection = db->getCollection( ns ); if ( !collection ) return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, str::stream() << "ns does not exist: " << ns.ns() ) ); size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() ); if ( numCursors == 0 || numCursors > 10000 ) return appendCommandStatus( result, Status( ErrorCodes::BadValue, str::stream() << "numCursors has to be between 1 and 10000" << " was: " << numCursors ) ); OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators()); if (iterators.size() < numCursors) { numCursors = iterators.size(); } OwnedPointerVector<MultiIteratorRunner> runners; for ( size_t i = 0; i < numCursors; i++ ) { runners.push_back(new MultiIteratorRunner(ns.ns(), collection)); } // transfer iterators to runners using a round-robin distribution. // TODO consider using a common work queue once invalidation issues go away. for (size_t i = 0; i < iterators.size(); i++) { runners[i % runners.size()]->addIterator(iterators.releaseAt(i)); } { BSONArrayBuilder bucketsBuilder; for (size_t i = 0; i < runners.size(); i++) { // transfer ownership of a runner to the ClientCursor (which manages its own // lifetime). ClientCursor* cc = new ClientCursor( collection, runners.releaseAt(i) ); // we are mimicking the aggregation cursor output here // that is why there are ns, ok and empty firstBatch BSONObjBuilder threadResult; { BSONObjBuilder cursor; cursor.appendArray( "firstBatch", BSONObj() ); cursor.append( "ns", ns ); cursor.append( "id", cc->cursorid() ); threadResult.append( "cursor", cursor.obj() ); } threadResult.appendBool( "ok", 1 ); bucketsBuilder.append( threadResult.obj() ); } result.appendArray( "cursors", bucketsBuilder.obj() ); } return true; }
virtual bool run(OperationContext* txn, const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result) { const std::string ns = parseNs(db, cmdObj); if (nsToCollectionSubstring(ns).empty()) { errmsg = "missing collection name"; return false; } NamespaceString nss(ns); intrusive_ptr<ExpressionContext> pCtx = new ExpressionContext(txn, nss); pCtx->tempDir = storageGlobalParams.dbpath + "/_tmp"; /* try to parse the command; if this fails, then we didn't run */ intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx); if (!pPipeline.get()) return false; // Save and reset the write concern so that it doesn't get changed accidentally by // DBDirectClient. auto oldWC = txn->getWriteConcern(); ON_BLOCK_EXIT([txn, oldWC] { txn->setWriteConcern(oldWC); }); // This is outside of the if block to keep the object alive until the pipeline is finished. BSONObj parsed; if (kDebugBuild && !pPipeline->isExplain() && !pCtx->inShard) { // Make sure all operations round-trip through Pipeline::toBson() correctly by // reparsing every command in debug builds. This is important because sharded // aggregations rely on this ability. Skipping when inShard because this has // already been through the transformation (and this unsets pCtx->inShard). parsed = pPipeline->serialize().toBson(); pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx); verify(pPipeline); } unique_ptr<ClientCursorPin> pin; // either this OR the exec will be non-null unique_ptr<PlanExecutor> exec; { // This will throw if the sharding version for this connection is out of date. The // lock must be held continuously from now until we have we created both the output // ClientCursor and the input executor. This ensures that both are using the same // sharding version that we synchronize on here. This is also why we always need to // create a ClientCursor even when we aren't outputting to a cursor. See the comment // on ShardFilterStage for more details. AutoGetCollectionForRead ctx(txn, nss.ns()); Collection* collection = ctx.getCollection(); // This does mongod-specific stuff like creating the input PlanExecutor and adding // it to the front of the pipeline if needed. std::shared_ptr<PlanExecutor> input = PipelineD::prepareCursorSource(txn, collection, nss, pPipeline, pCtx); pPipeline->stitch(); if (collection && input) { // Record the indexes used by the input executor. Retrieval of summary stats for a // PlanExecutor is normally done post execution. DocumentSourceCursor however will // destroy the input PlanExecutor once the result set has been exhausted. For // that reason we need to collect the indexes used prior to plan execution. PlanSummaryStats stats; Explain::getSummaryStats(*input, &stats); collection->infoCache()->notifyOfQuery(txn, stats.indexesUsed); // TODO SERVER-23265: Confirm whether this is the correct place to gather all // metrics. There is no harm adding here for the time being. CurOp::get(txn)->debug().setPlanSummaryMetrics(stats); } // Create the PlanExecutor which returns results from the pipeline. The WorkingSet // ('ws') and the PipelineProxyStage ('proxy') will be owned by the created // PlanExecutor. auto ws = make_unique<WorkingSet>(); auto proxy = make_unique<PipelineProxyStage>(txn, pPipeline, input, ws.get()); auto statusWithPlanExecutor = (NULL == collection) ? PlanExecutor::make( txn, std::move(ws), std::move(proxy), nss.ns(), PlanExecutor::YIELD_MANUAL) : PlanExecutor::make( txn, std::move(ws), std::move(proxy), collection, PlanExecutor::YIELD_MANUAL); invariant(statusWithPlanExecutor.isOK()); exec = std::move(statusWithPlanExecutor.getValue()); if (!collection && input) { // If we don't have a collection, we won't be able to register any executors, so // make sure that the input PlanExecutor (likely wrapping an EOFStage) doesn't // need to be registered. invariant(!input->collection()); } if (collection) { const bool isAggCursor = true; // enable special locking behavior ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), 0, cmdObj.getOwned(), isAggCursor); pin.reset(new ClientCursorPin(collection->getCursorManager(), cursor->cursorid())); // Don't add any code between here and the start of the try block. } // At this point, it is safe to release the collection lock. // - In the case where we have a collection: we will need to reacquire the // collection lock later when cleaning up our ClientCursorPin. // - In the case where we don't have a collection: our PlanExecutor won't be // registered, so it will be safe to clean it up outside the lock. invariant(NULL == exec.get() || NULL == exec->collection()); } try { // Unless set to true, the ClientCursor created above will be deleted on block exit. bool keepCursor = false; const bool isCursorCommand = !cmdObj["cursor"].eoo(); // If both explain and cursor are specified, explain wins. if (pPipeline->isExplain()) { result << "stages" << Value(pPipeline->writeExplainOps()); } else if (isCursorCommand) { keepCursor = handleCursorCommand(txn, nss.ns(), pin.get(), pin ? pin->c()->getExecutor() : exec.get(), cmdObj, result); } else { pPipeline->run(result); } // Clean up our ClientCursorPin, if needed. We must reacquire the collection lock // in order to do so. if (pin) { // We acquire locks here with DBLock and CollectionLock instead of using // AutoGetCollectionForRead. AutoGetCollectionForRead will throw if the // sharding version is out of date, and we don't care if the sharding version // has changed. Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS); if (keepCursor) { pin->release(); } else { pin->deleteUnderlying(); } } } catch (...) { // On our way out of scope, we clean up our ClientCursorPin if needed. if (pin) { Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS); pin->deleteUnderlying(); } throw; } // Any code that needs the cursor pinned must be inside the try block, above. return true; }
/** * Runs a query using the following steps: * --Parsing. * --Acquire locks. * --Plan query, obtaining an executor that can run it. * --Generate the first batch. * --Save state for getMore, transferring ownership of the executor to a ClientCursor. * --Generate response to send to the client. */ bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { const NamespaceString nss(parseNs(dbname, cmdObj)); if (!nss.isValid() || nss.isCommand() || nss.isSpecialCommand()) { return appendCommandStatus(result, {ErrorCodes::InvalidNamespace, str::stream() << "Invalid collection name: " << nss.ns()}); } // Although it is a command, a find command gets counted as a query. globalOpCounters.gotQuery(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus( result, Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()")); } // Parse the command BSON to a QueryRequest. const bool isExplain = false; auto qrStatus = QueryRequest::makeFromFindCommand(nss, cmdObj, isExplain); if (!qrStatus.isOK()) { return appendCommandStatus(result, qrStatus.getStatus()); } auto& qr = qrStatus.getValue(); // Validate term before acquiring locks, if provided. if (auto term = qr->getReplicationTerm()) { auto replCoord = repl::ReplicationCoordinator::get(txn); Status status = replCoord->updateTerm(txn, *term); // Note: updateTerm returns ok if term stayed the same. if (!status.isOK()) { return appendCommandStatus(result, status); } } // Fill out curop information. // // We pass negative values for 'ntoreturn' and 'ntoskip' to indicate that these values // should be omitted from the log line. Limit and skip information is already present in the // find command parameters, so these fields are redundant. const int ntoreturn = -1; const int ntoskip = -1; beginQueryOp(txn, nss, cmdObj, ntoreturn, ntoskip); // Finish the parsing step by using the QueryRequest to create a CanonicalQuery. ExtensionsCallbackReal extensionsCallback(txn, &nss); auto statusWithCQ = CanonicalQuery::canonicalize(txn, std::move(qr), extensionsCallback); if (!statusWithCQ.isOK()) { return appendCommandStatus(result, statusWithCQ.getStatus()); } std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); // Acquire locks. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); // Get the execution plan for the query. auto statusWithPlanExecutor = getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); { stdx::lock_guard<Client>(*txn->getClient()); CurOp::get(txn)->setPlanSummary_inlock(Explain::getPlanSummary(exec.get())); } if (!collection) { // No collection. Just fill out curop indicating that there were zero results and // there is no ClientCursor id, and then return. const long long numResults = 0; const CursorId cursorId = 0; endQueryOp(txn, collection, *exec, numResults, cursorId); appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result); return true; } const QueryRequest& originalQR = exec->getCanonicalQuery()->getQueryRequest(); // Stream query results, adding them to a BSONArray as we go. CursorResponseBuilder firstBatch(/*isInitialResponse*/ true, &result); BSONObj obj; PlanExecutor::ExecState state = PlanExecutor::ADVANCED; long long numResults = 0; while (!FindCommon::enoughForFirstBatch(originalQR, numResults) && PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If we can't fit this result inside the current batch, then we stash it for later. if (!FindCommon::haveSpaceForNext(obj, numResults, firstBatch.bytesUsed())) { exec->enqueue(obj); break; } // Add result to output buffer. firstBatch.append(obj); numResults++; } // Throw an assertion if query execution fails for any reason. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { firstBatch.abandon(); error() << "Plan executor error during find command: " << PlanExecutor::statestr(state) << ", stats: " << Explain::getWinningPlanStats(exec.get()); return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, str::stream() << "Executor error during find command: " << WorkingSetCommon::toStatusString(obj))); } // Before saving the cursor, ensure that whatever plan we established happened with the // expected collection version auto css = CollectionShardingState::get(txn, nss); css->checkShardVersionOrThrow(txn); // Set up the cursor for getMore. CursorId cursorId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // Register the execution plan inside a ClientCursor. Ownership of the PlanExecutor is // transferred to the ClientCursor. // // First unregister the PlanExecutor so it can be re-registered with ClientCursor. exec->deregisterExec(); // Create a ClientCursor containing this plan executor. We don't have to worry about // leaking it as it's inserted into a global map by its ctor. ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), originalQR.getOptions(), cmdObj.getOwned()); cursorId = cursor->cursorid(); invariant(!exec); PlanExecutor* cursorExec = cursor->getExecutor(); // State will be restored on getMore. cursorExec->saveState(); cursorExec->detachFromOperationContext(); cursor->setLeftoverMaxTimeMicros(txn->getRemainingMaxTimeMicros()); cursor->setPos(numResults); // Fill out curop based on the results. endQueryOp(txn, collection, *cursorExec, numResults, cursorId); } else { endQueryOp(txn, collection, *exec, numResults, cursorId); } // Generate the response object to send to the client. firstBatch.done(cursorId, nss.ns()); return true; }
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) { NamespaceString ns( dbname, cmdObj[name].String() ); AutoGetCollectionForRead ctx(txn, ns.ns()); Collection* collection = ctx.getCollection(); if ( !collection ) return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, str::stream() << "ns does not exist: " << ns.ns() ) ); size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() ); if ( numCursors == 0 || numCursors > 10000 ) return appendCommandStatus( result, Status( ErrorCodes::BadValue, str::stream() << "numCursors has to be between 1 and 10000" << " was: " << numCursors ) ); OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn)); if (iterators.size() < numCursors) { numCursors = iterators.size(); } OwnedPointerVector<PlanExecutor> execs; for ( size_t i = 0; i < numCursors; i++ ) { WorkingSet* ws = new WorkingSet(); MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection); PlanExecutor* rawExec; // Takes ownership of 'ws' and 'mis'. Status execStatus = PlanExecutor::make(txn, ws, mis, collection, PlanExecutor::YIELD_AUTO, &rawExec); invariant(execStatus.isOK()); auto_ptr<PlanExecutor> curExec(rawExec); // The PlanExecutor was registered on construction due to the YIELD_AUTO policy. // We have to deregister it, as it will be registered with ClientCursor. curExec->deregisterExec(); // Need to save state while yielding locks between now and getMore(). curExec->saveState(); execs.push_back(curExec.release()); } // transfer iterators to executors using a round-robin distribution. // TODO consider using a common work queue once invalidation issues go away. for (size_t i = 0; i < iterators.size(); i++) { PlanExecutor* theExec = execs[i % execs.size()]; MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage()); // This wasn't called above as they weren't assigned yet iterators[i]->saveState(); mis->addIterator(iterators.releaseAt(i)); } { BSONArrayBuilder bucketsBuilder; for (size_t i = 0; i < execs.size(); i++) { // transfer ownership of an executor to the ClientCursor (which manages its own // lifetime). ClientCursor* cc = new ClientCursor( collection->getCursorManager(), execs.releaseAt(i), ns.ns() ); BSONObjBuilder threadResult; appendCursorResponseObject( cc->cursorid(), ns.ns(), BSONArray(), &threadResult ); threadResult.appendBool( "ok", 1 ); bucketsBuilder.append( threadResult.obj() ); } result.appendArray( "cursors", bucketsBuilder.obj() ); } return true; }
void ClientCursor::LockedIterator::deleteAndAdvance() { ClientCursor *cc = current(); CursorId id = cc->cursorid(); delete cc; _i = clientCursorsById.upper_bound( id ); }
std::string runQuery(OperationContext* txn, QueryMessage& q, const NamespaceString& nss, CurOp& curop, Message &result) { // Validate the namespace. uassert(16256, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid()); invariant(!nss.isCommand()); // Set curop information. beginQueryOp(nss, q.query, q.ntoreturn, q.ntoskip, &curop); // Parse the qm into a CanonicalQuery. std::auto_ptr<CanonicalQuery> cq; { CanonicalQuery* cqRaw; Status canonStatus = CanonicalQuery::canonicalize(q, &cqRaw, WhereCallbackReal(txn, nss.db())); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } cq.reset(cqRaw); } invariant(cq.get()); LOG(5) << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; // We have a parsed query. Time to get the execution plan for it. std::unique_ptr<PlanExecutor> exec; { PlanExecutor* rawExec; Status execStatus = getExecutorFind(txn, collection, nss, cq.release(), PlanExecutor::YIELD_AUTO, &rawExec); uassertStatusOK(execStatus); exec.reset(rawExec); } const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed(); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; // Set query result fields. QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(nss.ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.isSlaveOk() || pq.hasReadPref(); Status serveReadsStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, nss, slaveOK); uassertStatusOK(serveReadsStatus); // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; BSONObj obj; PlanExecutor::ExecState state; // uint64_t numMisplacedDocs = 0; // Get summary info about which plan the executor is using. curop.debug().planSummary = Explain::getPlanSummary(exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.isOplogReplay()) { BSONElement e = obj["ts"]; if (Date == e.type() || bsonTimestamp == e.type()) { slaveReadTill = e.timestamp(); } } if (enoughForFirstBatch(pq, numResults, bb.len())) { LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. if (PlanExecutor::FAILURE == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the // chunk belonged on this node are gone. Separating chunk lifetime management from // ClientCursor should allow this check to go away. if (!shardingState.getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(nss.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(nss.ns())); } // Fill out curop based on query results. If we have a cursorid, we will fill out curop with // this cursorid later. long long ccId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // We won't use the executor until it's getMore'd. exec->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), pq.getOptions(), pq.getFilter()); ccId = cc->cursorid(); if (txn->getClient()->isInDirectClient()) { cc->setUnownedRecoveryUnit(txn->recoveryUnit()); } else if (state == PlanExecutor::IS_EOF && pq.isTailable()) { // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their // next getMore. } else { // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. txn->recoveryUnit()->abandonSnapshot(); cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine(); invariant(txn->setRecoveryUnit(storageEngine->newRecoveryUnit(), OperationContext::kNotInUnitOfWork) == OperationContext::kNotInUnitOfWork); } LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // TODO document if (pq.isOplogReplay() && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.isExhaust()) { curop.debug().exhaust = true; } cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); endQueryOp(cc->getExecutor(), dbProfilingLevel, numResults, ccId, &curop); } else { LOG(5) << "Not caching executor but returning " << numResults << " results.\n"; endQueryOp(exec.get(), dbProfilingLevel, numResults, ccId, &curop); } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. return curop.debug().exhaust ? nss.ns() : ""; }
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) { NamespaceString ns( dbname, cmdObj[name].String() ); AutoGetCollectionForRead ctx(txn, ns.ns()); Collection* collection = ctx.getCollection(); if ( !collection ) return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, str::stream() << "ns does not exist: " << ns.ns() ) ); size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() ); if ( numCursors == 0 || numCursors > 10000 ) return appendCommandStatus( result, Status( ErrorCodes::BadValue, str::stream() << "numCursors has to be between 1 and 10000" << " was: " << numCursors ) ); OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn)); if (iterators.size() < numCursors) { numCursors = iterators.size(); } OwnedPointerVector<PlanExecutor> execs; for ( size_t i = 0; i < numCursors; i++ ) { WorkingSet* ws = new WorkingSet(); MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection); // Takes ownership of 'ws' and 'mis'. auto_ptr<PlanExecutor> curExec(new PlanExecutor(txn, ws, mis, collection)); // Each of the plan executors should yield automatically. We pass "false" to // indicate that 'curExec' should not register itself, as it will get registered // by ClientCursor instead. curExec->setYieldPolicy(PlanExecutor::YIELD_AUTO, false); // Need to save state while yielding locks between now and newGetMore. curExec->saveState(); execs.push_back(curExec.release()); } // transfer iterators to executors using a round-robin distribution. // TODO consider using a common work queue once invalidation issues go away. for (size_t i = 0; i < iterators.size(); i++) { PlanExecutor* theExec = execs[i % execs.size()]; MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage()); mis->addIterator(iterators.releaseAt(i)); } { BSONArrayBuilder bucketsBuilder; for (size_t i = 0; i < execs.size(); i++) { // transfer ownership of an executor to the ClientCursor (which manages its own // lifetime). ClientCursor* cc = new ClientCursor( collection, execs.releaseAt(i) ); // we are mimicking the aggregation cursor output here // that is why there are ns, ok and empty firstBatch BSONObjBuilder threadResult; { BSONObjBuilder cursor; cursor.appendArray( "firstBatch", BSONObj() ); cursor.append( "ns", ns ); cursor.append( "id", cc->cursorid() ); threadResult.append( "cursor", cursor.obj() ); } threadResult.appendBool( "ok", 1 ); bucketsBuilder.append( threadResult.obj() ); } result.appendArray( "cursors", bucketsBuilder.obj() ); } return true; }
void ClientCursor::invalidate(const StringData& ns) { Lock::assertWriteLocked(ns); size_t dot = ns.find( '.' ); verify( dot != string::npos ); // first (and only) dot is the last char bool isDB = dot == ns.size() - 1; Database *db = cc().database(); verify(db); verify(ns.startsWith(db->name())); recursive_scoped_lock cclock(ccmutex); // Look at all active non-cached Runners. These are the runners that are in auto-yield mode // that are not attached to the the client cursor. For example, all internal runners don't // need to be cached -- there will be no getMore. for (set<Runner*>::iterator it = nonCachedRunners.begin(); it != nonCachedRunners.end(); ++it) { Runner* runner = *it; const string& runnerNS = runner->ns(); if ( ( isDB && StringData(runnerNS).startsWith(ns) ) || ns == runnerNS ) { runner->kill(); } } // Look at all cached ClientCursor(s). The CC may have a Runner, a Cursor, or nothing (see // sharding_block.h). CCById::const_iterator it = clientCursorsById.begin(); while (it != clientCursorsById.end()) { ClientCursor* cc = it->second; // We're only interested in cursors over one db. if (cc->_db != db) { ++it; continue; } // Note that a valid ClientCursor state is "no cursor no runner." This is because // the set of active cursor IDs in ClientCursor is used as representation of query // state. See sharding_block.h. TODO(greg,hk): Move this out. if (NULL == cc->c() && NULL == cc->_runner.get()) { ++it; continue; } bool shouldDelete = false; // We will only delete CCs with runners that are not actively in use. The runners that // are actively in use are instead kill()-ed. if (NULL != cc->_runner.get()) { verify(NULL == cc->c()); if (isDB || cc->_runner->ns() == ns) { // If there is a pinValue >= 100, somebody is actively using the CC and we do // not delete it. Instead we notify the holder that we killed it. The holder // will then delete the CC. if (cc->_pinValue >= 100) { cc->_runner->kill(); } else { // pinvalue is <100, so there is nobody actively holding the CC. We can // safely delete it as nobody is holding the CC. shouldDelete = true; } } } // Begin cursor-only DEPRECATED else if (cc->c()->shouldDestroyOnNSDeletion()) { verify(NULL == cc->_runner.get()); if (isDB) { // already checked that db matched above dassert( StringData(cc->_ns).startsWith( ns ) ); shouldDelete = true; } else { if ( ns == cc->_ns ) { shouldDelete = true; } } } // End cursor-only DEPRECATED if (shouldDelete) { ClientCursor* toDelete = it->second; CursorId id = toDelete->cursorid(); delete toDelete; // We're not following the usual paradigm of saving it, ++it, and deleting the saved // 'it' because deleting 'it' might invalidate the next thing in clientCursorsById. // TODO: Why? it = clientCursorsById.upper_bound(id); } else { ++it; } } }
/** * Runs a query using the following steps: * --Parsing. * --Acquire locks. * --Plan query, obtaining an executor that can run it. * --Generate the first batch. * --Save state for getMore, transferring ownership of the executor to a ClientCursor. * --Generate response to send to the client. */ bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { const std::string fullns = parseNs(dbname, cmdObj); const NamespaceString nss(fullns); if (!nss.isValid() || nss.isCommand() || nss.isSpecialCommand()) { return appendCommandStatus(result, {ErrorCodes::InvalidNamespace, str::stream() << "Invalid collection name: " << nss.ns()}); } // Although it is a command, a find command gets counted as a query. globalOpCounters.gotQuery(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus( result, Status(ErrorCodes::IllegalOperation, "Cannot run find command from eval()")); } // Parse the command BSON to a LiteParsedQuery. const bool isExplain = false; auto lpqStatus = LiteParsedQuery::makeFromFindCommand(nss, cmdObj, isExplain); if (!lpqStatus.isOK()) { return appendCommandStatus(result, lpqStatus.getStatus()); } auto& lpq = lpqStatus.getValue(); // Validate term before acquiring locks, if provided. if (auto term = lpq->getReplicationTerm()) { auto replCoord = repl::ReplicationCoordinator::get(txn); Status status = replCoord->updateTerm(txn, *term); // Note: updateTerm returns ok if term stayed the same. if (!status.isOK()) { return appendCommandStatus(result, status); } } // Fill out curop information. // // We pass negative values for 'ntoreturn' and 'ntoskip' to indicate that these values // should be omitted from the log line. Limit and skip information is already present in the // find command parameters, so these fields are redundant. const int ntoreturn = -1; const int ntoskip = -1; beginQueryOp(txn, nss, cmdObj, ntoreturn, ntoskip); // Finish the parsing step by using the LiteParsedQuery to create a CanonicalQuery. ExtensionsCallbackReal extensionsCallback(txn, &nss); auto statusWithCQ = CanonicalQuery::canonicalize(lpq.release(), extensionsCallback); if (!statusWithCQ.isOK()) { return appendCommandStatus(result, statusWithCQ.getStatus()); } std::unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); ShardingState* const shardingState = ShardingState::get(txn); if (OperationShardVersion::get(txn).hasShardVersion() && shardingState->enabled()) { ChunkVersion receivedVersion = OperationShardVersion::get(txn).getShardVersion(nss); ChunkVersion latestVersion; // Wait for migration completion to get the correct chunk version. const int maxTimeoutSec = 30; int timeoutSec = cq->getParsed().getMaxTimeMS() / 1000; if (!timeoutSec || timeoutSec > maxTimeoutSec) { timeoutSec = maxTimeoutSec; } if (!shardingState->waitTillNotInCriticalSection(timeoutSec)) { uasserted(ErrorCodes::LockTimeout, "Timeout while waiting for migration commit"); } // If the received version is newer than the version cached in 'shardingState', then we // have to refresh 'shardingState' from the config servers. We do this before acquiring // locks so that we don't hold locks while waiting on the network. uassertStatusOK(shardingState->refreshMetadataIfNeeded( txn, nss.ns(), receivedVersion, &latestVersion)); } // Acquire locks. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; // It is possible that the sharding version will change during yield while we are // retrieving a plan executor. If this happens we will throw an error and mongos will // retry. const ChunkVersion shardingVersionAtStart = shardingState->getVersion(nss.ns()); // Get the execution plan for the query. auto statusWithPlanExecutor = getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); if (!collection) { // No collection. Just fill out curop indicating that there were zero results and // there is no ClientCursor id, and then return. const long long numResults = 0; const CursorId cursorId = 0; endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, cursorId); appendCursorResponseObject(cursorId, nss.ns(), BSONArray(), &result); return true; } const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed(); // Stream query results, adding them to a BSONArray as we go. BSONArrayBuilder firstBatch; BSONObj obj; PlanExecutor::ExecState state = PlanExecutor::ADVANCED; long long numResults = 0; while (!FindCommon::enoughForFirstBatch(pq, numResults, firstBatch.len()) && PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If adding this object will cause us to exceed the BSON size limit, then we stash // it for later. if (firstBatch.len() + obj.objsize() > BSONObjMaxUserSize && numResults > 0) { exec->enqueue(obj); break; } // Add result to output buffer. firstBatch.append(obj); numResults++; } // Throw an assertion if query execution fails for any reason. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { const std::unique_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error during find command: " << PlanExecutor::statestr(state) << ", stats: " << Explain::statsToBSON(*stats); return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, str::stream() << "Executor error during find command: " << WorkingSetCommon::toStatusString(obj))); } // TODO: Currently, chunk ranges are kept around until all ClientCursors created while the // chunk belonged on this node are gone. Separating chunk lifetime management from // ClientCursor should allow this check to go away. if (!shardingState->getVersion(nss.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // Version changed while retrieving a PlanExecutor. Terminate the operation, // signaling that mongos should retry. throw SendStaleConfigException(nss.ns(), "version changed during find command", shardingVersionAtStart, shardingState->getVersion(nss.ns())); } // Set up the cursor for getMore. CursorId cursorId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // Register the execution plan inside a ClientCursor. Ownership of the PlanExecutor is // transferred to the ClientCursor. // // First unregister the PlanExecutor so it can be re-registered with ClientCursor. exec->deregisterExec(); // Create a ClientCursor containing this plan executor. We don't have to worry about // leaking it as it's inserted into a global map by its ctor. ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), pq.getOptions(), pq.getFilter()); cursorId = cursor->cursorid(); invariant(!exec); PlanExecutor* cursorExec = cursor->getExecutor(); // State will be restored on getMore. cursorExec->saveState(); cursorExec->detachFromOperationContext(); cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros()); cursor->setPos(numResults); // Fill out curop based on the results. endQueryOp(txn, collection, *cursorExec, dbProfilingLevel, numResults, cursorId); } else { endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, cursorId); } // Generate the response object to send to the client. appendCursorResponseObject(cursorId, nss.ns(), firstBatch.arr(), &result); return true; }
/** * This is called by db/ops/query.cpp. This is the entry point for answering a query. */ string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { log() << "Running query on new system: " << q.query.toString() << endl; // This is a read lock. Client::ReadContext ctx(q.ns, dbpath); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner; CanonicalQuery* cq; Status status = getRunner(q, &rawRunner, &cq); if (!status.isOK()) { uasserted(17007, "Couldn't process query " + q.query.toString() + " why: " + status.reason()); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns); // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // TODO: Document why we do this. // TODO: do this when we can pass in our own parsed query //replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here, so we must register it with the active // runners list in ClientCursor. ClientCursor::registerRunner(runner.get()); runner->setYieldPolicy(Runner::YIELD_AUTO); BSONObj obj; Runner::RunnerState state; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // If we're sharded make sure that we don't return any data that hasn't been migrated // off of our shared yet. if (collMetadata) { // This information can change if we yield and as such we must make sure to re-fetch // it if we yield. KeyPattern kp(collMetadata->getKeyPattern()); // This performs excessive BSONObj creation but that's OK for now. if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; } } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; const bool isExplain = pq.isExplain(); if (isExplain && enoughForExplain(pq, numResults)) { break; } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { saveClientCursor = true; } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. ClientCursor::deregisterRunner(runner.get()); // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } // TODO: Stage creation can set tailable depending on what's in the parsed query. We have // the full parsed query available during planning...set it there. // // TODO: If we're tailable we want to save the client cursor. Make sure we do this later. //if (pq.hasOption(QueryOption_CursorTailable) && pq.getNumToReturn() != 1) { ... } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); log() << "caching runner with cursorid " << ccId << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; // TODO: nscanned is bogus. // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL ); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result) { NamespaceString ns(dbname, cmdObj[name].String()); AutoGetCollectionForRead ctx(txn, ns.ns()); Collection* collection = ctx.getCollection(); if (!collection) return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, str::stream() << "ns does not exist: " << ns.ns())); size_t numCursors = static_cast<size_t>(cmdObj["numCursors"].numberInt()); if (numCursors == 0 || numCursors > 10000) return appendCommandStatus(result, Status(ErrorCodes::BadValue, str::stream() << "numCursors has to be between 1 and 10000" << " was: " << numCursors)); auto iterators = collection->getManyCursors(txn); if (iterators.size() < numCursors) { numCursors = iterators.size(); } std::vector<std::unique_ptr<PlanExecutor>> execs; for (size_t i = 0; i < numCursors; i++) { unique_ptr<WorkingSet> ws = make_unique<WorkingSet>(); unique_ptr<MultiIteratorStage> mis = make_unique<MultiIteratorStage>(txn, ws.get(), collection); // Takes ownership of 'ws' and 'mis'. auto statusWithPlanExecutor = PlanExecutor::make( txn, std::move(ws), std::move(mis), collection, PlanExecutor::YIELD_AUTO); invariant(statusWithPlanExecutor.isOK()); execs.push_back(std::move(statusWithPlanExecutor.getValue())); } // transfer iterators to executors using a round-robin distribution. // TODO consider using a common work queue once invalidation issues go away. for (size_t i = 0; i < iterators.size(); i++) { auto& planExec = execs[i % execs.size()]; MultiIteratorStage* mis = checked_cast<MultiIteratorStage*>(planExec->getRootStage()); mis->addIterator(std::move(iterators[i])); } { BSONArrayBuilder bucketsBuilder; for (auto&& exec : execs) { // The PlanExecutor was registered on construction due to the YIELD_AUTO policy. // We have to deregister it, as it will be registered with ClientCursor. exec->deregisterExec(); // Need to save state while yielding locks between now and getMore(). exec->saveState(); exec->detachFromOperationContext(); // transfer ownership of an executor to the ClientCursor (which manages its own // lifetime). ClientCursor* cc = new ClientCursor(collection->getCursorManager(), exec.release(), ns.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot()); BSONObjBuilder threadResult; appendCursorResponseObject(cc->cursorid(), ns.ns(), BSONArray(), &threadResult); threadResult.appendBool("ok", 1); bucketsBuilder.append(threadResult.obj()); } result.appendArray("cursors", bucketsBuilder.obj()); } return true; }
std::string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult)); BSONObjBuilder cmdResBuf; if (!runCommands(ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; result.setData(qr, true); return ""; } // This is a read lock. We require this because if we're parsing a $where, the // where-specific parsing code assumes we have a lock and creates execution machinery that // requires it. Client::ReadContext ctx(q.ns); Collection* collection = ctx.ctx().db()->getCollection( ns ); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize(q, &cq); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } verify(cq); QLOG() << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that's is so (a runner) which doesn't return results, the EOFRunner. // // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (collection == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(collection, cq, &rawRunner); } else { // Takes ownership of cq. size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { // NOTE: Do not access cq as getRunner has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. auto_ptr<ScopedRunnerRegistration> safety(new ScopedRunnerRegistration(runner.get())); runner->setYieldPolicy(Runner::YIELD_AUTO); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); // Have we retrieved info about which plan the runner will // use to execute the query yet? bool gotPlanInfo = false; PlanInfo* rawInfo; boost::scoped_ptr<PlanInfo> planInfo; while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // In the case of the multi plan runner, we may not be able to // successfully retrieve plan info until after the query starts // to run. This is because the multi plan runner doesn't know what // plan it will end up using until it runs candidates and selects // the best. // // TODO: Do we ever want to output what the MPR is comparing? if (!gotPlanInfo) { Status infoStatus = runner->getInfo(NULL, &rawInfo); if (infoStatus.isOK()) { gotPlanInfo = true; planInfo.reset(rawInfo); // planSummary is really a ThreadSafeString which copies the data from // the provided pointer. curop.debug().planSummary = planInfo->planSummary.c_str(); } } // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // Try to get information about the plan which the runner // will use to execute the query, it we don't have it already. if (!gotPlanInfo) { Status infoStatus = runner->getInfo(NULL, &rawInfo); if (infoStatus.isOK()) { gotPlanInfo = true; planInfo.reset(rawInfo); // planSummary is really a ThreadSafeString which copies the data from // the provided pointer. curop.debug().planSummary = planInfo->planSummary.c_str(); } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases. if (Runner::RUNNER_ERROR == state) { TypeExplain* bareExplain; Status res = runner->getInfo(&bareExplain, NULL); if (res.isOK()) { boost::scoped_ptr<TypeExplain> errorExplain(bareExplain); error() << "Runner error, stats:\n" << errorExplain->stats.jsonString(Strict, true); } uasserted(17144, "Runner error: " + WorkingSetCommon::toStatusString(obj)); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Used to fill in explain and to determine if the query is slow enough to be logged. int elapsedMillis = curop.elapsedMillis(); // Get explain information if: // 1) it is needed by an explain query; // 2) profiling is enabled; or // 3) profiling is disabled but we still need explain details to log a "slow" query. // Producing explain information is expensive and should be done only if we are certain // the information will be used. boost::scoped_ptr<TypeExplain> explain(NULL); if (isExplain || ctx.ctx().db()->getProfilingLevel() > 0 || elapsedMillis > serverGlobalParams.slowMS) { // Ask the runner to produce explain information. TypeExplain* bareExplain; Status res = runner->getInfo(&bareExplain, NULL); if (res.isOK()) { explain.reset(bareExplain); } else if (isExplain) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } } // Fill in the missing run-time fields in explain, starting with propeties of // the process running the query. if (isExplain && NULL != explain.get()) { std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(elapsedMillis); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection, runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "Not caching runner but returning " << numResults << " results.\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; // Set debug information for consumption by the profiler. curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; if (NULL != explain.get()) { if (explain->isScanAndOrderSet()) { curop.debug().scanAndOrder = explain->getScanAndOrder(); } else { curop.debug().scanAndOrder = false; } if (explain->isNScannedSet()) { curop.debug().nscanned = explain->getNScanned(); } if (explain->isNScannedObjectsSet()) { curop.debug().nscannedObjects = explain->getNScannedObjects(); } if (explain->isIDHackSet()) { curop.debug().idhack = explain->getIDHack(); } if (!explain->stats.isEmpty()) { // execStats is a CachedBSONObj because it lives in the race-prone // curop. curop.debug().execStats.set(explain->stats); // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj. if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) { BSONObjBuilder bob; bob.append("summary", curop.debug().planSummary.toString()); curop.debug().execStats.set(bob.done()); } } } // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
bool run(OperationContext* txn, const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result) { unique_ptr<MatchExpression> matcher; BSONElement filterElt = jsobj["filter"]; if (!filterElt.eoo()) { if (filterElt.type() != mongo::Object) { return appendCommandStatus( result, Status(ErrorCodes::BadValue, "\"filter\" must be an object")); } StatusWithMatchExpression statusWithMatcher = MatchExpressionParser::parse(filterElt.Obj()); if (!statusWithMatcher.isOK()) { return appendCommandStatus(result, statusWithMatcher.getStatus()); } matcher = std::move(statusWithMatcher.getValue()); } const long long defaultBatchSize = std::numeric_limits<long long>::max(); long long batchSize; Status parseCursorStatus = parseCommandCursorOptions(jsobj, defaultBatchSize, &batchSize); if (!parseCursorStatus.isOK()) { return appendCommandStatus(result, parseCursorStatus); } ScopedTransaction scopedXact(txn, MODE_IS); AutoGetDb autoDb(txn, dbname, MODE_S); const Database* d = autoDb.getDb(); const DatabaseCatalogEntry* dbEntry = NULL; list<string> names; if (d) { dbEntry = d->getDatabaseCatalogEntry(); dbEntry->getCollectionNamespaces(&names); names.sort(); } auto ws = make_unique<WorkingSet>(); auto root = make_unique<QueuedDataStage>(txn, ws.get()); for (std::list<std::string>::const_iterator i = names.begin(); i != names.end(); ++i) { const std::string& ns = *i; StringData collection = nsToCollectionSubstring(ns); if (collection == "system.namespaces") { continue; } BSONObjBuilder b; b.append("name", collection); CollectionOptions options = dbEntry->getCollectionCatalogEntry(ns)->getCollectionOptions(txn); b.append("options", options.toBSON()); BSONObj maybe = b.obj(); if (matcher && !matcher->matchesBSON(maybe)) { continue; } WorkingSetID id = ws->allocate(); WorkingSetMember* member = ws->get(id); member->keyData.clear(); member->loc = RecordId(); member->obj = Snapshotted<BSONObj>(SnapshotId(), maybe); member->transitionToOwnedObj(); root->pushBack(id); } std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name; dassert(NamespaceString(cursorNamespace).isValid()); dassert(NamespaceString(cursorNamespace).isListCollectionsCursorNS()); auto statusWithPlanExecutor = PlanExecutor::make( txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); BSONArrayBuilder firstBatch; const int byteLimit = FindCommon::kMaxBytesToReturnToClientAtOnce; for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit; objCount++) { BSONObj next; PlanExecutor::ExecState state = exec->getNext(&next, NULL); if (state == PlanExecutor::IS_EOF) { break; } invariant(state == PlanExecutor::ADVANCED); firstBatch.append(next); } CursorId cursorId = 0LL; if (!exec->isEOF()) { exec->saveState(); exec->detachFromOperationContext(); ClientCursor* cursor = new ClientCursor(CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace, txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot()); cursorId = cursor->cursorid(); } appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result); return true; }
bool run(OperationContext* txn, const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result) { unique_ptr<MatchExpression> matcher; BSONElement filterElt = jsobj["filter"]; if (!filterElt.eoo()) { if (filterElt.type() != mongo::Object) { return appendCommandStatus( result, Status(ErrorCodes::BadValue, "\"filter\" must be an object")); } StatusWithMatchExpression statusWithMatcher = MatchExpressionParser::parse( filterElt.Obj(), ExtensionsCallbackDisallowExtensions()); if (!statusWithMatcher.isOK()) { return appendCommandStatus(result, statusWithMatcher.getStatus()); } matcher = std::move(statusWithMatcher.getValue()); } const long long defaultBatchSize = std::numeric_limits<long long>::max(); long long batchSize; Status parseCursorStatus = parseCommandCursorOptions(jsobj, defaultBatchSize, &batchSize); if (!parseCursorStatus.isOK()) { return appendCommandStatus(result, parseCursorStatus); } ScopedTransaction scopedXact(txn, MODE_IS); AutoGetDb autoDb(txn, dbname, MODE_S); const Database* db = autoDb.getDb(); auto ws = make_unique<WorkingSet>(); auto root = make_unique<QueuedDataStage>(txn, ws.get()); if (db) { if (auto collNames = _getExactNameMatches(matcher.get())) { for (auto&& collName : *collNames) { auto nss = NamespaceString(db->name(), collName); _addWorkingSetMember( txn, db->getCollection(nss), matcher.get(), ws.get(), root.get()); } } else { for (auto&& collection : *db) { _addWorkingSetMember(txn, collection, matcher.get(), ws.get(), root.get()); } } } std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name; dassert(NamespaceString(cursorNamespace).isValid()); dassert(NamespaceString(cursorNamespace).isListCollectionsCursorNS()); auto statusWithPlanExecutor = PlanExecutor::make( txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); BSONArrayBuilder firstBatch; for (long long objCount = 0; objCount < batchSize; objCount++) { BSONObj next; PlanExecutor::ExecState state = exec->getNext(&next, NULL); if (state == PlanExecutor::IS_EOF) { break; } invariant(state == PlanExecutor::ADVANCED); // If we can't fit this result inside the current batch, then we stash it for later. if (!FindCommon::haveSpaceForNext(next, objCount, firstBatch.len())) { exec->enqueue(next); break; } firstBatch.append(next); } CursorId cursorId = 0LL; if (!exec->isEOF()) { exec->saveState(); exec->detachFromOperationContext(); ClientCursor* cursor = new ClientCursor(CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace, txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot()); cursorId = cursor->cursorid(); } appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result); return true; }
/** * This is called by db/ops/query.cpp. This is the entry point for answering a query. */ std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) { QLOG() << "Running query on new system: " << cq->toString(); // This is a read lock. Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // Need to call cq->toString() now, since upon error getRunner doesn't guarantee // cq is in a consistent state. string cqStr = cq->toString(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that's is so (a runner) which doesn't return results, the EOFRunner. // // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(cq, &rawRunner); } else { // Takes ownership of cq. size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { uasserted(17007, "Couldn't get runner for query because: " + status.reason() + " query is " + cqStr); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. ClientCursor::registerRunner(runner.get()); runner->setYieldPolicy(Runner::YIELD_AUTO); auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety( new DeregisterEvenIfUnderlyingCodeThrows(runner.get())); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases: // * in-memory sort using too much RAM. if (Runner::RUNNER_ERROR == state) { uasserted(17144, "Runner error, memory limit for sort probably exceeded"); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Append explain information to query results by asking the runner to produce them. if (isExplain) { TypeExplain* bareExplain; Status res = runner->getExplainPlan(&bareExplain); if (!res.isOK()) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } else { boost::scoped_ptr<TypeExplain> explain(bareExplain); // Fill in the missing run-time fields in explain, starting with propeties of // the process running the query. std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(curop.elapsedMillis()); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "not caching runner but returning " << numResults << " results\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
std::string runQuery(OperationContext* txn, QueryMessage& q, const NamespaceString& nss, Message& result) { CurOp& curop = *CurOp::get(txn); uassert(ErrorCodes::InvalidNamespace, str::stream() << "Invalid ns [" << nss.ns() << "]", nss.isValid()); invariant(!nss.isCommand()); // Set curop information. beginQueryOp(txn, nss, q.query, q.ntoreturn, q.ntoskip); // Parse the qm into a CanonicalQuery. auto statusWithCQ = CanonicalQuery::canonicalize(q, ExtensionsCallbackReal(txn, &nss)); if (!statusWithCQ.isOK()) { uasserted( 17287, str::stream() << "Can't canonicalize query: " << statusWithCQ.getStatus().toString()); } unique_ptr<CanonicalQuery> cq = std::move(statusWithCQ.getValue()); invariant(cq.get()); LOG(5) << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. AutoGetCollectionForRead ctx(txn, nss); Collection* collection = ctx.getCollection(); const int dbProfilingLevel = ctx.getDb() ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; // We have a parsed query. Time to get the execution plan for it. std::unique_ptr<PlanExecutor> exec = uassertStatusOK( getExecutorFind(txn, collection, nss, std::move(cq), PlanExecutor::YIELD_AUTO)); const LiteParsedQuery& pq = exec->getCanonicalQuery()->getParsed(); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; // Set query result fields. QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.isSlaveOk() || pq.hasReadPref(); Status serveReadsStatus = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor(txn, nss, slaveOK); uassertStatusOK(serveReadsStatus); // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(FindCommon::kInitReplyBufferSize); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; BSONObj obj; PlanExecutor::ExecState state; // Get summary info about which plan the executor is using. { stdx::lock_guard<Client> lk(*txn->getClient()); curop.setPlanSummary_inlock(Explain::getPlanSummary(exec.get())); } while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // If we can't fit this result inside the current batch, then we stash it for later. if (!FindCommon::haveSpaceForNext(obj, numResults, bb.len())) { exec->enqueue(obj); break; } // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.isOplogReplay()) { BSONElement e = obj["ts"]; if (Date == e.type() || bsonTimestamp == e.type()) { slaveReadTill = e.timestamp(); } } if (FindCommon::enoughForFirstBatch(pq, numResults)) { LOG(5) << "Enough for first batch, wantMore=" << pq.wantMore() << " ntoreturn=" << pq.getNToReturn().value_or(0) << " numResults=" << numResults << endl; break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) { error() << "Plan executor error during find: " << PlanExecutor::statestr(state) << ", stats: " << Explain::getWinningPlanStats(exec.get()); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // Before saving the cursor, ensure that whatever plan we established happened with the expected // collection version auto css = CollectionShardingState::get(txn, nss); css->checkShardVersionOrThrow(txn); // Fill out curop based on query results. If we have a cursorid, we will fill out curop with // this cursorid later. long long ccId = 0; if (shouldSaveCursor(txn, collection, state, exec.get())) { // We won't use the executor until it's getMore'd. exec->saveState(); exec->detachFromOperationContext(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), pq.getOptions(), pq.getFilter()); ccId = cc->cursorid(); LOG(5) << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // TODO document if (pq.isOplogReplay() && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.isExhaust()) { curop.debug().exhaust = true; } cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); endQueryOp(txn, collection, *cc->getExecutor(), dbProfilingLevel, numResults, ccId); } else { LOG(5) << "Not caching executor but returning " << numResults << " results.\n"; endQueryOp(txn, collection, *exec, dbProfilingLevel, numResults, ccId); } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. return curop.debug().exhaust ? nss.ns() : ""; }
std::string newRunQuery(OperationContext* txn, Message& m, QueryMessage& q, CurOp& curop, Message &result, bool fromDBDirectClient) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder cmdResBuf; if (!runCommands(txn, ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } const NamespaceString nss(q.ns); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize( q, &cq, WhereCallbackReal(txn, StringData(nss.db()))); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } QLOG() << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. PlanExecutor* rawExec = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); AutoGetCollectionForRead ctx(txn, nss); const int dbProfilingLevel = (ctx.getDb() != NULL) ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; Collection* collection = ctx.getCollection(); // We'll now try to get the query executor that will execute this query for us. There // are a few cases in which we know upfront which executor we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we use an EOFStage. // // (b) if the query is a replication's initial sync one, we use a specifically designed // stage that skips extents faster (see details in exec/oplogstart.h). // // Otherwise we go through the selection of which executor is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (NULL != collection && pq.getOptions().oplogReplay) { // Takes ownership of 'cq'. status = getOplogStartHack(txn, collection, cq, &rawExec); } else { size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } // Takes ownership of 'cq'. status = getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, options); } if (!status.isOK()) { // NOTE: Do not access cq as getExecutor has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawExec); auto_ptr<PlanExecutor> exec(rawExec); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; // Set query result fields. QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.getOptions().slaveOk || pq.hasReadPref(); status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, NamespaceString(cq->ns()), slaveOK); uassertStatusOK(status); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the PlanExecutor in a ClientCursor for getMore calls later? bool saveClientCursor = false; BSONObj obj; PlanExecutor::ExecState state; // uint64_t numMisplacedDocs = 0; // Get summary info about which plan the executor is using. curop.debug().planSummary = Explain::getPlanSummary(exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.getOptions().oplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " executor EOF=" << exec->isEOF() << endl; saveClientCursor = !exec->isEOF(); } break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. if (PlanExecutor::EXEC_ERROR == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // Why save a dead executor? if (PlanExecutor::DEAD == state) { saveClientCursor = false; } else if (pq.getOptions().tailable) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. if (collection && collection->numRecords(txn) != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery; const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1); PlanSummaryStats summaryStats; Explain::getSummaryStats(exec.get(), &summaryStats); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; curop.debug().scanAndOrder = summaryStats.hasSortStage; curop.debug().nscanned = summaryStats.totalKeysExamined; curop.debug().nscannedObjects = summaryStats.totalDocsExamined; curop.debug().idhack = summaryStats.isIdhack; // Set debug information for consumption by the profiler. if (dbProfilingLevel > 0 || curop.elapsedMillis() > serverGlobalParams.slowMS || logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) { // Get BSON stats. scoped_ptr<PlanStageStats> execStats(exec->getStats()); BSONObjBuilder statsBob; Explain::statsToBSON(*execStats, &statsBob); curop.debug().execStats.set(statsBob.obj()); // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj. if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) { BSONObjBuilder bob; bob.append("summary", curop.debug().planSummary.toString()); curop.debug().execStats.set(bob.done()); } } long long ccId = 0; if (saveClientCursor) { // We won't use the executor until it's getMore'd. exec->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection, exec.get(), cq->getParsed().getOptions().toInt(), cq->getParsed().getFilter()); ccId = cc->cursorid(); if (fromDBDirectClient) { cc->setUnownedRecoveryUnit(txn->recoveryUnit()); } else if (state == PlanExecutor::IS_EOF && pq.getOptions().tailable) { // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their // next getMore. } else { // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); txn->setRecoveryUnit(storageEngine->newRecoveryUnit(txn)); } QLOG() << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of executor. Release to make sure it's not deleted. exec.release(); // TODO document if (pq.getOptions().oplogReplay && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.getOptions().exhaust) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "Not caching executor but returning " << numResults << " results.\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
virtual bool run(OperationContext* txn, const string &db, BSONObj &cmdObj, int options, string &errmsg, BSONObjBuilder &result, bool fromRepl) { NamespaceString nss(parseNs(db, cmdObj)); if (nss.coll().empty()) { errmsg = "missing collection name"; return false; } intrusive_ptr<ExpressionContext> pCtx = new ExpressionContext(txn, nss); pCtx->tempDir = storageGlobalParams.dbpath + "/_tmp"; /* try to parse the command; if this fails, then we didn't run */ intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx); if (!pPipeline.get()) return false; #if _DEBUG // This is outside of the if block to keep the object alive until the pipeline is finished. BSONObj parsed; if (!pPipeline->isExplain() && !pCtx->inShard) { // Make sure all operations round-trip through Pipeline::toBson() // correctly by reparsing every command on DEBUG builds. This is // important because sharded aggregations rely on this ability. // Skipping when inShard because this has already been through the // transformation (and this unsets pCtx->inShard). parsed = pPipeline->serialize().toBson(); pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx); verify(pPipeline); } #endif PlanExecutor* exec = NULL; scoped_ptr<ClientCursorPin> pin; // either this OR the execHolder will be non-null auto_ptr<PlanExecutor> execHolder; { // This will throw if the sharding version for this connection is out of date. The // lock must be held continuously from now until we have we created both the output // ClientCursor and the input executor. This ensures that both are using the same // sharding version that we synchronize on here. This is also why we always need to // create a ClientCursor even when we aren't outputting to a cursor. See the comment // on ShardFilterStage for more details. AutoGetCollectionForRead ctx(txn, nss.ns()); Collection* collection = ctx.getCollection(); // This does mongod-specific stuff like creating the input PlanExecutor and adding // it to the front of the pipeline if needed. boost::shared_ptr<PlanExecutor> input = PipelineD::prepareCursorSource(txn, collection, pPipeline, pCtx); pPipeline->stitch(); // Create the PlanExecutor which returns results from the pipeline. The WorkingSet // ('ws') and the PipelineProxyStage ('proxy') will be owned by the created // PlanExecutor. auto_ptr<WorkingSet> ws(new WorkingSet()); auto_ptr<PipelineProxyStage> proxy( new PipelineProxyStage(pPipeline, input, ws.get())); Status execStatus = Status::OK(); if (NULL == collection) { execStatus = PlanExecutor::make(txn, ws.release(), proxy.release(), nss.ns(), PlanExecutor::YIELD_MANUAL, &exec); } else { execStatus = PlanExecutor::make(txn, ws.release(), proxy.release(), collection, PlanExecutor::YIELD_MANUAL, &exec); } invariant(execStatus.isOK()); execHolder.reset(exec); if (!collection && input) { // If we don't have a collection, we won't be able to register any executors, so // make sure that the input PlanExecutor (likely wrapping an EOFStage) doesn't // need to be registered. invariant(!input->collection()); } if (collection) { // XXX const bool isAggCursor = true; // enable special locking behavior ClientCursor* cursor = new ClientCursor(collection, execHolder.release(), 0, BSONObj(), isAggCursor); pin.reset(new ClientCursorPin(collection, cursor->cursorid())); // Don't add any code between here and the start of the try block. } } try { // Unless set to true, the ClientCursor created above will be deleted on block exit. bool keepCursor = false; // If both explain and cursor are specified, explain wins. if (pPipeline->isExplain()) { result << "stages" << Value(pPipeline->writeExplainOps()); } else if (isCursorCommand(cmdObj)) { handleCursorCommand(txn, nss.ns(), pin.get(), exec, cmdObj, result); keepCursor = true; } else { pPipeline->run(result); } if (!keepCursor && pin) pin->deleteUnderlying(); } catch (...) { // Clean up cursor on way out of scope. if (pin) pin->deleteUnderlying(); throw; } // Any code that needs the cursor pinned must be inside the try block, above. return true; }
bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result) { BSONElement first = cmdObj.firstElement(); uassert(28528, str::stream() << "Argument to listIndexes must be of type String, not " << typeName(first.type()), first.type() == String); StringData collectionName = first.valueStringData(); uassert(28529, str::stream() << "Argument to listIndexes must be a collection name, " << "not the empty string", !collectionName.empty()); const NamespaceString ns(dbname, collectionName); const long long defaultBatchSize = std::numeric_limits<long long>::max(); long long batchSize; Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize); if (!parseCursorStatus.isOK()) { return appendCommandStatus(result, parseCursorStatus); } AutoGetCollectionForRead autoColl(txn, ns); if (!autoColl.getDb()) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no database")); } const Collection* collection = autoColl.getCollection(); if (!collection) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no collection")); } const CollectionCatalogEntry* cce = collection->getCatalogEntry(); invariant(cce); vector<string> indexNames; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexNames.clear(); cce->getAllIndexes(txn, &indexNames); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); std::unique_ptr<WorkingSet> ws(new WorkingSet()); std::unique_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get())); for (size_t i = 0; i < indexNames.size(); i++) { BSONObj indexSpec; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexSpec = cce->getIndexSpec(txn, indexNames[i]); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); WorkingSetID id = ws->allocate(); WorkingSetMember* member = ws->get(id); member->keyData.clear(); member->loc = RecordId(); member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned()); member->transitionToOwnedObj(); root->pushBack(id); } std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "." << ns.coll(); dassert(NamespaceString(cursorNamespace).isValid()); dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS()); dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes()); auto statusWithPlanExecutor = PlanExecutor::make( txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); BSONArrayBuilder firstBatch; const int byteLimit = MaxBytesToReturnToClientAtOnce; for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit; objCount++) { BSONObj next; PlanExecutor::ExecState state = exec->getNext(&next, NULL); if (state == PlanExecutor::IS_EOF) { break; } invariant(state == PlanExecutor::ADVANCED); firstBatch.append(next); } CursorId cursorId = 0LL; if (!exec->isEOF()) { exec->saveState(); ClientCursor* cursor = new ClientCursor( CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace); cursorId = cursor->cursorid(); } appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result); return true; }
static void handleCursorCommand(OperationContext* txn, const string& ns, ClientCursorPin* pin, PlanExecutor* exec, const BSONObj& cmdObj, BSONObjBuilder& result) { ClientCursor* cursor = pin ? pin->c() : NULL; if (pin) { invariant(cursor); invariant(cursor->getExecutor() == exec); invariant(cursor->isAggCursor()); } BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize"); const long long batchSize = batchSizeElem.isNumber() ? batchSizeElem.numberLong() : 101; // same as query // can't use result BSONObjBuilder directly since it won't handle exceptions correctly. BSONArrayBuilder resultsArray; const int byteLimit = MaxBytesToReturnToClientAtOnce; BSONObj next; for (int objCount = 0; objCount < batchSize; objCount++) { // The initial getNext() on a PipelineProxyStage may be very expensive so we don't // do it when batchSize is 0 since that indicates a desire for a fast return. if (exec->getNext(&next, NULL) != PlanExecutor::ADVANCED) { if (pin) pin->deleteUnderlying(); // make it an obvious error to use cursor or executor after this point cursor = NULL; exec = NULL; break; } if (resultsArray.len() + next.objsize() > byteLimit) { // Get the pipeline proxy stage wrapped by this PlanExecutor. PipelineProxyStage* proxy = static_cast<PipelineProxyStage*>(exec->getRootStage()); // too big. next will be the first doc in the second batch proxy->pushBack(next); break; } resultsArray.append(next); } // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should // be relatively quick since if there was no pin then the input is empty. Also, this // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that // case. This is ok for now however, since you can't have a sharded collection that doesn't // exist. const bool canReturnMoreBatches = pin; if (!canReturnMoreBatches && exec && !exec->isEOF()) { // msgasserting since this shouldn't be possible to trigger from today's aggregation // language. The wording assumes that the only reason pin would be null is if the // collection doesn't exist. msgasserted(17391, str::stream() << "Aggregation has more results than fit in initial batch, but can't " << "create cursor since collection " << ns << " doesn't exist"); } if (cursor) { // If a time limit was set on the pipeline, remaining time is "rolled over" to the // cursor (for use by future getmore ops). cursor->setLeftoverMaxTimeMicros( txn->getCurOp()->getRemainingMaxTimeMicros() ); // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. cursor->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); txn->setRecoveryUnit(storageEngine->newRecoveryUnit()); // Cursor needs to be in a saved state while we yield locks for getmore. State // will be restored in getMore(). exec->saveState(); } BSONObjBuilder cursorObj(result.subobjStart("cursor")); cursorObj.append("id", cursor ? cursor->cursorid() : 0LL); cursorObj.append("ns", ns); cursorObj.append("firstBatch", resultsArray.arr()); cursorObj.done(); }
/** * Returns true if we need to keep a ClientCursor saved for this pipeline (for future getMore * requests). Otherwise, returns false. */ static bool handleCursorCommand(OperationContext* txn, const string& ns, ClientCursorPin* pin, PlanExecutor* exec, const BSONObj& cmdObj, BSONObjBuilder& result) { ClientCursor* cursor = pin ? pin->c() : NULL; if (pin) { invariant(cursor); invariant(cursor->getExecutor() == exec); invariant(cursor->isAggCursor()); } const long long defaultBatchSize = 101; // Same as query. long long batchSize; uassertStatusOK(Command::parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize)); // can't use result BSONObjBuilder directly since it won't handle exceptions correctly. BSONArrayBuilder resultsArray; BSONObj next; for (int objCount = 0; objCount < batchSize; objCount++) { // The initial getNext() on a PipelineProxyStage may be very expensive so we don't // do it when batchSize is 0 since that indicates a desire for a fast return. PlanExecutor::ExecState state; if ((state = exec->getNext(&next, NULL)) == PlanExecutor::IS_EOF) { // make it an obvious error to use cursor or executor after this point cursor = NULL; exec = NULL; break; } uassert(34426, "Plan executor error during aggregation: " + WorkingSetCommon::toStatusString(next), PlanExecutor::ADVANCED == state); // If adding this object will cause us to exceed the message size limit, then we stash it // for later. if (!FindCommon::haveSpaceForNext(next, objCount, resultsArray.len())) { exec->enqueue(next); break; } resultsArray.append(next); } // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should // be relatively quick since if there was no pin then the input is empty. Also, this // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that // case. This is ok for now however, since you can't have a sharded collection that doesn't // exist. const bool canReturnMoreBatches = pin; if (!canReturnMoreBatches && exec && !exec->isEOF()) { // msgasserting since this shouldn't be possible to trigger from today's aggregation // language. The wording assumes that the only reason pin would be null is if the // collection doesn't exist. msgasserted( 17391, str::stream() << "Aggregation has more results than fit in initial batch, but can't " << "create cursor since collection " << ns << " doesn't exist"); } if (cursor) { // If a time limit was set on the pipeline, remaining time is "rolled over" to the // cursor (for use by future getmore ops). cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros()); CurOp::get(txn)->debug().cursorid = cursor->cursorid(); // Cursor needs to be in a saved state while we yield locks for getmore. State // will be restored in getMore(). exec->saveState(); exec->detachFromOperationContext(); } const long long cursorId = cursor ? cursor->cursorid() : 0LL; appendCursorResponseObject(cursorId, ns, resultsArray.arr(), &result); return static_cast<bool>(cursor); }
virtual bool run(OperationContext* txn, const string &db, BSONObj &cmdObj, int options, string &errmsg, BSONObjBuilder &result, bool fromRepl) { string ns = parseNs(db, cmdObj); intrusive_ptr<ExpressionContext> pCtx = new ExpressionContext(InterruptStatusMongod::status, NamespaceString(ns)); pCtx->tempDir = storageGlobalParams.dbpath + "/_tmp"; /* try to parse the command; if this fails, then we didn't run */ intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx); if (!pPipeline.get()) return false; #if _DEBUG // This is outside of the if block to keep the object alive until the pipeline is finished. BSONObj parsed; if (!pPipeline->isExplain() && !pCtx->inShard) { // Make sure all operations round-trip through Pipeline::toBson() // correctly by reparsing every command on DEBUG builds. This is // important because sharded aggregations rely on this ability. // Skipping when inShard because this has already been through the // transformation (and this unsets pCtx->inShard). parsed = pPipeline->serialize().toBson(); pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx); verify(pPipeline); } #endif PipelineRunner* runner = NULL; scoped_ptr<ClientCursorPin> pin; // either this OR the runnerHolder will be non-null auto_ptr<PipelineRunner> runnerHolder; { // This will throw if the sharding version for this connection is out of date. The // lock must be held continuously from now until we have we created both the output // ClientCursor and the input Runner. This ensures that both are using the same // sharding version that we synchronize on here. This is also why we always need to // create a ClientCursor even when we aren't outputting to a cursor. See the comment // on ShardFilterStage for more details. Client::ReadContext ctx(ns); Collection* collection = ctx.ctx().db()->getCollection(ns); // This does mongod-specific stuff like creating the input Runner and adding to the // front of the pipeline if needed. boost::shared_ptr<Runner> input = PipelineD::prepareCursorSource(collection, pPipeline, pCtx); pPipeline->stitch(); runnerHolder.reset(new PipelineRunner(pPipeline, input)); runner = runnerHolder.get(); if (!collection && input) { // If we don't have a collection, we won't be able to register any Runners, so // make sure that the input Runner (likely an EOFRunner) doesn't need to be // registered. invariant(!input->collection()); } if (collection) { ClientCursor* cursor = new ClientCursor(collection, runnerHolder.release()); cursor->isAggCursor = true; // enable special locking behavior pin.reset(new ClientCursorPin(collection, cursor->cursorid())); // Don't add any code between here and the start of the try block. } } try { // Unless set to true, the ClientCursor created above will be deleted on block exit. bool keepCursor = false; // If both explain and cursor are specified, explain wins. if (pPipeline->isExplain()) { result << "stages" << Value(pPipeline->writeExplainOps()); } else if (isCursorCommand(cmdObj)) { handleCursorCommand(ns, pin.get(), runner, cmdObj, result); keepCursor = true; } else { pPipeline->run(result); } if (!keepCursor && pin) pin->deleteUnderlying(); } catch (...) { // Clean up cursor on way out of scope. if (pin) pin->deleteUnderlying(); throw; } // Any code that needs the cursor pinned must be inside the try block, above. return true; }