virtual bool run(const string &db, BSONObj &cmdObj, int options, string &errmsg, BSONObjBuilder &result, bool fromRepl) { string ns = parseNs(db, cmdObj); intrusive_ptr<ExpressionContext> pCtx = new ExpressionContext(InterruptStatusMongod::status, NamespaceString(ns)); /* try to parse the command; if this fails, then we didn't run */ intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx); if (!pPipeline.get()) return false; if (pPipeline->getSplitMongodPipeline()) { // This is only used in testing return executeSplitPipeline(result, errmsg, ns, db, pPipeline, pCtx); } #if _DEBUG // This is outside of the if block to keep the object alive until the pipeline is finished. BSONObj parsed; if (!pPipeline->isExplain() && !pCtx->inShard) { // Make sure all operations round-trip through Pipeline::toBson() // correctly by reparsing every command on DEBUG builds. This is // important because sharded aggregations rely on this ability. // Skipping when inShard because this has already been through the // transformation (and this unsets pCtx->inShard). parsed = pPipeline->serialize().toBson(); pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx); verify(pPipeline); } #endif // This does the mongod-specific stuff like creating a cursor PipelineD::prepareCursorSource(pPipeline, nsToDatabase(ns), pCtx); pPipeline->stitch(); if (isCursorCommand(cmdObj)) { CursorId id; { // Set up cursor Client::ReadContext ctx(ns); shared_ptr<Cursor> cursor(new PipelineCursor(pPipeline)); // cc will be owned by cursor manager ClientCursor* cc = new ClientCursor(0, cursor, ns, cmdObj.getOwned()); id = cc->cursorid(); } handleCursorCommand(id, cmdObj, result); } else { pPipeline->run(result); } if (DocumentSourceOut* out = dynamic_cast<DocumentSourceOut*>(pPipeline->output())) { result.append("outputNs", out->getOutputNs()); } return true; }
void run() { OldClientWriteContext ctx(&_txn, nss.ns()); insert(BSON("a" << 1 << "b" << 1)); Collection* collection = ctx.getCollection(); BSONObj filterObj = fromjson("{_id: {$gt: 0}, b: {$gt: 0}}"); PlanExecutor* exec = makeCollScanExec(collection, filterObj); // Make a client cursor from the runner. ClientCursor* cc = new ClientCursor(collection->getCursorManager(), exec, nss.ns(), false, 0, BSONObj()); ClientCursorPin ccPin(collection->getCursorManager(), cc->cursorid()); // If the cursor is pinned, it sticks around, // even after invalidation. ASSERT_EQUALS(1U, numCursors()); const std::string invalidateReason("InvalidatePinned Test"); collection->getCursorManager()->invalidateAll(false, invalidateReason); ASSERT_EQUALS(1U, numCursors()); // The invalidation should have killed the runner. BSONObj objOut; ASSERT_EQUALS(PlanExecutor::DEAD, exec->getNext(&objOut, NULL)); ASSERT(WorkingSetCommon::isValidStatusMemberObject(objOut)); const Status status = WorkingSetCommon::getMemberObjectStatus(objOut); ASSERT(status.reason().find(invalidateReason) != string::npos); // Deleting the underlying cursor should cause the // number of cursors to return to 0. ccPin.deleteUnderlying(); ASSERT_EQUALS(0U, numCursors()); }
void run() { Client::WriteContext ctx(&_txn, ns()); insert(BSON("a" << 1 << "b" << 1)); Collection* collection = ctx.ctx().db()->getCollection(&_txn, ns()); BSONObj filterObj = fromjson("{_id: {$gt: 0}, b: {$gt: 0}}"); SingleSolutionRunner* ssr = makeCollScanRunner(ctx.ctx(),filterObj); // Make a client cursor from the runner. ClientCursor* cc = new ClientCursor(collection, ssr, 0, BSONObj()); ClientCursorPin ccPin(collection,cc->cursorid()); // If the cursor is pinned, it sticks around, // even after invalidation. ASSERT_EQUALS(1U, numCursors()); collection->cursorCache()->invalidateAll(false); ASSERT_EQUALS(1U, numCursors()); // The invalidation should have killed the runner. BSONObj objOut; ASSERT_EQUALS(Runner::RUNNER_DEAD, ssr->getNext(&objOut, NULL)); // Deleting the underlying cursor should cause the // number of cursors to return to 0. ccPin.deleteUnderlying(); ASSERT_EQUALS(0U, numCursors()); }
Value DocumentSourceCursor::serialize(bool explain) const { // we never parse a documentSourceCursor, so we only serialize for explain if (!explain) return Value(); Lock::DBRead lk(_ns); Client::Context ctx(_ns, storageGlobalParams.dbpath, /*doVersion=*/false); ClientCursorPin pin(_cursorId); ClientCursor* cursor = pin.c(); uassert(17135, "Cursor deleted. Was the collection or database dropped?", cursor); Runner* runner = cursor->getRunner(); runner->restoreState(); return Value(DOC(getSourceName() << DOC("query" << Value(_query) << "sort" << (!_sort.isEmpty() ? Value(_sort) : Value()) << "limit" << (_limit ? Value(_limit->getLimit()) : Value()) << "fields" << (!_projection.isEmpty() ? Value(_projection) : Value()) // << "indexOnly" << canUseCoveredIndex(cursor) // << "cursorType" << cursor->c()->toString() ))); // TODO get more plan information }
bool ClientCursor::eraseIfAuthorized(CursorId id) { std::string ns; { recursive_scoped_lock lock(ccmutex); ClientCursor* cursor = find_inlock(id); if (!cursor) { return false; } ns = cursor->ns(); } // Can't be in a lock when checking authorization if (!cc().getAuthorizationManager()->checkAuthorization(ns, ActionType::killCursors)) { return false; } // It is safe to lookup the cursor again after temporarily releasing the mutex because // of 2 invariants: that the cursor ID won't be re-used in a short period of time, and that // the namespace associated with a cursor cannot change. recursive_scoped_lock lock(ccmutex); ClientCursor* cursor = find_inlock(id); if (!cursor) { // Cursor was deleted in another thread since we found it earlier in this function. return false; } if (cursor->ns() != ns) { warning() << "Cursor namespace changed. Previous ns: " << ns << ", current ns: " << cursor->ns() << endl; return false; } return _erase_inlock(cursor); }
static void handleCursorCommand(CursorId id, BSONObj& cmdObj, BSONObjBuilder& result) { BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize"); const long long batchSize = batchSizeElem.isNumber() ? batchSizeElem.numberLong() : 101; // same as query ClientCursorPin pin(id); ClientCursor* cursor = pin.c(); massert(16958, "Cursor shouldn't have been deleted", cursor); verify(cursor->isAggCursor); PipelineRunner* runner = dynamic_cast<PipelineRunner*>(cursor->getRunner()); verify(runner); try { const string cursorNs = cursor->ns(); // we need this after cursor may have been deleted // can't use result BSONObjBuilder directly since it won't handle exceptions correctly. BSONArrayBuilder resultsArray; const int byteLimit = MaxBytesToReturnToClientAtOnce; BSONObj next; for (int objCount = 0; objCount < batchSize; objCount++) { // The initial getNext() on a PipelineRunner may be very expensive so we don't do it // when batchSize is 0 since that indicates a desire for a fast return. if (runner->getNext(&next, NULL) != Runner::RUNNER_ADVANCED) { pin.deleteUnderlying(); id = 0; cursor = NULL; // make it an obvious error to use cursor after this point break; } if (resultsArray.len() + next.objsize() > byteLimit) { // too big. next will be the first doc in the second batch runner->pushBack(next); break; } resultsArray.append(next); } if (cursor) { // If a time limit was set on the pipeline, remaining time is "rolled over" to the // cursor (for use by future getmore ops). cursor->setLeftoverMaxTimeMicros( cc().curop()->getRemainingMaxTimeMicros() ); } BSONObjBuilder cursorObj(result.subobjStart("cursor")); cursorObj.append("id", id); cursorObj.append("ns", cursorNs); cursorObj.append("firstBatch", resultsArray.arr()); cursorObj.done(); } catch (...) { // Clean up cursor on way out of scope. pin.deleteUnderlying(); throw; } }
void DocumentSourceCursor::loadBatch() { if (!_cursorId) { dispose(); return; } // We have already validated the sharding version when we constructed the cursor // so we shouldn't check it again. Lock::DBRead lk(_ns); Client::Context ctx(_ns, storageGlobalParams.dbpath, /*doVersion=*/false); ClientCursorPin pin(_cursorId); ClientCursor* cursor = pin.c(); uassert(16950, "Cursor deleted. Was the collection or database dropped?", cursor); Runner* runner = cursor->getRunner(); runner->restoreState(); int memUsageBytes = 0; BSONObj obj; Runner::RunnerState state; while ((state = runner->getNext(&obj, NULL)) == Runner::RUNNER_ADVANCED) { // TODO SERVER-11831: consider using documentFromBsonWithDeps(obj, _dependencies) _currentBatch.push_back(Document(obj)); if (_limit) { if (++_docsAddedToBatches == _limit->getLimit()) { break; } verify(_docsAddedToBatches < _limit->getLimit()); } memUsageBytes += _currentBatch.back().getApproximateSize(); if (memUsageBytes > MaxBytesToReturnToClientAtOnce) { // End this batch and prepare cursor for yielding. runner->saveState(); cc().curop()->yielded(); return; } } // If we got here, there won't be any more documents, so destroy the cursor and runner. _cursorId = 0; pin.deleteUnderlying(); uassert(16028, "collection or index disappeared when cursor yielded", state != Runner::RUNNER_DEAD); uassert(17285, "cursor encountered an error", state != Runner::RUNNER_ERROR); massert(17286, str::stream() << "Unexpected return from Runner::getNext: " << state, state == Runner::RUNNER_EOF || state == Runner::RUNNER_ADVANCED); }
/* must call this on a delete so we clean up the cursors. */ void ClientCursor::aboutToDelete(const DiskLoc& dl) { recursive_scoped_lock lock(ccmutex); Database *db = cc().database(); assert(db); aboutToDeleteForSharding( db , dl ); CCByLoc& bl = db->ccByLoc; CCByLoc::iterator j = bl.lower_bound(dl); CCByLoc::iterator stop = bl.upper_bound(dl); if ( j == stop ) return; vector<ClientCursor*> toAdvance; while ( 1 ) { toAdvance.push_back(j->second); DEV assert( j->first == dl ); ++j; if ( j == stop ) break; } wassert( toAdvance.size() < 5000 ); for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ){ ClientCursor* cc = *i; wassert(cc->_db == db); if ( cc->_doingDeletes ) continue; Cursor *c = cc->c.get(); if ( c->capped() ){ delete cc; continue; } c->checkLocation(); DiskLoc tmp1 = c->refLoc(); if ( tmp1 != dl ) { /* this might indicate a failure to call ClientCursor::updateLocation() */ problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl; } c->advance(); if ( c->eof() ) { // advanced to end // leave ClientCursor in place so next getMore doesn't fail // still need to mark new location though cc->updateLocation(); } else { wassert( c->refLoc() != dl ); cc->updateLocation(); } } }
virtual bool run(const string &db, BSONObj &cmdObj, int options, string &errmsg, BSONObjBuilder &result, bool fromRepl) { string ns = parseNs(db, cmdObj); intrusive_ptr<ExpressionContext> pCtx = new ExpressionContext(InterruptStatusMongod::status, NamespaceString(ns)); pCtx->tempDir = storageGlobalParams.dbpath + "/_tmp"; /* try to parse the command; if this fails, then we didn't run */ intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx); if (!pPipeline.get()) return false; #if _DEBUG // This is outside of the if block to keep the object alive until the pipeline is finished. BSONObj parsed; if (!pPipeline->isExplain() && !pCtx->inShard) { // Make sure all operations round-trip through Pipeline::toBson() // correctly by reparsing every command on DEBUG builds. This is // important because sharded aggregations rely on this ability. // Skipping when inShard because this has already been through the // transformation (and this unsets pCtx->inShard). parsed = pPipeline->serialize().toBson(); pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx); verify(pPipeline); } #endif // This does the mongod-specific stuff like creating a cursor PipelineD::prepareCursorSource(pPipeline, pCtx); pPipeline->stitch(); if (pPipeline->isExplain()) { result << "stages" << Value(pPipeline->writeExplainOps()); return true; // don't do any actual execution } if (isCursorCommand(cmdObj)) { CursorId id; { // Set up cursor Client::ReadContext ctx(ns); ClientCursor* cc = new ClientCursor(new PipelineRunner(pPipeline)); cc->isAggCursor = true; // enable special locking and ns deletion behavior id = cc->cursorid(); } handleCursorCommand(id, cmdObj, result); } else { pPipeline->run(result); } return true; }
bool ClientCursor::erase( CursorId id ) { recursive_scoped_lock lock( ccmutex ); ClientCursor *cursor = find_inlock( id ); if ( ! cursor ) return false; if ( ! cc().getAuthenticationInfo()->isAuthorizedReads( nsToDatabase( cursor->ns() ) ) ) return false; assert( cursor->_pinValue < 100 ); // mustn't have an active ClientCursor::Pointer delete cursor; return true; }
void ClientCursor::idleTimeReport(unsigned millis) { bool foundSomeToTimeout = false; // two passes so that we don't need to readlock unless we really do some timeouts // we assume here that incrementing _idleAgeMillis outside readlock is ok. { recursive_scoped_lock lock(ccmutex); { unsigned sz = clientCursorsById.size(); static time_t last; if( sz >= 100000 ) { if( time(0) - last > 300 ) { last = time(0); log() << "warning number of open cursors is very large: " << sz << endl; } } } for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) { CCById::iterator j = i; i++; if( j->second->shouldTimeout( millis ) ) { foundSomeToTimeout = true; } } } if( foundSomeToTimeout ) { Lock::GlobalRead lk; recursive_scoped_lock cclock(ccmutex); CCById::const_iterator it = clientCursorsById.begin(); while (it != clientCursorsById.end()) { ClientCursor* cc = it->second; if( cc->shouldTimeout(0) ) { numberTimedOut++; LOG(1) << "killing old cursor " << cc->_cursorid << ' ' << cc->_ns << " idle:" << cc->idleTime() << "ms\n"; ClientCursor* toDelete = it->second; CursorId id = toDelete->cursorid(); // This is what winds up removing it from the map. delete toDelete; it = clientCursorsById.upper_bound(id); } else { ++it; } } } }
Value DocumentSourceCursor::serialize(bool explain) const { // we never parse a documentSourceCursor, so we only serialize for explain if (!explain) return Value(); Status explainStatus(ErrorCodes::InternalError, ""); scoped_ptr<TypeExplain> plan; { Lock::DBRead lk(_ns); Client::Context ctx(_ns, storageGlobalParams.dbpath, /*doVersion=*/false); ClientCursorPin pin(_cursorId); ClientCursor* cursor = pin.c(); uassert(17135, "Cursor deleted. Was the collection or database dropped?", cursor); Runner* runner = cursor->getRunner(); runner->restoreState(); TypeExplain* explainRaw; explainStatus = runner->getExplainPlan(&explainRaw); if (explainStatus.isOK()) plan.reset(explainRaw); runner->saveState(); } MutableDocument out; out["query"] = Value(_query); if (!_sort.isEmpty()) out["sort"] = Value(_sort); if (_limit) out["limit"] = Value(_limit->getLimit()); if (!_projection.isEmpty()) out["fields"] = Value(_projection); if (explainStatus.isOK()) { out["plan"] = Value(extractInfo(plan)); } else { out["planError"] = Value(explainStatus.toString()); } return out.freezeToValue(); }
bool ClientCursor::eraseIfAuthorized(CursorId id) { recursive_scoped_lock lock(ccmutex); ClientCursor* cursor = find_inlock(id); if (!cursor) { return false; } if (!cc().getAuthorizationManager()->checkAuthorization(cursor->ns(), ActionType::find) || !cc().getAuthenticationInfo()->isAuthorizedReads(nsToDatabase(cursor->ns()))) { return false; } return _erase_inlock(cursor); }
/* must call this on a delete so we clean up the cursors. */ void ClientCursor::aboutToDelete(const DiskLoc& dl) { recursive_scoped_lock lock(ccmutex); CCByLoc::iterator j = byLoc.lower_bound(dl); CCByLoc::iterator stop = byLoc.upper_bound(dl); if ( j == stop ) return; vector<ClientCursor*> toAdvance; while ( 1 ) { toAdvance.push_back(j->second); DEV assert( j->first == dl ); ++j; if ( j == stop ) break; } wassert( toAdvance.size() < 5000 ); for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ){ ClientCursor* cc = *i; if ( cc->_doingDeletes ) continue; Cursor *c = cc->c.get(); if ( c->capped() ){ delete cc; continue; } c->checkLocation(); DiskLoc tmp1 = c->refLoc(); if ( tmp1 != dl ) { /* this might indicate a failure to call ClientCursor::updateLocation() */ problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl; } c->advance(); if ( c->eof() ) { // advanced to end -- delete cursor delete cc; } else { wassert( c->refLoc() != dl ); cc->updateLocation(); } } }
/** * Called via a ScopeGuard on early return in order to ensure that the ClientCursor gets * cleaned up properly. */ static void cleanupCursor(OperationContext* txn, ClientCursorPin* ccPin, const GetMoreRequest& request) { ClientCursor* cursor = ccPin->c(); std::unique_ptr<Lock::DBLock> unpinDBLock; std::unique_ptr<Lock::CollectionLock> unpinCollLock; if (cursor->isAggCursor()) { unpinDBLock.reset(new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS)); unpinCollLock.reset( new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS)); } ccPin->deleteUnderlying(); }
bool ClientCursor::eraseIfAuthorized(CursorId id) { NamespaceString ns; { recursive_scoped_lock lock(ccmutex); ClientCursor* cursor = find_inlock(id); if (!cursor) { audit::logKillCursorsAuthzCheck( &cc(), NamespaceString(), id, ErrorCodes::CursorNotFound); return false; } ns = NamespaceString(cursor->ns()); } // Can't be in a lock when checking authorization const bool isAuthorized = cc().getAuthorizationSession()->isAuthorizedForActionsOnNamespace( ns, ActionType::killCursors); audit::logKillCursorsAuthzCheck( &cc(), ns, id, isAuthorized ? ErrorCodes::OK : ErrorCodes::Unauthorized); if (!isAuthorized) { return false; } // It is safe to lookup the cursor again after temporarily releasing the mutex because // of 2 invariants: that the cursor ID won't be re-used in a short period of time, and that // the namespace associated with a cursor cannot change. recursive_scoped_lock lock(ccmutex); ClientCursor* cursor = find_inlock(id); if (!cursor) { // Cursor was deleted in another thread since we found it earlier in this function. return false; } if (ns != cursor->ns()) { warning() << "Cursor namespace changed. Previous ns: " << ns << ", current ns: " << cursor->ns() << endl; return false; } _erase_inlock(cursor); return true; }
bool ClientCursor::erase( CursorId id ) { recursive_scoped_lock lock( ccmutex ); ClientCursor *cursor = find_inlock( id ); if ( ! cursor ) return false; if ( ! cc().getAuthenticationInfo()->isAuthorizedReads( nsToDatabase( cursor->ns() ) ) ) return false; // Must not have an active ClientCursor::Pin. massert( 16089, str::stream() << "Cannot kill active cursor " << id, cursor->_pinValue < 100 ); delete cursor; return true; }
/* called every 4 seconds. millis is amount of idle time passed since the last call -- could be zero */ void ClientCursor::idleTimeReport(unsigned millis) { LockedIterator i; unsigned sz = clientCursorsById.size(); if (sz >= 100000) { RATELIMITED(300000) log() << "warning number of open cursors is very large: " << sz << endl; } while (i.ok()) { ClientCursor *cc = i.current(); if (cc->shouldTimeout(millis)) { LOG(1) << "killing old cursor " << cc->_cursorid << ' ' << cc->_ns << " idle:" << cc->idleTime() << "ms" << endl; i.deleteAndAdvance(); } else { i.advance(); } } }
/* called every 4 seconds. millis is amount of idle time passed since the last call -- could be zero */ void ClientCursor::idleTimeReport(unsigned millis) { bool foundSomeToTimeout = false; // two passes so that we don't need to readlock unless we really do some timeouts // we assume here that incrementing _idleAgeMillis outside readlock is ok. { recursive_scoped_lock lock(ccmutex); { unsigned sz = clientCursorsById.size(); static time_t last; if( sz >= 100000 ) { if( time(0) - last > 300 ) { last = time(0); log() << "warning number of open cursors is very large: " << sz << endl; } } } for ( CCById::iterator i = clientCursorsById.begin(); i != clientCursorsById.end(); ) { CCById::iterator j = i; i++; if( j->second->shouldTimeout( millis ) ) { foundSomeToTimeout = true; } } } if( foundSomeToTimeout ) { // todo: ideally all readlocks automatically note what we are locking for so this // can be reported in currentop command. e.g. something like: // readlock lk("", "timeout cursors"); readlock lk(""); for( LockedIterator i; i.ok(); ) { ClientCursor *cc = i.current(); if( cc->shouldTimeout(0) ) { numberTimedOut++; LOG(1) << "killing old cursor " << cc->_cursorid << ' ' << cc->_ns << " idle:" << cc->idleTime() << "ms\n"; i.deleteAndAdvance(); } else { i.advance(); } } } }
static void handleCursorCommand(CursorId id, BSONObj& cmdObj, BSONObjBuilder& result) { BSONElement batchSizeElem = cmdObj.getFieldDotted("cursor.batchSize"); const long long batchSize = batchSizeElem.isNumber() ? batchSizeElem.numberLong() : 101; // same as query // Using limited cursor API that ignores many edge cases. Should be sufficient for commands. ClientCursor::Pin pin(id); ClientCursor* cursor = pin.c(); massert(16958, "Cursor shouldn't have been deleted", cursor); // Make sure this cursor won't disappear on us fassert(16959, !cursor->c()->shouldDestroyOnNSDeletion()); fassert(16960, !cursor->c()->requiresLock()); try { // can't use result BSONObjBuilder directly since it won't handle exceptions correctly. BSONArrayBuilder resultsArray; const int byteLimit = MaxBytesToReturnToClientAtOnce; for (int objs = 0; objs < batchSize && cursor->ok() && resultsArray.len() <= byteLimit; objs++) { // TODO may need special logic if cursor->current() would cause results to be > 16MB resultsArray.append(cursor->current()); cursor->advance(); } // The initial ok() on a cursor may be very expensive so we don't do it when batchSize // is 0 since that indicates a desire for a fast return. if (batchSize != 0 && !cursor->ok()) { // There is no more data. Kill the cursor. pin.release(); ClientCursor::erase(id); id = 0; } BSONObjBuilder cursorObj(result.subobjStart("cursor")); cursorObj.append("id", id); cursorObj.append("ns", cursor->ns()); cursorObj.append("firstBatch", resultsArray.arr()); cursorObj.done(); } catch (...) { // Clean up cursor on way out of scope. pin.release(); ClientCursor::erase(id); throw; } }
// ns is either a full namespace or "dbname." when invalidating for a whole db void ClientCursor::invalidate(const StringData &ns) { Lock::assertWriteLocked(ns); size_t dotpos = ns.find('.'); verify(dotpos != string::npos); bool isDB = (dotpos + 1) == ns.size(); // first (and only) dot is the last char { //cout << "\nTEMP invalidate " << ns << endl; Database *db = cc().database(); verify(db); verify( ns.startsWith(db->name()) ); for( LockedIterator i; i.ok(); ) { ClientCursor *cc = i.current(); bool shouldDelete = false; if (cc->c()->shouldDestroyOnNSDeletion() && cc->_db == db) { if (isDB) { // already checked that db matched above dassert( StringData(cc->_ns).startsWith(ns) ); shouldDelete = true; } else { if ( ns == cc->_ns ) shouldDelete = true; } } if ( shouldDelete ) { i.deleteAndAdvance(); } else { i.advance(); } } } }
/** * Returns true if we need to keep a ClientCursor saved for this pipeline (for future getMore * requests). Otherwise, returns false. */ static bool handleCursorCommand(OperationContext* txn, const string& ns, ClientCursorPin* pin, PlanExecutor* exec, const BSONObj& cmdObj, BSONObjBuilder& result) { ClientCursor* cursor = pin ? pin->c() : NULL; if (pin) { invariant(cursor); invariant(cursor->getExecutor() == exec); invariant(cursor->isAggCursor()); } const long long defaultBatchSize = 101; // Same as query. long long batchSize; uassertStatusOK(Command::parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize)); // can't use result BSONObjBuilder directly since it won't handle exceptions correctly. BSONArrayBuilder resultsArray; BSONObj next; for (int objCount = 0; objCount < batchSize; objCount++) { // The initial getNext() on a PipelineProxyStage may be very expensive so we don't // do it when batchSize is 0 since that indicates a desire for a fast return. PlanExecutor::ExecState state; if ((state = exec->getNext(&next, NULL)) == PlanExecutor::IS_EOF) { // make it an obvious error to use cursor or executor after this point cursor = NULL; exec = NULL; break; } uassert(34426, "Plan executor error during aggregation: " + WorkingSetCommon::toStatusString(next), PlanExecutor::ADVANCED == state); // If adding this object will cause us to exceed the message size limit, then we stash it // for later. if (!FindCommon::haveSpaceForNext(next, objCount, resultsArray.len())) { exec->enqueue(next); break; } resultsArray.append(next); } // NOTE: exec->isEOF() can have side effects such as writing by $out. However, it should // be relatively quick since if there was no pin then the input is empty. Also, this // violates the contract for batchSize==0. Sharding requires a cursor to be returned in that // case. This is ok for now however, since you can't have a sharded collection that doesn't // exist. const bool canReturnMoreBatches = pin; if (!canReturnMoreBatches && exec && !exec->isEOF()) { // msgasserting since this shouldn't be possible to trigger from today's aggregation // language. The wording assumes that the only reason pin would be null is if the // collection doesn't exist. msgasserted( 17391, str::stream() << "Aggregation has more results than fit in initial batch, but can't " << "create cursor since collection " << ns << " doesn't exist"); } if (cursor) { // If a time limit was set on the pipeline, remaining time is "rolled over" to the // cursor (for use by future getmore ops). cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros()); CurOp::get(txn)->debug().cursorid = cursor->cursorid(); // Cursor needs to be in a saved state while we yield locks for getmore. State // will be restored in getMore(). exec->saveState(); exec->detachFromOperationContext(); } const long long cursorId = cursor ? cursor->cursorid() : 0LL; appendCursorResponseObject(cursorId, ns, resultsArray.arr(), &result); return static_cast<bool>(cursor); }
virtual bool run(OperationContext* txn, const string& db, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result) { const std::string ns = parseNs(db, cmdObj); if (nsToCollectionSubstring(ns).empty()) { errmsg = "missing collection name"; return false; } NamespaceString nss(ns); intrusive_ptr<ExpressionContext> pCtx = new ExpressionContext(txn, nss); pCtx->tempDir = storageGlobalParams.dbpath + "/_tmp"; /* try to parse the command; if this fails, then we didn't run */ intrusive_ptr<Pipeline> pPipeline = Pipeline::parseCommand(errmsg, cmdObj, pCtx); if (!pPipeline.get()) return false; // Save and reset the write concern so that it doesn't get changed accidentally by // DBDirectClient. auto oldWC = txn->getWriteConcern(); ON_BLOCK_EXIT([txn, oldWC] { txn->setWriteConcern(oldWC); }); // This is outside of the if block to keep the object alive until the pipeline is finished. BSONObj parsed; if (kDebugBuild && !pPipeline->isExplain() && !pCtx->inShard) { // Make sure all operations round-trip through Pipeline::toBson() correctly by // reparsing every command in debug builds. This is important because sharded // aggregations rely on this ability. Skipping when inShard because this has // already been through the transformation (and this unsets pCtx->inShard). parsed = pPipeline->serialize().toBson(); pPipeline = Pipeline::parseCommand(errmsg, parsed, pCtx); verify(pPipeline); } unique_ptr<ClientCursorPin> pin; // either this OR the exec will be non-null unique_ptr<PlanExecutor> exec; { // This will throw if the sharding version for this connection is out of date. The // lock must be held continuously from now until we have we created both the output // ClientCursor and the input executor. This ensures that both are using the same // sharding version that we synchronize on here. This is also why we always need to // create a ClientCursor even when we aren't outputting to a cursor. See the comment // on ShardFilterStage for more details. AutoGetCollectionForRead ctx(txn, nss.ns()); Collection* collection = ctx.getCollection(); // This does mongod-specific stuff like creating the input PlanExecutor and adding // it to the front of the pipeline if needed. std::shared_ptr<PlanExecutor> input = PipelineD::prepareCursorSource(txn, collection, nss, pPipeline, pCtx); pPipeline->stitch(); if (collection && input) { // Record the indexes used by the input executor. Retrieval of summary stats for a // PlanExecutor is normally done post execution. DocumentSourceCursor however will // destroy the input PlanExecutor once the result set has been exhausted. For // that reason we need to collect the indexes used prior to plan execution. PlanSummaryStats stats; Explain::getSummaryStats(*input, &stats); collection->infoCache()->notifyOfQuery(txn, stats.indexesUsed); // TODO SERVER-23265: Confirm whether this is the correct place to gather all // metrics. There is no harm adding here for the time being. CurOp::get(txn)->debug().setPlanSummaryMetrics(stats); } // Create the PlanExecutor which returns results from the pipeline. The WorkingSet // ('ws') and the PipelineProxyStage ('proxy') will be owned by the created // PlanExecutor. auto ws = make_unique<WorkingSet>(); auto proxy = make_unique<PipelineProxyStage>(txn, pPipeline, input, ws.get()); auto statusWithPlanExecutor = (NULL == collection) ? PlanExecutor::make( txn, std::move(ws), std::move(proxy), nss.ns(), PlanExecutor::YIELD_MANUAL) : PlanExecutor::make( txn, std::move(ws), std::move(proxy), collection, PlanExecutor::YIELD_MANUAL); invariant(statusWithPlanExecutor.isOK()); exec = std::move(statusWithPlanExecutor.getValue()); if (!collection && input) { // If we don't have a collection, we won't be able to register any executors, so // make sure that the input PlanExecutor (likely wrapping an EOFStage) doesn't // need to be registered. invariant(!input->collection()); } if (collection) { const bool isAggCursor = true; // enable special locking behavior ClientCursor* cursor = new ClientCursor(collection->getCursorManager(), exec.release(), nss.ns(), txn->recoveryUnit()->isReadingFromMajorityCommittedSnapshot(), 0, cmdObj.getOwned(), isAggCursor); pin.reset(new ClientCursorPin(collection->getCursorManager(), cursor->cursorid())); // Don't add any code between here and the start of the try block. } // At this point, it is safe to release the collection lock. // - In the case where we have a collection: we will need to reacquire the // collection lock later when cleaning up our ClientCursorPin. // - In the case where we don't have a collection: our PlanExecutor won't be // registered, so it will be safe to clean it up outside the lock. invariant(NULL == exec.get() || NULL == exec->collection()); } try { // Unless set to true, the ClientCursor created above will be deleted on block exit. bool keepCursor = false; const bool isCursorCommand = !cmdObj["cursor"].eoo(); // If both explain and cursor are specified, explain wins. if (pPipeline->isExplain()) { result << "stages" << Value(pPipeline->writeExplainOps()); } else if (isCursorCommand) { keepCursor = handleCursorCommand(txn, nss.ns(), pin.get(), pin ? pin->c()->getExecutor() : exec.get(), cmdObj, result); } else { pPipeline->run(result); } // Clean up our ClientCursorPin, if needed. We must reacquire the collection lock // in order to do so. if (pin) { // We acquire locks here with DBLock and CollectionLock instead of using // AutoGetCollectionForRead. AutoGetCollectionForRead will throw if the // sharding version is out of date, and we don't care if the sharding version // has changed. Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS); if (keepCursor) { pin->release(); } else { pin->deleteUnderlying(); } } } catch (...) { // On our way out of scope, we clean up our ClientCursorPin if needed. if (pin) { Lock::DBLock dbLock(txn->lockState(), nss.db(), MODE_IS); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_IS); pin->deleteUnderlying(); } throw; } // Any code that needs the cursor pinned must be inside the try block, above. return true; }
bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result) { BSONElement first = cmdObj.firstElement(); uassert(28528, str::stream() << "Argument to listIndexes must be of type String, not " << typeName(first.type()), first.type() == String); StringData collectionName = first.valueStringData(); uassert(28529, str::stream() << "Argument to listIndexes must be a collection name, " << "not the empty string", !collectionName.empty()); const NamespaceString ns(dbname, collectionName); const long long defaultBatchSize = std::numeric_limits<long long>::max(); long long batchSize; Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize); if (!parseCursorStatus.isOK()) { return appendCommandStatus(result, parseCursorStatus); } AutoGetCollectionForRead autoColl(txn, ns); if (!autoColl.getDb()) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no database")); } const Collection* collection = autoColl.getCollection(); if (!collection) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no collection")); } const CollectionCatalogEntry* cce = collection->getCatalogEntry(); invariant(cce); vector<string> indexNames; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexNames.clear(); cce->getAllIndexes(txn, &indexNames); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); std::unique_ptr<WorkingSet> ws(new WorkingSet()); std::unique_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get())); for (size_t i = 0; i < indexNames.size(); i++) { BSONObj indexSpec; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexSpec = cce->getIndexSpec(txn, indexNames[i]); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); WorkingSetID id = ws->allocate(); WorkingSetMember* member = ws->get(id); member->keyData.clear(); member->loc = RecordId(); member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned()); member->transitionToOwnedObj(); root->pushBack(id); } std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "." << ns.coll(); dassert(NamespaceString(cursorNamespace).isValid()); dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS()); dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes()); auto statusWithPlanExecutor = PlanExecutor::make( txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); BSONArrayBuilder firstBatch; const int byteLimit = MaxBytesToReturnToClientAtOnce; for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit; objCount++) { BSONObj next; PlanExecutor::ExecState state = exec->getNext(&next, NULL); if (state == PlanExecutor::IS_EOF) { break; } invariant(state == PlanExecutor::ADVANCED); firstBatch.append(next); } CursorId cursorId = 0LL; if (!exec->isEOF()) { exec->saveState(); ClientCursor* cursor = new ClientCursor( CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace); cursorId = cursor->cursorid(); } appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result); return true; }
void ClientCursor::LockedIterator::deleteAndAdvance() { ClientCursor *cc = current(); CursorId id = cc->cursorid(); delete cc; _i = clientCursorsById.upper_bound( id ); }
/* must call this on a delete so we clean up the cursors. */ void ClientCursor::aboutToDelete(const DiskLoc& dl) { NoPageFaultsAllowed npfa; recursive_scoped_lock lock(ccmutex); Database *db = cc().database(); verify(db); aboutToDeleteForSharding( db , dl ); CCByLoc& bl = db->ccByLoc; CCByLoc::iterator j = bl.lower_bound(ByLocKey::min(dl)); CCByLoc::iterator stop = bl.upper_bound(ByLocKey::max(dl)); if ( j == stop ) return; vector<ClientCursor*> toAdvance; while ( 1 ) { toAdvance.push_back(j->second); DEV verify( j->first.loc == dl ); ++j; if ( j == stop ) break; } if( toAdvance.size() >= 3000 ) { log() << "perf warning MPW101: " << toAdvance.size() << " cursors for one diskloc " << dl.toString() << ' ' << toAdvance[1000]->_ns << ' ' << toAdvance[2000]->_ns << ' ' << toAdvance[1000]->_pinValue << ' ' << toAdvance[2000]->_pinValue << ' ' << toAdvance[1000]->_pos << ' ' << toAdvance[2000]->_pos << ' ' << toAdvance[1000]->_idleAgeMillis << ' ' << toAdvance[2000]->_idleAgeMillis << ' ' << toAdvance[1000]->_doingDeletes << ' ' << toAdvance[2000]->_doingDeletes << endl; //wassert( toAdvance.size() < 5000 ); } for ( vector<ClientCursor*>::iterator i = toAdvance.begin(); i != toAdvance.end(); ++i ) { ClientCursor* cc = *i; wassert(cc->_db == db); if ( cc->_doingDeletes ) continue; Cursor *c = cc->_c.get(); if ( c->capped() ) { /* note we cannot advance here. if this condition occurs, writes to the oplog have "caught" the reader. skipping ahead, the reader would miss postentially important data. */ delete cc; continue; } c->recoverFromYield(); DiskLoc tmp1 = c->refLoc(); if ( tmp1 != dl ) { // This might indicate a failure to call ClientCursor::prepareToYield() but it can // also happen during correct operation, see SERVER-2009. problem() << "warning: cursor loc " << tmp1 << " does not match byLoc position " << dl << " !" << endl; } else { c->advance(); } while (!c->eof() && c->refLoc() == dl) { /* We don't delete at EOF because we want to return "no more results" rather than "no such cursor". * The loop is to handle MultiKey indexes where the deleted record is pointed to by multiple adjacent keys. * In that case we need to advance until we get to the next distinct record or EOF. * SERVER-4154 * SERVER-5198 * But see SERVER-5725. */ c->advance(); } cc->updateLocation(); } }
std::string newRunQuery(OperationContext* txn, Message& m, QueryMessage& q, CurOp& curop, Message &result, bool fromDBDirectClient) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder cmdResBuf; if (!runCommands(txn, ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } const NamespaceString nss(q.ns); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize( q, &cq, WhereCallbackReal(txn, StringData(nss.db()))); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } QLOG() << "Running query:\n" << cq->toString(); LOG(2) << "Running query: " << cq->toStringShort(); // Parse, canonicalize, plan, transcribe, and get a plan executor. PlanExecutor* rawExec = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); AutoGetCollectionForRead ctx(txn, nss); const int dbProfilingLevel = (ctx.getDb() != NULL) ? ctx.getDb()->getProfilingLevel() : serverGlobalParams.defaultProfile; Collection* collection = ctx.getCollection(); // We'll now try to get the query executor that will execute this query for us. There // are a few cases in which we know upfront which executor we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we use an EOFStage. // // (b) if the query is a replication's initial sync one, we use a specifically designed // stage that skips extents faster (see details in exec/oplogstart.h). // // Otherwise we go through the selection of which executor is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (NULL != collection && pq.getOptions().oplogReplay) { // Takes ownership of 'cq'. status = getOplogStartHack(txn, collection, cq, &rawExec); } else { size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } // Takes ownership of 'cq'. status = getExecutor(txn, collection, cq, PlanExecutor::YIELD_AUTO, &rawExec, options); } if (!status.isOK()) { // NOTE: Do not access cq as getExecutor has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawExec); auto_ptr<PlanExecutor> exec(rawExec); // If it's actually an explain, do the explain and return rather than falling through // to the normal query execution loop. if (pq.isExplain()) { BufBuilder bb; bb.skip(sizeof(QueryResult::Value)); BSONObjBuilder explainBob; Explain::explainStages(exec.get(), ExplainCommon::EXEC_ALL_PLANS, &explainBob); // Add the resulting object to the return buffer. BSONObj explainObj = explainBob.obj(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; // Set query result fields. QueryResult::View qr = bb.buf(); bb.decouple(); qr.setResultFlagsToOk(); qr.msgdata().setLen(bb.len()); curop.debug().responseLength = bb.len(); qr.msgdata().setOperation(opReply); qr.setCursorId(0); qr.setStartingFrom(0); qr.setNReturned(1); result.setData(qr.view2ptr(), true); return ""; } // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. bool slaveOK = pq.getOptions().slaveOk || pq.hasReadPref(); status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, NamespaceString(cq->ns()), slaveOK); uassertStatusOK(status); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult::Value)); // How many results have we obtained from the executor? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the PlanExecutor in a ClientCursor for getMore calls later? bool saveClientCursor = false; BSONObj obj; PlanExecutor::ExecState state; // uint64_t numMisplacedDocs = 0; // Get summary info about which plan the executor is using. curop.debug().planSummary = Explain::getPlanSummary(exec.get()); while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.getOptions().oplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " executor EOF=" << exec->isEOF() << endl; saveClientCursor = !exec->isEOF(); } break; } } // If we cache the executor later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the executor later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the executor. exec->deregisterExec(); // Caller expects exceptions thrown in certain cases. if (PlanExecutor::EXEC_ERROR == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17144, "Executor error: " + WorkingSetCommon::toStatusString(obj)); } // Why save a dead executor? if (PlanExecutor::DEAD == state) { saveClientCursor = false; } else if (pq.getOptions().tailable) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. if (collection && collection->numRecords(txn) != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } const logger::LogComponent queryLogComponent = logger::LogComponent::kQuery; const logger::LogSeverity logLevelOne = logger::LogSeverity::Debug(1); PlanSummaryStats summaryStats; Explain::getSummaryStats(exec.get(), &summaryStats); curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; curop.debug().scanAndOrder = summaryStats.hasSortStage; curop.debug().nscanned = summaryStats.totalKeysExamined; curop.debug().nscannedObjects = summaryStats.totalDocsExamined; curop.debug().idhack = summaryStats.isIdhack; // Set debug information for consumption by the profiler. if (dbProfilingLevel > 0 || curop.elapsedMillis() > serverGlobalParams.slowMS || logger::globalLogDomain()->shouldLog(queryLogComponent, logLevelOne)) { // Get BSON stats. scoped_ptr<PlanStageStats> execStats(exec->getStats()); BSONObjBuilder statsBob; Explain::statsToBSON(*execStats, &statsBob); curop.debug().execStats.set(statsBob.obj()); // Replace exec stats with plan summary if stats cannot fit into CachedBSONObj. if (curop.debug().execStats.tooBig() && !curop.debug().planSummary.empty()) { BSONObjBuilder bob; bob.append("summary", curop.debug().planSummary.toString()); curop.debug().execStats.set(bob.done()); } } long long ccId = 0; if (saveClientCursor) { // We won't use the executor until it's getMore'd. exec->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(collection, exec.get(), cq->getParsed().getOptions().toInt(), cq->getParsed().getFilter()); ccId = cc->cursorid(); if (fromDBDirectClient) { cc->setUnownedRecoveryUnit(txn->recoveryUnit()); } else if (state == PlanExecutor::IS_EOF && pq.getOptions().tailable) { // Don't stash the RU for tailable cursors at EOF, let them get a new RU on their // next getMore. } else { // We stash away the RecoveryUnit in the ClientCursor. It's used for subsequent // getMore requests. The calling OpCtx gets a fresh RecoveryUnit. cc->setOwnedRecoveryUnit(txn->releaseRecoveryUnit()); StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine(); txn->setRecoveryUnit(storageEngine->newRecoveryUnit(txn)); } QLOG() << "caching executor with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of executor. Release to make sure it's not deleted. exec.release(); // TODO document if (pq.getOptions().oplogReplay && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.getOptions().exhaust) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "Not caching executor but returning " << numResults << " results.\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult::View qr = result.header().view2ptr(); qr.setCursorId(ccId); curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr.setResultFlagsToOk(); qr.msgdata().setOperation(opReply); qr.setStartingFrom(0); qr.setNReturned(numResults); // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
/** * Called by db/instance.cpp. This is the getMore entry point. * * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls * when this method returns an empty result, incrementing pass on each call. * Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'. */ QueryResult::View newGetMore(OperationContext* txn, const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized, bool fromDBDirectClient) { // For testing, we may want to fail if we receive a getmore. if (MONGO_FAIL_POINT(failReceivedGetmore)) { invariant(0); } exhaust = false; // This is a read lock. const NamespaceString nss(ns); scoped_ptr<AutoGetCollectionForRead> ctx(new AutoGetCollectionForRead(txn, nss)); Collection* collection = ctx->getCollection(); uassert( 17356, "collection dropped between getMore calls", collection ); QLOG() << "Running getMore, cursorid: " << cursorid << endl; // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), the only state where // reads are allowed is PRIMARY (or master in master/slave). This function uasserts if // reads are not okay. Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, nss, true); uassertStatusOK(status); // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. ClientCursorPin ccPin(collection, cursorid); ClientCursor* cc = ccPin.c(); // If we're not being called from DBDirectClient we want to associate the RecoveryUnit // used to create the execution machinery inside the cursor with our OperationContext. // If we throw or otherwise exit this method in a disorderly fashion, we must ensure // that further calls to getMore won't fail, and that the provided OperationContext // has a valid RecoveryUnit. As such, we use RAII to accomplish this. // // This must be destroyed before the ClientCursor is destroyed. std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper; // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; const int InitialBufSize = 512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(InitialBufSize); bb.skip(sizeof(QueryResult::Value)); if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Quote: check for spoofing of the ns such that it does not match the one originally // there for the cursor uassert(17011, "auth error", str::equals(ns, cc->ns().c_str())); *isCursorAuthorized = true; // Restore the RecoveryUnit if we need to. if (fromDBDirectClient) { if (cc->hasRecoveryUnit()) invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit()); } else { if (!cc->hasRecoveryUnit()) { // Start using a new RecoveryUnit cc->setOwnedRecoveryUnit( getGlobalEnvironment()->getGlobalStorageEngine()->newRecoveryUnit(txn)); } // Swap RecoveryUnit(s) between the ClientCursor and OperationContext. ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn)); } // Reset timeout timer on the cursor since the cursor is still in use. cc->setIdleTime(0); // TODO: fail point? // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if (0 == pass) { cc->updateSlaveLocation(txn, curop); } if (cc->isAggCursor) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } CollectionMetadataPtr collMetadata = cc->getCollMetadata(); // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); // What gives us results. PlanExecutor* exec = cc->getExecutor(); const int queryOptions = cc->queryOptions(); // Get results out of the executor. exec->restoreState(txn); BSONObj obj; PlanExecutor::ExecState state; while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (queryOptions & QueryOption_OplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } if ((ntoreturn && numResults >= ntoreturn) || bb.len() > MaxBytesToReturnToClientAtOnce) { break; } } // We save the client cursor when there might be more results, and hence we may receive // another getmore. If we receive a EOF or an error, or 'exec' is dead, then we know // that we will not be producing more results. We indicate that the cursor is closed by // sending a cursorId of 0 back to the client. // // On the other hand, if we retrieve all results necessary for this batch, then // 'saveClientCursor' is true and we send a valid cursorId back to the client. In // this case, there may or may not actually be more results (for example, the next call // to getNext(...) might just return EOF). bool saveClientCursor = false; if (PlanExecutor::DEAD == state || PlanExecutor::EXEC_ERROR == state) { // Propagate this error to caller. if (PlanExecutor::EXEC_ERROR == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17406, "getMore executor error: " + WorkingSetCommon::toStatusString(obj)); } // If we're dead there's no way to get more results. saveClientCursor = false; // In the old system tailable capped cursors would be killed off at the // cursorid level. If a tailable capped cursor is nuked the cursorid // would vanish. // // In the new system they die and are cleaned up later (or time out). // So this is where we get to remove the cursorid. if (0 == numResults) { resultFlags = ResultFlag_CursorNotFound; } } else if (PlanExecutor::IS_EOF == state) { // EOF is also end of the line unless it's tailable. saveClientCursor = queryOptions & QueryOption_CursorTailable; } else { verify(PlanExecutor::ADVANCED == state); saveClientCursor = true; } if (!saveClientCursor) { ruSwapper.reset(); ccPin.deleteUnderlying(); // cc is now invalid, as is the executor cursorid = 0; cc = NULL; QLOG() << "getMore NOT saving client cursor, ended with state " << PlanExecutor::statestr(state) << endl; } else { // Continue caching the ClientCursor. cc->incPos(numResults); exec->saveState(); QLOG() << "getMore saving client cursor ended with state " << PlanExecutor::statestr(state) << endl; if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) { if (!fromDBDirectClient) { // Don't stash the RU. Get a new one on the next getMore. ruSwapper.reset(); delete cc->releaseOwnedRecoveryUnit(); } if ((queryOptions & QueryOption_AwaitData) && (numResults == 0) && (pass < 1000)) { // Bubble up to the AwaitData handling code in receivedGetMore which will // try again. return NULL; } } // Possibly note slave's position in the oplog. if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = (queryOptions & QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } } QueryResult::View qr = bb.buf(); qr.msgdata().setLen(bb.len()); qr.msgdata().setOperation(opReply); qr.setResultFlags(resultFlags); qr.setCursorId(cursorid); qr.setStartingFrom(startingResult); qr.setNReturned(numResults); bb.decouple(); QLOG() << "getMore returned " << numResults << " results\n"; return qr; }
virtual bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int options, string& errmsg, BSONObjBuilder& result, bool fromRepl = false ) { NamespaceString ns( dbname, cmdObj[name].String() ); AutoGetCollectionForRead ctx(txn, ns.ns()); Collection* collection = ctx.getCollection(); if ( !collection ) return appendCommandStatus( result, Status( ErrorCodes::NamespaceNotFound, str::stream() << "ns does not exist: " << ns.ns() ) ); size_t numCursors = static_cast<size_t>( cmdObj["numCursors"].numberInt() ); if ( numCursors == 0 || numCursors > 10000 ) return appendCommandStatus( result, Status( ErrorCodes::BadValue, str::stream() << "numCursors has to be between 1 and 10000" << " was: " << numCursors ) ); OwnedPointerVector<RecordIterator> iterators(collection->getManyIterators(txn)); if (iterators.size() < numCursors) { numCursors = iterators.size(); } OwnedPointerVector<PlanExecutor> execs; for ( size_t i = 0; i < numCursors; i++ ) { WorkingSet* ws = new WorkingSet(); MultiIteratorStage* mis = new MultiIteratorStage(txn, ws, collection); PlanExecutor* rawExec; // Takes ownership of 'ws' and 'mis'. Status execStatus = PlanExecutor::make(txn, ws, mis, collection, PlanExecutor::YIELD_AUTO, &rawExec); invariant(execStatus.isOK()); auto_ptr<PlanExecutor> curExec(rawExec); // The PlanExecutor was registered on construction due to the YIELD_AUTO policy. // We have to deregister it, as it will be registered with ClientCursor. curExec->deregisterExec(); // Need to save state while yielding locks between now and getMore(). curExec->saveState(); execs.push_back(curExec.release()); } // transfer iterators to executors using a round-robin distribution. // TODO consider using a common work queue once invalidation issues go away. for (size_t i = 0; i < iterators.size(); i++) { PlanExecutor* theExec = execs[i % execs.size()]; MultiIteratorStage* mis = static_cast<MultiIteratorStage*>(theExec->getRootStage()); // This wasn't called above as they weren't assigned yet iterators[i]->saveState(); mis->addIterator(iterators.releaseAt(i)); } { BSONArrayBuilder bucketsBuilder; for (size_t i = 0; i < execs.size(); i++) { // transfer ownership of an executor to the ClientCursor (which manages its own // lifetime). ClientCursor* cc = new ClientCursor( collection->getCursorManager(), execs.releaseAt(i), ns.ns() ); BSONObjBuilder threadResult; appendCursorResponseObject( cc->cursorid(), ns.ns(), BSONArray(), &threadResult ); threadResult.appendBool( "ok", 1 ); bucketsBuilder.append( threadResult.obj() ); } result.appendArray( "cursors", bucketsBuilder.obj() ); } return true; }
/** * This is called by db/ops/query.cpp. This is the entry point for answering a query. */ std::string newRunQuery(CanonicalQuery* cq, CurOp& curop, Message &result) { QLOG() << "Running query on new system: " << cq->toString(); // This is a read lock. Client::ReadContext ctx(cq->ns(), storageGlobalParams.dbpath); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. const LiteParsedQuery& pq = cq->getParsed(); // Need to call cq->toString() now, since upon error getRunner doesn't guarantee // cq is in a consistent state. string cqStr = cq->toString(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that's is so (a runner) which doesn't return results, the EOFRunner. // // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(cq, &rawRunner); } else { // Takes ownership of cq. size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { uasserted(17007, "Couldn't get runner for query because: " + status.reason() + " query is " + cqStr); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. // If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. ClientCursor::registerRunner(runner.get()); runner->setYieldPolicy(Runner::YIELD_AUTO); auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety( new DeregisterEvenIfUnderlyingCodeThrows(runner.get())); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. :( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases: // * in-memory sort using too much RAM. if (Runner::RUNNER_ERROR == state) { uasserted(17144, "Runner error, memory limit for sort probably exceeded"); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Append explain information to query results by asking the runner to produce them. if (isExplain) { TypeExplain* bareExplain; Status res = runner->getExplainPlan(&bareExplain); if (!res.isOK()) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } else { boost::scoped_ptr<TypeExplain> explain(bareExplain); // Fill in the missing run-time fields in explain, starting with propeties of // the process running the query. std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(curop.elapsedMillis()); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "not caching runner but returning " << numResults << " results\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }