bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { // Counted as a getMore, not as a command. globalOpCounters.gotGetMore(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus(result, Status(ErrorCodes::IllegalOperation, "Cannot run getMore command from eval()")); } StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj); if (!parseStatus.isOK()) { return appendCommandStatus(result, parseStatus.getStatus()); } const GetMoreRequest& request = parseStatus.getValue(); // Depending on the type of cursor being operated on, we hold locks for the whole // getMore, or none of the getMore, or part of the getMore. The three cases in detail: // // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore. // 2) Cursor owned by global cursor manager: we don't lock anything. These cursors // don't own any collection state. // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and // "unpinCollLock". This is because agg cursors handle locking internally (hence the // release), but the pin and unpin of the cursor must occur under the collection // lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because // AutoGetCollectionForRead checks the sharding version (and we want the relock for // the unpin to succeed even if the sharding version has changed). // // Note that we declare our locks before our ClientCursorPin, in order to ensure that // the pin's destructor is called before the lock destructors (so that the unpin occurs // under the lock). std::unique_ptr<AutoGetCollectionForRead> ctx; std::unique_ptr<Lock::DBLock> unpinDBLock; std::unique_ptr<Lock::CollectionLock> unpinCollLock; CursorManager* cursorManager; CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager(); if (globalCursorManager->ownsCursorId(request.cursorid)) { cursorManager = globalCursorManager; } else { ctx.reset(new AutoGetCollectionForRead(txn, request.nss)); Collection* collection = ctx->getCollection(); if (!collection) { return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, "collection dropped between getMore calls")); } cursorManager = collection->getCursorManager(); } ClientCursorPin ccPin(cursorManager, request.cursorid); ClientCursor* cursor = ccPin.c(); if (!cursor) { // We didn't find the cursor. return appendCommandStatus(result, Status(ErrorCodes::CursorNotFound, str::stream() << "Cursor not found, cursor id: " << request.cursorid)); } if (request.nss.ns() != cursor->ns()) { return appendCommandStatus(result, Status(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace '" << request.nss.ns() << "', but cursor belongs to a different namespace")); } const bool hasOwnMaxTime = CurOp::get(txn)->isMaxTimeSet(); // Validation related to awaitData. if (isCursorAwaitData(cursor)) { invariant(isCursorTailable(cursor)); if (!hasOwnMaxTime) { Status status(ErrorCodes::BadValue, str::stream() << "Must set maxTimeMS on a getMore if the initial " << "query had 'awaitData' set: " << cmdObj); return appendCommandStatus(result, status); } if (cursor->isAggCursor()) { Status status(ErrorCodes::BadValue, "awaitData cannot be set on an aggregation cursor"); return appendCommandStatus(result, status); } } // On early return, get rid of the cursor. ScopeGuard cursorFreer = MakeGuard(&GetMoreCmd::cleanupCursor, txn, &ccPin, request); if (!cursor->hasRecoveryUnit()) { // Start using a new RecoveryUnit. cursor->setOwnedRecoveryUnit( getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit()); } // Swap RecoveryUnit(s) between the ClientCursor and OperationContext. ScopedRecoveryUnitSwapper ruSwapper(cursor, txn); // Reset timeout timer on the cursor since the cursor is still in use. cursor->setIdleTime(0); // If there is no time limit set directly on this getMore command, but the operation // that spawned this cursor had a time limit set, then we have to apply any leftover // time to this getMore. if (!hasOwnMaxTime) { CurOp::get(txn)->setMaxTimeMicros(cursor->getLeftoverMaxTimeMicros()); } txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if (cursor->isAggCursor()) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } PlanExecutor* exec = cursor->getExecutor(); exec->restoreState(txn); // If we're tailing a capped collection, retrieve a monotonically increasing insert // counter. uint64_t lastInsertCount = 0; if (isCursorAwaitData(cursor)) { invariant(ctx->getCollection()->isCapped()); lastInsertCount = ctx->getCollection()->getCappedInsertNotifier()->getCount(); } CursorId respondWithId = 0; BSONArrayBuilder nextBatch; BSONObj obj; PlanExecutor::ExecState state; int numResults = 0; Status batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults); if (!batchStatus.isOK()) { return appendCommandStatus(result, batchStatus); } // If this is an await data cursor, and we hit EOF without generating any results, then // we block waiting for new oplog data to arrive. if (isCursorAwaitData(cursor) && state == PlanExecutor::IS_EOF && numResults == 0) { // Retrieve the notifier which we will wait on until new data arrives. We make sure // to do this in the lock because once we drop the lock it is possible for the // collection to become invalid. The notifier itself will outlive the collection if // the collection is dropped, as we keep a shared_ptr to it. auto notifier = ctx->getCollection()->getCappedInsertNotifier(); // Save the PlanExecutor and drop our locks. exec->saveState(); ctx.reset(); // Block waiting for data. Microseconds timeout(CurOp::get(txn)->getRemainingMaxTimeMicros()); notifier->waitForInsert(lastInsertCount, timeout); notifier.reset(); ctx.reset(new AutoGetCollectionForRead(txn, request.nss)); exec->restoreState(txn); // We woke up because either the timed_wait expired, or there was more data. Either // way, attempt to generate another batch of results. batchStatus = generateBatch(cursor, request, &nextBatch, &state, &numResults); if (!batchStatus.isOK()) { return appendCommandStatus(result, batchStatus); } } if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) { respondWithId = request.cursorid; exec->saveState(); // If maxTimeMS was set directly on the getMore rather than being rolled over // from a previous find, then don't roll remaining micros over to the next // getMore. if (!hasOwnMaxTime) { cursor->setLeftoverMaxTimeMicros(CurOp::get(txn)->getRemainingMaxTimeMicros()); } cursor->incPos(numResults); if (isCursorTailable(cursor) && state == PlanExecutor::IS_EOF) { // Rather than swapping their existing RU into the client cursor, tailable // cursors should get a new recovery unit. ruSwapper.dismiss(); } } else { CurOp::get(txn)->debug().cursorExhausted = true; } appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result); if (respondWithId) { cursorFreer.Dismiss(); // If we are operating on an aggregation cursor, then we dropped our collection lock // earlier and need to reacquire it in order to clean up our ClientCursorPin. if (cursor->isAggCursor()) { invariant(NULL == ctx.get()); unpinDBLock.reset( new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS)); unpinCollLock.reset( new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS)); } } return true; }
/** * Called by db/instance.cpp. This is the getMore entry point. * * pass - when QueryOption_AwaitData is in use, the caller will make repeated calls * when this method returns an empty result, incrementing pass on each call. * Thus, pass == 0 indicates this is the first "attempt" before any 'awaiting'. */ QueryResult::View getMore(OperationContext* txn, const char* ns, int ntoreturn, long long cursorid, CurOp& curop, int pass, bool& exhaust, bool* isCursorAuthorized) { // For testing, we may want to fail if we receive a getmore. if (MONGO_FAIL_POINT(failReceivedGetmore)) { invariant(0); } exhaust = false; const NamespaceString nss(ns); // Depending on the type of cursor being operated on, we hold locks for the whole getMore, // or none of the getMore, or part of the getMore. The three cases in detail: // // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore. // 2) Cursor owned by global cursor manager: we don't lock anything. These cursors don't // own any collection state. // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and // "unpinCollLock". This is because agg cursors handle locking internally (hence the // release), but the pin and unpin of the cursor must occur under the collection lock. // We don't use our AutoGetCollectionForRead "ctx" to relock, because // AutoGetCollectionForRead checks the sharding version (and we want the relock for the // unpin to succeed even if the sharding version has changed). // // Note that we declare our locks before our ClientCursorPin, in order to ensure that the // pin's destructor is called before the lock destructors (so that the unpin occurs under // the lock). boost::scoped_ptr<AutoGetCollectionForRead> ctx; boost::scoped_ptr<Lock::DBLock> unpinDBLock; boost::scoped_ptr<Lock::CollectionLock> unpinCollLock; CursorManager* cursorManager; CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager(); if (globalCursorManager->ownsCursorId(cursorid)) { cursorManager = globalCursorManager; } else { ctx.reset(new AutoGetCollectionForRead(txn, nss)); Collection* collection = ctx->getCollection(); uassert( 17356, "collection dropped between getMore calls", collection ); cursorManager = collection->getCursorManager(); } LOG(5) << "Running getMore, cursorid: " << cursorid << endl; // This checks to make sure the operation is allowed on a replicated node. Since we are not // passing in a query object (necessary to check SlaveOK query option), the only state where // reads are allowed is PRIMARY (or master in master/slave). This function uasserts if // reads are not okay. Status status = repl::getGlobalReplicationCoordinator()->checkCanServeReadsFor( txn, nss, true); uassertStatusOK(status); // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it // doesn't time out. Also informs ClientCursor that there is somebody actively holding the // CC, so don't delete it. ClientCursorPin ccPin(cursorManager, cursorid); ClientCursor* cc = ccPin.c(); // If we're not being called from DBDirectClient we want to associate the RecoveryUnit // used to create the execution machinery inside the cursor with our OperationContext. // If we throw or otherwise exit this method in a disorderly fashion, we must ensure // that further calls to getMore won't fail, and that the provided OperationContext // has a valid RecoveryUnit. As such, we use RAII to accomplish this. // // This must be destroyed before the ClientCursor is destroyed. std::auto_ptr<ScopedRecoveryUnitSwapper> ruSwapper; // These are set in the QueryResult msg we return. int resultFlags = ResultFlag_AwaitCapable; int numResults = 0; int startingResult = 0; const int InitialBufSize = 512 + sizeof(QueryResult::Value) + MaxBytesToReturnToClientAtOnce; BufBuilder bb(InitialBufSize); bb.skip(sizeof(QueryResult::Value)); if (NULL == cc) { cursorid = 0; resultFlags = ResultFlag_CursorNotFound; } else { // Check for spoofing of the ns such that it does not match the one originally // there for the cursor. uassert(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace " << ns << ", but cursor " << cursorid << " belongs to namespace " << cc->ns(), ns == cc->ns()); *isCursorAuthorized = true; // Restore the RecoveryUnit if we need to. if (txn->getClient()->isInDirectClient()) { if (cc->hasRecoveryUnit()) invariant(txn->recoveryUnit() == cc->getUnownedRecoveryUnit()); } else { if (!cc->hasRecoveryUnit()) { // Start using a new RecoveryUnit cc->setOwnedRecoveryUnit( getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit()); } // Swap RecoveryUnit(s) between the ClientCursor and OperationContext. ruSwapper.reset(new ScopedRecoveryUnitSwapper(cc, txn)); } // Reset timeout timer on the cursor since the cursor is still in use. cc->setIdleTime(0); // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros()); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if (0 == pass) { cc->updateSlaveLocation(txn); } if (cc->isAggCursor()) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } // If we're replaying the oplog, we save the last time that we read. Timestamp slaveReadTill; // What number result are we starting at? Used to fill out the reply. startingResult = cc->pos(); // What gives us results. PlanExecutor* exec = cc->getExecutor(); const int queryOptions = cc->queryOptions(); // Get results out of the executor. exec->restoreState(txn); BSONObj obj; PlanExecutor::ExecState state; while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. bb.appendBuf((void*)obj.objdata(), obj.objsize()); // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (queryOptions & QueryOption_OplogReplay) { BSONElement e = obj["ts"]; if (Date == e.type() || bsonTimestamp == e.type()) { slaveReadTill = e.timestamp(); } } if (enoughForGetMore(ntoreturn, numResults, bb.len())) { break; } } if (PlanExecutor::DEAD == state || PlanExecutor::FAILURE == state) { // Propagate this error to caller. if (PlanExecutor::FAILURE == state) { scoped_ptr<PlanStageStats> stats(exec->getStats()); error() << "Plan executor error, stats: " << Explain::statsToBSON(*stats); uasserted(17406, "getMore executor error: " + WorkingSetCommon::toStatusString(obj)); } // In the old system tailable capped cursors would be killed off at the // cursorid level. If a tailable capped cursor is nuked the cursorid // would vanish. // // In the new system they die and are cleaned up later (or time out). // So this is where we get to remove the cursorid. if (0 == numResults) { resultFlags = ResultFlag_CursorNotFound; } } const bool shouldSaveCursor = shouldSaveCursorGetMore(state, exec, isCursorTailable(cc)); // In order to deregister a cursor, we need to be holding the DB + collection lock and // if the cursor is aggregation, we release these locks. if (cc->isAggCursor()) { invariant(NULL == ctx.get()); unpinDBLock.reset(new Lock::DBLock(txn->lockState(), nss.db(), MODE_IS)); unpinCollLock.reset(new Lock::CollectionLock(txn->lockState(), nss.ns(), MODE_IS)); } // Our two possible ClientCursorPin cleanup paths are: // 1) If the cursor is not going to be saved, we call deleteUnderlying() on the pin. // 2) If the cursor is going to be saved, we simply let the pin go out of scope. In // this case, the pin's destructor will be invoked, which will call release() on the // pin. Because our ClientCursorPin is declared after our lock is declared, this // will happen under the lock. if (!shouldSaveCursor) { ruSwapper.reset(); ccPin.deleteUnderlying(); // cc is now invalid, as is the executor cursorid = 0; cc = NULL; curop.debug().cursorExhausted = true; LOG(5) << "getMore NOT saving client cursor, ended with state " << PlanExecutor::statestr(state) << endl; } else { // Continue caching the ClientCursor. cc->incPos(numResults); exec->saveState(); LOG(5) << "getMore saving client cursor ended with state " << PlanExecutor::statestr(state) << endl; if (PlanExecutor::IS_EOF == state && (queryOptions & QueryOption_CursorTailable)) { if (!txn->getClient()->isInDirectClient()) { // Don't stash the RU. Get a new one on the next getMore. ruSwapper->dismiss(); } if ((queryOptions & QueryOption_AwaitData) && (numResults == 0) && (pass < 1000)) { // Bubble up to the AwaitData handling code in receivedGetMore which will // try again. return NULL; } } // Possibly note slave's position in the oplog. if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } exhaust = (queryOptions & QueryOption_Exhaust); // If the getmore had a time limit, remaining time is "rolled over" back to the // cursor (for use by future getmore ops). cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() ); } } QueryResult::View qr = bb.buf(); qr.msgdata().setLen(bb.len()); qr.msgdata().setOperation(opReply); qr.setResultFlags(resultFlags); qr.setCursorId(cursorid); qr.setStartingFrom(startingResult); qr.setNReturned(numResults); bb.decouple(); LOG(5) << "getMore returned " << numResults << " results\n"; return qr; }
/** * Generates the next batch of results for a ClientCursor. * * TODO: Do we need to support some equivalent of OP_REPLY responseFlags? * * TODO: Is it possible to support awaitData? */ bool run(OperationContext* txn, const std::string& dbname, BSONObj& cmdObj, int options, std::string& errmsg, BSONObjBuilder& result) override { // Counted as a getMore, not as a command. globalOpCounters.gotGetMore(); if (txn->getClient()->isInDirectClient()) { return appendCommandStatus(result, Status(ErrorCodes::IllegalOperation, "Cannot run getMore command from eval()")); } StatusWith<GetMoreRequest> parseStatus = GetMoreRequest::parseFromBSON(dbname, cmdObj); if (!parseStatus.isOK()) { return appendCommandStatus(result, parseStatus.getStatus()); } const GetMoreRequest& request = parseStatus.getValue(); // Depending on the type of cursor being operated on, we hold locks for the whole // getMore, or none of the getMore, or part of the getMore. The three cases in detail: // // 1) Normal cursor: we lock with "ctx" and hold it for the whole getMore. // 2) Cursor owned by global cursor manager: we don't lock anything. These cursors // don't own any collection state. // 3) Agg cursor: we lock with "ctx", then release, then relock with "unpinDBLock" and // "unpinCollLock". This is because agg cursors handle locking internally (hence the // release), but the pin and unpin of the cursor must occur under the collection // lock. We don't use our AutoGetCollectionForRead "ctx" to relock, because // AutoGetCollectionForRead checks the sharding version (and we want the relock for // the unpin to succeed even if the sharding version has changed). // // Note that we declare our locks before our ClientCursorPin, in order to ensure that // the pin's destructor is called before the lock destructors (so that the unpin occurs // under the lock). std::unique_ptr<AutoGetCollectionForRead> ctx; std::unique_ptr<Lock::DBLock> unpinDBLock; std::unique_ptr<Lock::CollectionLock> unpinCollLock; CursorManager* cursorManager; CursorManager* globalCursorManager = CursorManager::getGlobalCursorManager(); if (globalCursorManager->ownsCursorId(request.cursorid)) { cursorManager = globalCursorManager; } else { ctx.reset(new AutoGetCollectionForRead(txn, request.nss)); Collection* collection = ctx->getCollection(); if (!collection) { return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, "collection dropped between getMore calls")); } cursorManager = collection->getCursorManager(); } ClientCursorPin ccPin(cursorManager, request.cursorid); ClientCursor* cursor = ccPin.c(); if (!cursor) { // We didn't find the cursor. return appendCommandStatus(result, Status(ErrorCodes::CursorNotFound, str::stream() << "Cursor not found, cursor id: " << request.cursorid)); } if (request.nss.ns() != cursor->ns()) { return appendCommandStatus(result, Status(ErrorCodes::Unauthorized, str::stream() << "Requested getMore on namespace '" << request.nss.ns() << "', but cursor belongs to a different namespace")); } // On early return, get rid of the the cursor. ScopeGuard cursorFreer = MakeGuard(&ClientCursorPin::deleteUnderlying, ccPin); if (!cursor->hasRecoveryUnit()) { // Start using a new RecoveryUnit. cursor->setOwnedRecoveryUnit( getGlobalServiceContext()->getGlobalStorageEngine()->newRecoveryUnit()); } // Swap RecoveryUnit(s) between the ClientCursor and OperationContext. ScopedRecoveryUnitSwapper ruSwapper(cursor, txn); // Reset timeout timer on the cursor since the cursor is still in use. cursor->setIdleTime(0); // If the operation that spawned this cursor had a time limit set, apply leftover // time to this getmore. txn->getCurOp()->setMaxTimeMicros(cursor->getLeftoverMaxTimeMicros()); txn->checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. if (cursor->isAggCursor()) { // Agg cursors handle their own locking internally. ctx.reset(); // unlocks } PlanExecutor* exec = cursor->getExecutor(); exec->restoreState(txn); // TODO: Handle result sets larger than 16MB. BSONArrayBuilder nextBatch; BSONObj obj; PlanExecutor::ExecState state; int numResults = 0; while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) { // Add result to output buffer. nextBatch.append(obj); numResults++; if (enoughForGetMore(request.batchSize, numResults, nextBatch.len())) { break; } } // If we are operating on an aggregation cursor, then we dropped our collection lock // earlier and need to reacquire it in order to clean up our ClientCursorPin. // // TODO: We need to ensure that this relock happens if we release the pin above in // response to PlanExecutor::getNext() throwing an exception. if (cursor->isAggCursor()) { invariant(NULL == ctx.get()); unpinDBLock.reset(new Lock::DBLock(txn->lockState(), request.nss.db(), MODE_IS)); unpinCollLock.reset( new Lock::CollectionLock(txn->lockState(), request.nss.ns(), MODE_IS)); } // Fail the command if the PlanExecutor reports execution failure. if (PlanExecutor::FAILURE == state) { const std::unique_ptr<PlanStageStats> stats(exec->getStats()); error() << "GetMore executor error, stats: " << Explain::statsToBSON(*stats); return appendCommandStatus(result, Status(ErrorCodes::OperationFailed, str::stream() << "GetMore executor error: " << WorkingSetCommon::toStatusString(obj))); } CursorId respondWithId = 0; if (shouldSaveCursorGetMore(state, exec, isCursorTailable(cursor))) { respondWithId = request.cursorid; exec->saveState(); cursor->setLeftoverMaxTimeMicros(txn->getCurOp()->getRemainingMaxTimeMicros()); cursor->incPos(numResults); if (isCursorTailable(cursor) && state == PlanExecutor::IS_EOF) { // Rather than swapping their existing RU into the client cursor, tailable // cursors should get a new recovery unit. ruSwapper.dismiss(); } } else { txn->getCurOp()->debug().cursorExhausted = true; } appendGetMoreResponseObject(respondWithId, request.nss.ns(), nextBatch.arr(), &result); if (respondWithId) { cursorFreer.Dismiss(); } return true; }