/**
 * Also called by db/ops/query.cpp.  This is the new getMore entry point.
 *
 * @param ns                 namespace the client sent with the getMore.  It must equal the
 *                           namespace stored on the cursor, otherwise uassert 17011 fires.
 * @param ntoreturn          maximum number of documents to put in this batch.  0 means no count
 *                           limit; the MaxBytesToReturnToClientAtOnce byte limit still applies.
 * @param cursorid           id of the cursor to continue.
 * @param curop              current operation.  Receives the cursor's leftover time limit, and
 *                           its remaining time is rolled back onto the cursor when it is kept.
 * @param pass               when QueryOption_AwaitData is in use, the caller will make repeated
 *                           calls when this method returns an empty result, incrementing pass on
 *                           each call.  Thus, pass == 0 indicates this is the first "attempt"
 *                           before any 'awaiting'.
 * @param exhaust            out: reset to false on entry; set from QueryOption_Exhaust only when
 *                           the cursor is kept for further getMores.
 * @param isCursorAuthorized out: set to true once the namespace check passed.  Dereferenced
 *                           without a NULL check whenever the cursor is found.
 *
 * @return the reply message, detached from its builder with decouple(), or 0 when a tailable
 *         AwaitData cursor hit EOF without producing anything and pass is still below 1000
 *         (the caller is expected to retry).
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;

    // This is a read lock.
    scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns));
    Collection* collection = ctx->ctx().db()->getCollection(ns);
    uassert( 17356, "collection dropped between getMore calls", collection );

    QLOG() << "Running getMore, cursorid: " << cursorid << endl;

    // This checks to make sure the operation is allowed on a replicated node.  Since we are not
    // passing in a query object (necessary to check SlaveOK query option), the only state where
    // reads are allowed is PRIMARY (or master in master/slave).  This function uasserts if
    // reads are not okay.
    replVerifyReadsOk();

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding the
    // CC, so don't delete it.
    ClientCursorPin ccPin(collection, cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;
    int numResults = 0;
    int startingResult = 0;

    const int InitialBufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;
    BufBuilder bb(InitialBufSize);
    bb.skip(sizeof(QueryResult));

    if (NULL == cc) {
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Quote: check for spoofing of the ns such that it does not match the one originally
        // there for the cursor
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // Reset timeout timer on the cursor since the cursor is still in use.
        cc->setIdleTime(0);

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        if (0 == pass) {
            cc->updateSlaveLocation(curop);
        }

        if (cc->isAggCursor) {
            // Agg cursors handle their own locking internally.
            ctx.reset(); // unlocks
        }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        Runner* runner = cc->getRunner();
        const int queryOptions = cc->queryOptions();

        // Get results out of the runner.
        runner->restoreState();

        BSONObj obj;
        Runner::RunnerState state;
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // Stop once the requested count (0 == unlimited) or the byte budget is reached.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        if (Runner::RUNNER_EOF == state && 0 == numResults
            && (queryOptions & QueryOption_CursorTailable)
            && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
            // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
            // data some # of times first.
            //
            // The cursor stays cached and will be restored again on the next pass, so pair the
            // restoreState() above with a saveState() here, exactly as the regular
            // "keep the cursor" path below does before returning.
            runner->saveState();
            return 0;
        }

        bool saveClientCursor = false;

        if (Runner::RUNNER_DEAD == state || Runner::RUNNER_ERROR == state) {
            // Propagate this error to caller.
            if (Runner::RUNNER_ERROR == state) {
                // Stats are helpful when errors occur.
                TypeExplain* bareExplain;
                Status res = runner->getInfo(&bareExplain, NULL);
                if (res.isOK()) {
                    boost::scoped_ptr<TypeExplain> errorExplain(bareExplain);
                    error() << "Runner error, stats:\n"
                            << errorExplain->stats.jsonString(Strict, true);
                }

                uasserted(17406, "getMore runner error: " +
                          WorkingSetCommon::toStatusString(obj));
            }

            // If we're dead there's no way to get more results.
            saveClientCursor = false;

            // In the old system tailable capped cursors would be killed off at the
            // cursorid level.  If a tailable capped cursor is nuked the cursorid
            // would vanish.
            //
            // In the new system they die and are cleaned up later (or time out).
            // So this is where we get to remove the cursorid.
            if (0 == numResults) {
                resultFlags = ResultFlag_CursorNotFound;
            }
        }
        else if (Runner::RUNNER_EOF == state) {
            // EOF is also end of the line unless it's tailable.
            saveClientCursor = queryOptions & QueryOption_CursorTailable;
        }
        else {
            verify(Runner::RUNNER_ADVANCED == state);
            saveClientCursor = true;
        }

        if (!saveClientCursor) {
            ccPin.deleteUnderlying();
            // cc is now invalid, as is the runner
            cursorid = 0;
            cc = NULL;
            QLOG() << "getMore NOT saving client cursor, ended with state "
                   << Runner::statestr(state)
                   << endl;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            runner->saveState();
            QLOG() << "getMore saving client cursor ended with state "
                   << Runner::statestr(state)
                   << endl;

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the reply header that sits in the space skipped at the front of the buffer.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    bb.decouple();
    QLOG() << "getMore returned " << numResults << " results\n";
    return qr;
}
std::string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { // Validate the namespace. const char *ns = q.ns; uassert(16332, "can't have an empty ns", ns[0]); const NamespaceString nsString(ns); uassert(16256, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid()); // Set curop information. curop.debug().ns = ns; curop.debug().ntoreturn = q.ntoreturn; curop.debug().query = q.query; curop.setQuery(q.query); // If the query is really a command, run it. if (nsString.isCommand()) { int nToReturn = q.ntoreturn; uassert(16979, str::stream() << "bad numberToReturn (" << nToReturn << ") for $cmd type ns - can only be 1 or -1", nToReturn == 1 || nToReturn == -1); curop.markCommand(); BufBuilder bb; bb.skip(sizeof(QueryResult)); BSONObjBuilder cmdResBuf; if (!runCommands(ns, q.query, curop, bb, cmdResBuf, false, q.queryOptions)) { uasserted(13530, "bad or malformed command request?"); } curop.debug().iscommand = true; // TODO: Does this get overwritten/do we really need to set this twice? curop.debug().query = q.query; QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf()); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; result.setData(qr, true); return ""; } // This is a read lock. We require this because if we're parsing a $where, the // where-specific parsing code assumes we have a lock and creates execution machinery that // requires it. Client::ReadContext ctx(q.ns); // Parse the qm into a CanonicalQuery. CanonicalQuery* cq; Status canonStatus = CanonicalQuery::canonicalize(q, &cq); if (!canonStatus.isOK()) { uasserted(17287, str::stream() << "Can't canonicalize query: " << canonStatus.toString()); } verify(cq); QLOG() << "Running query on new system: " << cq->toString(); // Parse, canonicalize, plan, transcribe, and get a runner. Runner* rawRunner = NULL; // We use this a lot below. 
const LiteParsedQuery& pq = cq->getParsed(); // We'll now try to get the query runner that will execute this query for us. There // are a few cases in which we know upfront which runner we should get and, therefore, // we shortcut the selection process here. // // (a) If the query is over a collection that doesn't exist, we get a special runner // that's is so (a runner) which doesn't return results, the EOFRunner. // // (b) if the query is a replication's initial sync one, we get a SingleSolutinRunner // that uses a specifically designed stage that skips extents faster (see details in // exec/oplogstart.h) // // Otherwise we go through the selection of which runner is most suited to the // query + run-time context at hand. Status status = Status::OK(); if (ctx.ctx().db()->getCollection(cq->ns()) == NULL) { rawRunner = new EOFRunner(cq, cq->ns()); } else if (pq.hasOption(QueryOption_OplogReplay)) { status = getOplogStartHack(cq, &rawRunner); } else { // Takes ownership of cq. size_t options = QueryPlannerParams::DEFAULT; if (shardingState.needCollectionMetadata(pq.ns())) { options |= QueryPlannerParams::INCLUDE_SHARD_FILTER; } status = getRunner(cq, &rawRunner, options); } if (!status.isOK()) { // NOTE: Do not access cq as getRunner has deleted it. uasserted(17007, "Unable to execute query: " + status.reason()); } verify(NULL != rawRunner); auto_ptr<Runner> runner(rawRunner); // We freak out later if this changes before we're done with the query. const ChunkVersion shardingVersionAtStart = shardingState.getVersion(cq->ns()); // Handle query option $maxTimeMS (not used with commands). curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000); killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point. // uassert if we are not on a primary, and not a secondary with SlaveOk query parameter set. replVerifyReadsOk(&pq); // If this exists, the collection is sharded. // If it doesn't exist, we can assume we're not sharded. 
// If we're sharded, we might encounter data that is not consistent with our sharding state. // We must ignore this data. CollectionMetadataPtr collMetadata; if (!shardingState.needCollectionMetadata(pq.ns())) { collMetadata = CollectionMetadataPtr(); } else { collMetadata = shardingState.getCollectionMetadata(pq.ns()); } // Run the query. // bb is used to hold query results // this buffer should contain either requested documents per query or // explain information, but not both BufBuilder bb(32768); bb.skip(sizeof(QueryResult)); // How many results have we obtained from the runner? int numResults = 0; // If we're replaying the oplog, we save the last time that we read. OpTime slaveReadTill; // Do we save the Runner in a ClientCursor for getMore calls later? bool saveClientCursor = false; // We turn on auto-yielding for the runner here. The runner registers itself with the // active runners list in ClientCursor. auto_ptr<ScopedRunnerRegistration> safety(new ScopedRunnerRegistration(runner.get())); runner->setYieldPolicy(Runner::YIELD_AUTO); BSONObj obj; Runner::RunnerState state; // uint64_t numMisplacedDocs = 0; // set this outside loop. we will need to use this both within loop and when deciding // to fill in explain information const bool isExplain = pq.isExplain(); while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) { // Add result to output buffer. This is unnecessary if explain info is requested if (!isExplain) { bb.appendBuf((void*)obj.objdata(), obj.objsize()); } // Count the result. ++numResults; // Possibly note slave's position in the oplog. if (pq.hasOption(QueryOption_OplogReplay)) { BSONElement e = obj["ts"]; if (Date == e.type() || Timestamp == e.type()) { slaveReadTill = e._opTime(); } } // TODO: only one type of 2d search doesn't support this. We need a way to pull it out // of CanonicalQuery. 
:( const bool supportsGetMore = true; if (isExplain) { if (enoughForExplain(pq, numResults)) { break; } } else if (!supportsGetMore && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) { break; } else if (enoughForFirstBatch(pq, numResults, bb.len())) { QLOG() << "Enough for first batch, wantMore=" << pq.wantMore() << " numToReturn=" << pq.getNumToReturn() << " numResults=" << numResults << endl; // If only one result requested assume it's a findOne() and don't save the cursor. if (pq.wantMore() && 1 != pq.getNumToReturn()) { QLOG() << " runner EOF=" << runner->isEOF() << endl; saveClientCursor = !runner->isEOF(); } break; } } // If we cache the runner later, we want to deregister it as it receives notifications // anyway by virtue of being cached. // // If we don't cache the runner later, we are deleting it, so it must be deregistered. // // So, no matter what, deregister the runner. safety.reset(); // Caller expects exceptions thrown in certain cases: // * in-memory sort using too much RAM. if (Runner::RUNNER_ERROR == state) { uasserted(17144, "Runner error, memory limit for sort probably exceeded"); } // Why save a dead runner? if (Runner::RUNNER_DEAD == state) { saveClientCursor = false; } else if (pq.hasOption(QueryOption_CursorTailable)) { // If we're tailing a capped collection, we don't bother saving the cursor if the // collection is empty. Otherwise, the semantics of the tailable cursor is that the // client will keep trying to read from it. So we'll keep it around. Collection* collection = ctx.ctx().db()->getCollection(cq->ns()); if (collection && collection->numRecords() != 0 && pq.getNumToReturn() != 1) { saveClientCursor = true; } } // TODO(greg): This will go away soon. 
if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) { // if the version changed during the query we might be missing some data and its safe to // send this as mongos can resend at this point throw SendStaleConfigException(pq.ns(), "version changed during initial query", shardingVersionAtStart, shardingState.getVersion(pq.ns())); } // Append explain information to query results by asking the runner to produce them. if (isExplain) { TypeExplain* bareExplain; Status res = runner->getExplainPlan(&bareExplain); if (!res.isOK()) { error() << "could not produce explain of query '" << pq.getFilter() << "', error: " << res.reason(); // If numResults and the data in bb don't correspond, we'll crash later when rooting // through the reply msg. BSONObj emptyObj; bb.appendBuf((void*)emptyObj.objdata(), emptyObj.objsize()); // The explain output is actually a result. numResults = 1; // TODO: we can fill out millis etc. here just fine even if the plan screwed up. } else { boost::scoped_ptr<TypeExplain> explain(bareExplain); // Fill in the missing run-time fields in explain, starting with propeties of // the process running the query. std::string server = mongoutils::str::stream() << getHostNameCached() << ":" << serverGlobalParams.port; explain->setServer(server); // We might have skipped some results due to chunk migration etc. so our count is // correct. explain->setN(numResults); // Clock the whole operation. explain->setMillis(curop.elapsedMillis()); BSONObj explainObj = explain->toBSON(); bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize()); // The explain output is actually a result. numResults = 1; } } long long ccId = 0; if (saveClientCursor) { // We won't use the runner until it's getMore'd. runner->saveState(); // Allocate a new ClientCursor. We don't have to worry about leaking it as it's // inserted into a global map by its ctor. 
ClientCursor* cc = new ClientCursor(runner.get(), cq->getParsed().getOptions(), cq->getParsed().getFilter()); ccId = cc->cursorid(); QLOG() << "caching runner with cursorid " << ccId << " after returning " << numResults << " results" << endl; // ClientCursor takes ownership of runner. Release to make sure it's not deleted. runner.release(); // TODO document if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) { cc->slaveReadTill(slaveReadTill); } // TODO document if (pq.hasOption(QueryOption_Exhaust)) { curop.debug().exhaust = true; } // Set attributes for getMore. cc->setCollMetadata(collMetadata); cc->setPos(numResults); // If the query had a time limit, remaining time is "rolled over" to the cursor (for // use by future getmore ops). cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros()); } else { QLOG() << "not caching runner but returning " << numResults << " results\n"; } // Add the results from the query into the output buffer. result.appendData(bb.buf(), bb.len()); bb.decouple(); // Fill out the output buffer's header. QueryResult* qr = static_cast<QueryResult*>(result.header()); qr->cursorId = ccId; curop.debug().cursorid = (0 == ccId ? -1 : ccId); qr->setResultFlagsToOk(); qr->setOperation(opReply); qr->startingFrom = 0; qr->nReturned = numResults; curop.debug().ntoskip = pq.getSkip(); curop.debug().nreturned = numResults; // curop.debug().exhaust is set above. return curop.debug().exhaust ? pq.ns() : ""; }
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); curop.debug().ns = ns; curop.debug().ntoreturn = ntoreturn; curop.debug().cursorid = cursorid; shared_ptr<AssertionException> ex; scoped_ptr<Timer> timer; int pass = 0; bool exhaust = false; QueryResult* msgdata = 0; OpTime last; while( 1 ) { try { const NamespaceString nsString( ns ); uassert( 16258, str::stream() << "Invalid ns [" << ns << "]", nsString.isValid() ); if (str::startsWith(ns, "local.oplog.")){ if (pass == 0) { mutex::scoped_lock lk(OpTime::m); last = OpTime::getLast(lk); } else { last.waitForDifferent(1000/*ms*/); } } Client::ReadContext ctx(ns); // call this readlocked so state can't change replVerifyReadsOk(); msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust); } catch ( AssertionException& e ) { ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; break; } if (msgdata == 0) { // this should only happen with QueryOption_AwaitData exhaust = false; massert(13073, "shutting down", !inShutdown() ); if ( ! timer ) { timer.reset( new Timer() ); } else { if ( timer->seconds() >= 4 ) { // after about 4 seconds, return. pass stops at 1000 normally. // we want to return occasionally so slave can checkpoint. 
pass = 10000; } } pass++; if (debug) sleepmillis(20); else sleepmillis(2); // note: the 1100 is beacuse of the waitForDifferent above // should eventually clean this up a bit curop.setExpectedLatencyMs( 1100 + timer->millis() ); continue; } break; }; if (ex) { exhaust = false; BSONObjBuilder err; ex->getInfo().append( err ); BSONObj errObj = err.done(); log() << errObj << endl; curop.debug().exceptionInfo = ex->getInfo(); if (ex->getCode() == 13436) { replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj); curop.debug().responseLength = dbresponse.response->header()->dataLen(); curop.debug().nreturned = 1; return ok; } msgdata = emptyMoreResult(cursorid); } Message *resp = new Message(); resp->setData(msgdata, true); curop.debug().responseLength = resp->header()->dataLen(); curop.debug().nreturned = msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.header()->id; if( exhaust ) { curop.debug().exhaust = true; dbresponse.exhaustNS = ns; } return ok; }
/**
 * Also called by db/ops/query.cpp.  This is the new getMore entry point.
 *
 * @param ns                 namespace the client sent with the getMore.  It must equal the
 *                           namespace stored on the cursor, otherwise uassert 17011 fires.
 * @param ntoreturn          maximum number of documents to put in this batch.  0 means no count
 *                           limit; the MaxBytesToReturnToClientAtOnce byte limit still applies.
 * @param cursorid           id of the cursor to continue.
 * @param curop              current operation.  Receives the cursor's leftover time limit, and
 *                           its remaining time is rolled back onto the cursor afterwards.
 * @param pass               retry counter supplied by the caller; the slave location is only
 *                           updated when it is 0.
 * @param exhaust            out: reset to false on entry; set from QueryOption_Exhaust when the
 *                           cursor was found.
 * @param isCursorAuthorized out: set to true once the namespace check passed.  Dereferenced
 *                           without a NULL check whenever the cursor is found.
 *
 * @return the reply message, detached from its builder with decouple().
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;
    int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

    BufBuilder bb(bufSize);
    bb.skip(sizeof(QueryResult));

    // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
    Client::ReadContext ctx(ns);

    // TODO: Document.
    replVerifyReadsOk();

    ClientCursorPin ccPin(cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;
    int numResults = 0;
    int startingResult = 0;

    if (NULL == cc) {
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Quote: check for spoofing of the ns such that it does not match the one originally
        // there for the cursor
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());

        // TODO:
        // curop.debug().query = BSONForQuery
        // curop.setQuery(curop.debug().query);

        // TODO: What is pass?
        if (0 == pass) {
            cc->updateSlaveLocation(curop);
        }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        startingResult = cc->pos();

        Runner* runner = cc->getRunner();
        const ParsedQuery& pq = runner->getQuery().getParsed();

        // Get results out of the runner.
        // TODO: There may be special handling required for tailable cursors?
        runner->restoreState();
        BSONObj obj;
        // TODO: Differentiate EOF from error.
        while (runner->getNext(&obj)) {
            // If we're sharded make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                KeyPattern kp(collMetadata->getKeyPattern());
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) {
                    continue;
                }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // Stop once the requested count or the byte budget is reached.  The count limit
            // only applies when ntoreturn is non-zero: guarding on numResults instead (as this
            // check used to) made an ntoreturn of 0 end every batch after a single document.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        cc->incPos(numResults);
        runner->saveState();

        // Possibly note slave's position in the oplog.
        if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
            cc->slaveReadTill(slaveReadTill);
        }

        exhaust = pq.hasOption(QueryOption_Exhaust);

        // If the getmore had a time limit, remaining time is "rolled over" back to the
        // cursor (for use by future getmore ops).
        cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
    }

    // Fill in the reply header that sits in the space skipped at the front of the buffer.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    bb.decouple();
    return qr;
}
/**
 * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
 *
 * Obtains a Runner for 'q', pulls the first batch of results from it into 'result', and, if
 * more results may be wanted, caches the runner in a ClientCursor for later getMore calls.
 *
 * @param m       the incoming message (not read by this function).
 * @param q       the parsed query message.
 * @param curop   current operation; debug info is recorded on it.
 * @param result  out: receives the reply data and header.
 *
 * @return the namespace when curop.debug().exhaust is set, otherwise the empty string.
 */
string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
    // This is a read lock.
    Client::ReadContext ctx(q.ns, dbpath);

    // Parse, canonicalize, plan, transcribe, and get a runner.
    // Initialised so the verify() below checks a defined value even if getRunner() reports
    // success without writing to it.
    Runner* rawRunner = NULL;
    Status status = getRunner(q, &rawRunner);
    if (!status.isOK()) {
        uasserted(17007, "Couldn't process query " + q.query.toString()
                         + " why: " + status.reason());
    }
    verify(NULL != rawRunner);
    auto_ptr<Runner> runner(rawRunner);

    // We freak out later if this changes before we're done with the query.
    const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns);

    // We use this a lot below.
    const ParsedQuery& pq = runner->getQuery().getParsed();

    // TODO: Document why we do this.
    replVerifyReadsOk(&pq);

    // If this exists, the collection is sharded.
    // If it doesn't exist, we can assume we're not sharded.
    // If we're sharded, we might encounter data that is not consistent with our sharding state.
    // We must ignore this data.
    CollectionMetadataPtr collMetadata;
    if (!shardingState.needCollectionMetadata(pq.ns())) {
        collMetadata = CollectionMetadataPtr();
    }
    else {
        collMetadata = shardingState.getCollectionMetadata(pq.ns());
    }

    // Run the query.
    BufBuilder bb(32768);
    bb.skip(sizeof(QueryResult));

    // How many results have we obtained from the runner?
    int numResults = 0;

    // If we're replaying the oplog, we save the last time that we read.
    OpTime slaveReadTill;

    // Do we save the Runner in a ClientCursor for getMore calls later?
    bool saveClientCursor = false;

    // Loop-invariant, so computed once instead of on every result.
    const bool isExplain = pq.isExplain();

    BSONObj obj;
    // TODO: Differentiate EOF from error.
    while (runner->getNext(&obj)) {
        // If we're sharded make sure that we don't return any data that hasn't been migrated
        // off of our shard yet.
        if (collMetadata) {
            // This information can change if we yield and as such we must make sure to re-fetch
            // it if we yield.
            KeyPattern kp(collMetadata->getKeyPattern());
            // This performs excessive BSONObj creation but that's OK for now.
            if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) {
                continue;
            }
        }

        // Add result to output buffer.
        bb.appendBuf((void*)obj.objdata(), obj.objsize());

        // Count the result.
        ++numResults;

        // Possibly note slave's position in the oplog.
        if (pq.hasOption(QueryOption_OplogReplay)) {
            BSONElement e = obj["ts"];
            if (Date == e.type() || Timestamp == e.type()) {
                slaveReadTill = e._opTime();
            }
        }

        // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
        // of CanonicalQuery. :(
        const bool supportsGetMore = true;
        if (isExplain && pq.enoughForExplain(numResults)) {
            break;
        }
        else if (!supportsGetMore && (pq.enough(numResults)
                                      || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
            break;
        }
        else if (pq.enoughForFirstBatch(numResults, bb.len())) {
            // If only one result requested assume it's a findOne() and don't save the cursor.
            if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                saveClientCursor = true;
            }
            break;
        }
    }

    // TODO: Stage creation can set tailable depending on what's in the parsed query.  We have
    // the full parsed query available during planning...set it there.
    //
    // TODO: If we're tailable we want to save the client cursor.  Make sure we do this later.
    //if (pq.hasOption(QueryOption_CursorTailable) && pq.getNumToReturn() != 1) { ... }

    // TODO(greg): This will go away soon.
    if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
        // if the version changed during the query we might be missing some data and its safe to
        // send this as mongos can resend at this point
        throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                       shardingVersionAtStart,
                                       shardingState.getVersion(pq.ns()));
    }

    long long ccId = 0;
    if (saveClientCursor) {
        // We won't use the runner until it's getMore'd.  Save its state while the auto_ptr is
        // still the only owner, so a throw from saveState() cannot leave the runner owned
        // twice.
        runner->saveState();

        // Allocate a new ClientCursor.
        ClientCursorHolder ccHolder;
        ccHolder.reset(new ClientCursor(runner.get()));

        // ClientCursor takes ownership of runner.  Release to make sure it's not deleted.
        // Done immediately after construction to keep the shared-ownership window minimal.
        runner.release();

        ccId = ccHolder->cursorid();

        if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
            ccHolder->slaveReadTill(slaveReadTill);
        }

        if (pq.hasOption(QueryOption_Exhaust)) {
            curop.debug().exhaust = true;
        }

        // Set attributes for getMore.
        ccHolder->setCollMetadata(collMetadata);
        ccHolder->setPos(numResults);

        // If the query had a time limit, remaining time is "rolled over" to the cursor (for
        // use by future getmore ops).
        ccHolder->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());

        // Give up our reference to the CC.
        ccHolder.release();
    }

    // Add the results from the query into the output buffer.
    result.appendData(bb.buf(), bb.len());
    bb.decouple();

    // Fill out the output buffer's header.
    QueryResult* qr = static_cast<QueryResult*>(result.header());
    qr->cursorId = ccId;
    curop.debug().cursorid = (0 == ccId ? -1 : ccId);
    qr->setResultFlagsToOk();
    qr->setOperation(opReply);
    qr->startingFrom = 0;
    qr->nReturned = numResults;

    // TODO: nscanned is bogus.
    // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
    curop.debug().ntoskip = pq.getSkip();
    curop.debug().nreturned = numResults;

    // curop.debug().exhaust is set above.
    return curop.debug().exhaust ? pq.ns() : "";
}
/** * Run a query -- includes checking for and running a Command. * @return points to ns if exhaust mode. 0=normal mode * @locks the db mutex for reading (and potentially for writing temporarily to create a new db). * @yields the db mutex periodically after acquiring it. * @asserts on scan and order memory exhaustion and other cases. */ const char *runQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) { shared_ptr<ParsedQuery> pq_shared( new ParsedQuery(q) ); ParsedQuery& pq( *pq_shared ); BSONObj jsobj = q.query; int queryOptions = q.queryOptions; const char *ns = q.ns; if( logLevel >= 2 ) log() << "runQuery called " << ns << " " << jsobj << endl; curop.debug().ns = ns; curop.debug().ntoreturn = pq.getNumToReturn(); curop.debug().query = jsobj; curop.setQuery(jsobj); // Run a command. if ( pq.couldBeCommand() ) { BufBuilder bb; bb.skip(sizeof(QueryResult)); BSONObjBuilder cmdResBuf; if ( runCommands(ns, jsobj, curop, bb, cmdResBuf, false, queryOptions) ) { curop.debug().iscommand = true; curop.debug().query = jsobj; curop.markCommand(); auto_ptr< QueryResult > qr; qr.reset( (QueryResult *) bb.buf() ); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = 1; result.setData( qr.release(), true ); } else { uasserted(13530, "bad or malformed command request?"); } return 0; } bool explain = pq.isExplain(); BSONObj order = pq.getOrder(); BSONObj query = pq.getFilter(); /* The ElemIter will not be happy if this isn't really an object. So throw exception here when that is true. (Which may indicate bad data from client.) 
*/ if ( query.objsize() == 0 ) { out() << "Bad query object?\n jsobj:"; out() << jsobj.toString() << "\n query:"; out() << query.toString() << endl; uassert( 10110 , "bad query object", false); } Client::ReadContext ctx( ns , dbpath ); // read locks const ConfigVersion shardingVersionAtStart = shardingState.getVersion( ns ); replVerifyReadsOk(&pq); if ( pq.hasOption( QueryOption_CursorTailable ) ) { NamespaceDetails *d = nsdetails( ns ); uassert( 13051, "tailable cursor requested on non capped collection", d && d->isCapped() ); const BSONObj nat1 = BSON( "$natural" << 1 ); if ( order.isEmpty() ) { order = nat1; } else { uassert( 13052, "only {$natural:1} order allowed for tailable cursor", order == nat1 ); } } // Run a simple id query. if ( ! (explain || pq.showDiskLoc()) && isSimpleIdQuery( query ) && !pq.hasOption( QueryOption_CursorTailable ) ) { int n = 0; bool nsFound = false; bool indexFound = false; BSONObj resObject; Client& c = cc(); bool found = Helpers::findById( c, ns , query , resObject , &nsFound , &indexFound ); if ( nsFound == false || indexFound == true ) { if ( shardingState.needShardChunkManager( ns ) ) { ShardChunkManagerPtr m = shardingState.getShardChunkManager( ns ); if ( m && ! m->belongsToMe( resObject ) ) { // I have something this _id // but it doesn't belong to me // so return nothing resObject = BSONObj(); found = false; } } BufBuilder bb(sizeof(QueryResult)+resObject.objsize()+32); bb.skip(sizeof(QueryResult)); curop.debug().idhack = true; if ( found ) { n = 1; fillQueryResultFromObj( bb , pq.getFields() , resObject ); } auto_ptr< QueryResult > qr; qr.reset( (QueryResult *) bb.buf() ); bb.decouple(); qr->setResultFlagsToOk(); qr->len = bb.len(); curop.debug().responseLength = bb.len(); qr->setOperation(opReply); qr->cursorId = 0; qr->startingFrom = 0; qr->nReturned = n; result.setData( qr.release(), true ); return NULL; } } // Run a regular query. BSONObj oldPlan; if ( explain && ! 
pq.hasIndexSpecifier() ) { MultiPlanScanner mps( ns, query, order ); if ( mps.usingCachedPlan() ) { oldPlan = mps.oldExplain().firstElement().embeddedObject() .firstElement().embeddedObject().getOwned(); } } // In some cases the query may be retried if there is an in memory sort size assertion. for( int retry = 0; retry < 2; ++retry ) { try { return queryWithQueryOptimizer( m, queryOptions, ns, jsobj, curop, query, order, pq_shared, oldPlan, shardingVersionAtStart, result ); } catch ( const QueryRetryException & ) { verify( retry == 0 ); } } verify( false ); return 0; }
/**
 * Entry point for answering a query with the new query framework; called by
 * db/ops/query.cpp.
 *
 * Runs the query in 'q' under a read lock, appends the first batch of results (or, for an
 * explain, a single summary document) to 'result' and fills out the QueryResult header.
 * If more results may be requested later, the runner is handed to a new ClientCursor whose
 * id is returned to the client.  'm' is not read here.
 *
 * Returns the query's namespace if curop.debug().exhaust is set, "" otherwise.
 */
string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
    // Everything below runs under a read lock.
    Client::ReadContext ctx(q.ns, dbpath);

    // getRunner() does the parse / canonicalize / plan work and hands back a runner.
    Runner* rawRunner;
    CanonicalQuery* cq;
    Status runnerStatus = getRunner(q, &rawRunner, &cq);
    if (!runnerStatus.isOK()) {
        uasserted(17007,
                  "Couldn't process query " + q.query.toString() + " why: "
                      + runnerStatus.reason());
    }
    verify(NULL != rawRunner);
    auto_ptr<Runner> runner(rawRunner);

    log() << "Running query on new system: " << cq->toString();

    // Remembered so we can detect a sharding version change once the query has run.
    const ChunkVersion versionAtStart = shardingState.getVersion(q.ns);

    // Shorthand for the parsed form of the query; used throughout.
    const LiteParsedQuery& pq = cq->getParsed();

    // TODO: Remove when impl'd
    if (pq.hasOption(QueryOption_OplogReplay)) {
        warning() << "haven't implemented findingstartcursor yet\n";
    }

    // Apply the $maxTimeMS query option (milliseconds -> microseconds).  Not used with
    // commands.
    curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
    killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

    // uasserts unless we are a primary, or a secondary queried with SlaveOk.
    replVerifyReadsOk(&pq);

    // Stays null for an unsharded collection.  When set, it is used below to drop
    // documents that are not consistent with our sharding state.
    CollectionMetadataPtr collMetadata;
    if (shardingState.needCollectionMetadata(pq.ns())) {
        collMetadata = shardingState.getCollectionMetadata(pq.ns());
    }

    // Reply buffer.  It ends up holding either the requested documents or the explain
    // information, never both.  Space for the header is reserved up front.
    BufBuilder bb(32768);
    bb.skip(sizeof(QueryResult));

    // Number of results obtained from the runner so far.
    int numResults = 0;

    // Last "ts" value seen; only maintained for oplog replay queries.
    OpTime slaveReadTill;

    // Whether the runner is kept in a ClientCursor for later getMore calls.
    bool saveClientCursor = false;

    // Turn on auto-yielding.  The runner is registered with ClientCursor's list of active
    // runners; 'safety' deregisters it again even if the code below throws.
    ClientCursor::registerRunner(runner.get());
    runner->setYieldPolicy(Runner::YIELD_AUTO);
    auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> safety(
        new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

    // Needed both inside the loop and afterwards when building the explain reply.
    const bool isExplain = pq.isExplain();

    // TODO: only one type of 2d search doesn't support this.  We need a way to pull it out
    // of CanonicalQuery. :(
    const bool supportsGetMore = true;

    BSONObj obj;
    Runner::RunnerState state;
    for (;;) {
        state = runner->getNext(&obj, NULL);
        if (Runner::RUNNER_ADVANCED != state) {
            break;
        }

        // When sharded, skip documents whose shard key does not belong to us.  The key
        // pattern is re-read on every iteration because this information can change if
        // we yield.  (Creates more BSONObjs than strictly necessary; OK for now.)
        if (collMetadata) {
            KeyPattern kp(collMetadata->getKeyPattern());
            if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) {
                continue;
            }
        }

        // Explain replies do not carry the documents themselves.
        if (!isExplain) {
            bb.appendBuf((void*)obj.objdata(), obj.objsize());
        }
        ++numResults;

        // For oplog replay, remember the timestamp of the last document read.
        if (pq.hasOption(QueryOption_OplogReplay)) {
            BSONElement tsElt = obj["ts"];
            if (Date == tsElt.type() || Timestamp == tsElt.type()) {
                slaveReadTill = tsElt._opTime();
            }
        }

        // Decide whether this batch is complete.
        if (isExplain) {
            if (enoughForExplain(pq, numResults)) {
                break;
            }
            continue;
        }
        if (!supportsGetMore
            && (enough(pq, numResults) || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
            break;
        }
        if (enoughForFirstBatch(pq, numResults, bb.len())) {
            // A single requested result is assumed to be a findOne(): no cursor is kept.
            if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                saveClientCursor = true;
            }
            break;
        }
    }

    // Deregister unconditionally: a cached runner receives notifications by virtue of
    // being cached, and an uncached one is about to be deleted.
    safety.reset();

    // Callers expect an exception in certain cases, e.g. an in-memory sort using too
    // much RAM.
    if (Runner::RUNNER_ERROR == state) {
        uasserted(17144, "Runner error, memory limit for sort probably exceeded");
    }

    if (Runner::RUNNER_DEAD == state) {
        // No point in saving a dead runner.
        saveClientCursor = false;
    }
    else if (pq.hasOption(QueryOption_CursorTailable) && (1 != pq.getNumToReturn())) {
        // For a tailable query the only plan the planner outputs is a collscan with
        // tailable set.
        saveClientCursor = true;
    }

    // TODO(greg): This will go away soon.
    if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(versionAtStart)) {
        // The version changed while the query ran, so data may be missing.  It is safe
        // to report this because mongos can resend at this point.
        throw SendStaleConfigException(pq.ns(),
                                       "version changed during initial query",
                                       versionAtStart,
                                       shardingState.getVersion(pq.ns()));
    }

    long long ccId = 0;
    if (saveClientCursor) {
        // The runner is idle until the next getMore.
        runner->saveState();

        // The ClientCursor ctor inserts the new cursor into a global map, so it is not
        // leaked even though nothing here deletes it.
        ClientCursor* cc = new ClientCursor(runner.get(),
                                            cq->getParsed().getOptions(),
                                            cq->getParsed().getFilter());
        ccId = cc->cursorid();

        log() << "caching runner with cursorid " << ccId
              << " after returning " << numResults << " results" << endl;

        // The ClientCursor now owns the runner; give up our ownership so it is not
        // deleted when this function returns.
        runner.release();

        // TODO document
        if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
            cc->slaveReadTill(slaveReadTill);
        }

        // TODO document
        if (pq.hasOption(QueryOption_Exhaust)) {
            curop.debug().exhaust = true;
        }

        // State that getMore reads back from the cursor.
        cc->setCollMetadata(collMetadata);
        cc->setPos(numResults);

        // Whatever is left of the query's time limit rolls over to future getMores.
        cc->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
    }

    // For explain, the reply is one document reporting the number of results.
    if (isExplain) {
        BSONObjBuilder bob;
        bob.append("n", numResults);
        BSONObj explainObj = bob.done();
        bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());

        // The explain output is itself the (single) result.
        numResults = 1;
    }

    // Move the collected bytes into the outgoing message.
    result.appendData(bb.buf(), bb.len());
    bb.decouple();

    // Fill out the reply header.
    QueryResult* qr = static_cast<QueryResult*>(result.header());
    qr->cursorId = ccId;
    curop.debug().cursorid = (0 == ccId ? -1 : ccId);
    qr->setResultFlagsToOk();
    qr->setOperation(opReply);
    qr->startingFrom = 0;
    qr->nReturned = numResults;

    // TODO: nscanned is bogus.
    // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
    curop.debug().ntoskip = pq.getSkip();
    curop.debug().nreturned = numResults;

    // curop.debug().exhaust was set above when the exhaust option is in use.
    if (curop.debug().exhaust) {
        return pq.ns();
    }
    return "";
}
/**
 * Also called by db/ops/query.cpp.  This is the new getMore entry point.
 *
 * ns - namespace supplied by the client; must equal the namespace stored on the cursor.
 * ntoreturn - maximum number of documents to return in this batch.  Zero means no count
 *     limit, in which case only MaxBytesToReturnToClientAtOnce bounds the batch.
 * cursorid - id of the ClientCursor to continue reading from.
 * curop - current operation; receives the cursor's leftover time limit.
 * pass - when QueryOption_AwaitData is in use, the caller makes repeated calls while this
 *     method returns 0, incrementing pass on each call.  Thus pass == 0 indicates the
 *     first "attempt" before any 'awaiting'.
 * exhaust - out: set to whether the cursor was created with QueryOption_Exhaust and is
 *     still being cached; false otherwise.
 * isCursorAuthorized - out: set to true once the cursor was found and the ns check passed.
 *
 * Returns the reply buffer (header plus documents), decoupled from its builder so the
 * caller is responsible for it.  Returns 0 if a tailable awaitData cursor hit EOF without
 * producing anything and pass < 1000, which asks the caller to try again.
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;

    // Reply buffer; space for the header is reserved up front and filled in at the end.
    int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;
    BufBuilder bb(bufSize);
    bb.skip(sizeof(QueryResult));

    // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
    Client::ReadContext ctx(ns);

    //log() << "running getMore in new system, cursorid " << cursorid << endl;

    // This checks to make sure the operation is allowed on a replicated node.  Since we are
    // not passing in a query object (necessary to check SlaveOK query option), the only
    // state where reads are allowed is PRIMARY (or master in master/slave).  This function
    // uasserts if reads are not okay.
    replVerifyReadsOk();

    // A pin performs a CC lookup and if there is a CC, increments the CC's pin value so it
    // doesn't time out.  Also informs ClientCursor that there is somebody actively holding
    // the CC, so don't delete it.
    ClientCursorPin ccPin(cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;
    int numResults = 0;
    int startingResult = 0;

    if (NULL == cc) {
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Quote: check for spoofing of the ns such that it does not match the one
        // originally there for the cursor
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // TODO:
        // curop.debug().query = BSONForQuery
        // curop.setQuery(curop.debug().query);

        // Only update the slave location on the first attempt, not on awaitData retries.
        if (0 == pass) {
            cc->updateSlaveLocation(curop);
        }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        Runner* runner = cc->getRunner();
        const int queryOptions = cc->queryOptions();

        // Get results out of the runner.
        runner->restoreState();

        BSONObj obj;
        Runner::RunnerState state;
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // If we're sharded make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                KeyPattern kp(collMetadata->getKeyPattern());
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) {
                    continue;
                }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // Stop once the batch is full.  The count limit only applies when the client
            // asked for one: ntoreturn == 0 means "no limit", so the guard must test
            // ntoreturn.  Testing numResults here (always >= 1 at this point) would end
            // every unlimited batch after a single document.  This matches the check in
            // processGetMore.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        if (Runner::RUNNER_EOF == state && 0 == numResults
            && (queryOptions & QueryOption_CursorTailable)
            && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
            // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
            // data some # of times first.
            return 0;
        }
        else if (Runner::RUNNER_DEAD == state || Runner::RUNNER_EOF == state) {
            ccPin.free();
            // cc is now invalid, as is the runner
            cursorid = 0;
            cc = NULL;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            runner->saveState();

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill out the header at the front of the buffer, then detach the buffer from the
    // builder so it outlives this function.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    bb.decouple();
    return qr;
}
/**
 * Answers a getMore request.
 *
 * If the new query framework is enabled, or the cursor found under 'cursorid' is backed by
 * a Runner, the request is forwarded to newGetMore().  Otherwise the legacy Cursor held by
 * the ClientCursor is advanced, and matching documents are appended to the reply until
 * 'ntoreturn' documents (when non-zero) or more than MaxBytesToReturnToClientAtOnce bytes
 * have been collected, or the cursor is exhausted.
 *
 * Returns the reply buffer, decoupled from its builder.  Returns 0 when a tailable cursor
 * with QueryOption_AwaitData has no data yet, nothing has been collected and pass < 1000.
 * 'exhaust' and '*isCursorAuthorized' are outputs.
 */
QueryResult* processGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                            int pass, bool& exhaust, bool* isCursorAuthorized ) {
    if (isNewQueryFrameworkEnabled()) {
        return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, isCursorAuthorized);
    }

    exhaust = false;

    // Reply buffer; the header slot is skipped now and filled in at the very end.
    int replyCapacity = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;
    BufBuilder replyBuf( replyCapacity );
    replyBuf.skip(sizeof(QueryResult));

    int resultFlags = ResultFlag_AwaitCapable;
    int startingFrom = 0;
    int numReturned = 0;

    scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns));

    // Checked while read-locked so the state cannot change underneath us.
    replVerifyReadsOk();

    ClientCursorPin pin(cursorid);
    ClientCursor *cc = pin.c();
    if ( unlikely(!cc) ) {
        LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Some internal users create a ClientCursor with a Runner.  Rather than crash,
        // hand those over to the new framework.
        if (NULL != cc->getRunner()) {
            pin.release();
            return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust,
                              isCursorAuthorized);
        }

        // Guard against a spoofed ns that differs from the one the cursor was opened on.
        uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

        *isCursorAuthorized = true;

        // Must come after the auth check so that cleanup happens properly.
        uassert(16951, "failing getmore due to set failpoint",
                !MONGO_FAIL_POINT(getMoreError));

        // Apply whatever time limit the originating operation left over.
        curop.setMaxTimeMicros( cc->getLeftoverMaxTimeMicros() );

        if ( pass == 0 ) {
            cc->updateSlaveLocation( curop );
        }

        int queryOptions = cc->queryOptions();

        curop.debug().query = cc->query();
        curop.setQuery( cc->query() );

        startingFrom = cc->pos();
        Cursor *cursor = cc->c();

        if (!cursor->requiresLock()) {
            // Make sure the cursor cannot be destroyed under us before dropping the lock.
            fassert(16952, !cursor->shouldDestroyOnNSDeletion());
            fassert(16953, !cursor->supportYields());
            ctx.reset(); // unlocks
        }

        cursor->recoverFromYield();
        DiskLoc lastLoc;

        // Possibly stale, but reflects the state of chunking when the cursor was created.
        CollectionMetadataPtr collMetadata = cc->getCollMetadata();
        KeyPattern keyPattern( collMetadata ? collMetadata->getKeyPattern() : BSONObj() );

        for (;;) {
            if ( !cursor->ok() ) {
                if ( !cursor->tailable() ) {
                    // A non-tailable cursor that is no longer ok is finished: erase it.
                    pin.release();
                    bool ok = ClientCursor::erase(cursorid);
                    verify(ok);
                    cursorid = 0;
                    cc = 0;
                    break;
                }

                // When a tailable cursor hits "EOF", ok() goes false and current() is
                // null, but advance() can still be retried as a reactivation attempt; it
                // returns true once there is new data.
                if ( cursor->advance() ) {
                    continue;
                }
                if ( numReturned == 0 && (queryOptions & QueryOption_AwaitData)
                     && pass < 1000 ) {
                    return 0;
                }
                break;
            }

            MatchDetails matchDetails;
            if ( cc->fields
                 && cc->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                // The projection contains a positional array operator.
                matchDetails.requestElemMatchKey();
            }

            // In some cases (clone collection) there won't be a matcher.
            if ( cursor->currentMatches( &matchDetails ) ) {
                if ( collMetadata
                     && !collMetadata->keyBelongsToMe( extractKey(cursor, keyPattern ) ) ) {
                    LOG(2) << "cursor skipping document in un-owned chunk: "
                           << cursor->current() << endl;
                }
                else if ( !cursor->getsetdup(cursor->currLoc()) ) {
                    // Not a duplicate: emit it.
                    lastLoc = cursor->currLoc();
                    numReturned++;

                    // Fill out the fields requested by the query.
                    const Projection::KeyOnly *keyOnly = cursor->keyFieldsOnly();
                    if ( keyOnly ) {
                        fillQueryResultFromObj( replyBuf, 0,
                                                keyOnly->hydrate( cursor->currKey() ),
                                                &matchDetails );
                    }
                    else {
                        DiskLoc loc = cursor->currLoc();
                        fillQueryResultFromObj(
                            replyBuf, cc->fields.get(), cursor->current(), &matchDetails,
                            ( ( cc->pq.get() && cc->pq->showDiskLoc() ) ? &loc : 0 ) );
                    }

                    // Batch full: step past the emitted document and stop.
                    if ( ( ntoreturn && numReturned >= ntoreturn )
                         || replyBuf.len() > MaxBytesToReturnToClientAtOnce ) {
                        cursor->advance();
                        cc->incPos( numReturned );
                        break;
                    }
                }
            }

            cursor->advance();

            // If yieldSometimes() reports failure, erase the cursor and stop.
            if ( ! cc->yieldSometimes( ( cursor->ok() && cursor->keyFieldsOnly() )
                                           ? ClientCursor::DontNeed
                                           : ClientCursor::WillNeed ) ) {
                ClientCursor::erase(cursorid);
                cursorid = 0;
                cc = 0;
                break;
            }
        }

        // The cursor survives: record its position for the next getMore.
        if ( cc ) {
            if ( cursor->supportYields() ) {
                ClientCursor::YieldData data;
                verify( cc->prepareToYield( data ) );
            }
            else {
                cc->c()->noteLocation();
            }
            cc->storeOpForSlave( lastLoc );
            exhaust = cc->queryOptions() & QueryOption_Exhaust;

            // Remaining time "rolls over" back to the cursor for future getMore ops.
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill out the header at the front of the buffer and detach it from the builder.
    QueryResult *qr = reinterpret_cast<QueryResult*>(replyBuf.buf());
    qr->len = replyBuf.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingFrom;
    qr->nReturned = numReturned;
    replyBuf.decouple();

    return qr;
}
bool receivedGetMore(DbResponse& dbresponse, Message& m, CurOp& curop ) { bool ok = true; DbMessage d(m); const char *ns = d.getns(); int ntoreturn = d.pullInt(); long long cursorid = d.pullInt64(); curop.debug().ns = ns; curop.debug().ntoreturn = ntoreturn; curop.debug().cursorid = cursorid; shared_ptr<AssertionException> ex; scoped_ptr<Timer> timer; int pass = 0; bool exhaust = false; QueryResult* msgdata = 0; GTID last; bool isOplog = false; while( 1 ) { bool isCursorAuthorized = false; try { uassert( 16258, str::stream() << "Invalid ns [" << ns << "]", NamespaceString::isValid(ns) ); Status status = cc().getAuthorizationManager()->checkAuthForGetMore(ns); uassert(16543, status.reason(), status.isOK()); // I (Zardosht), am not crazy about this, but I cannot think of // better alternatives at the moment. The high level goal is to find // a way to do a wait without having a read lock held // via Client::ReadContext. Unfortunately, we can't get the exact position // of the cursor without accessing it, which required a read lock. // So, we do this, which is a good estimate. // // Note this is similar to what vanilla MongoDB does. // // in the first pass, we extract the minimum live GTID. This must be // greater than or equal to the existing cursor's position. // In the second pass, we wait for the GTID manager to have a // minumum live GTID greater than what we saw in the first pass. // This new GTID will be greater than the cursor's starting position, // and therefore the cursor should have more data to look at. // It is theoretically possible that one day, the cursor will still // return no new data because all new GTIDs in between these // two values aborted, but that is not possible right now. Any GTID // assigned is done so with the intent to commit, and tokumx // aborts if a coommit is not successful. 
if (str::startsWith(ns, "local.oplog.") && theReplSet){ isOplog = true; if (pass == 0) { last = theReplSet->gtidManager->getMinLiveGTID(); } else { theReplSet->gtidManager->waitForDifferentMinLive( last, 2000 // ms, this will be called twice ); } } LOCK_REASON(lockReason, "getMore"); Client::ReadContext ctx(ns, lockReason); // call this readlocked so state can't change replVerifyReadsOk(); msgdata = processGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, &isCursorAuthorized); } catch ( AssertionException& e ) { if ( isCursorAuthorized ) { // If a cursor with id 'cursorid' was authorized, it may have been advanced // before an exception terminated processGetMore. Erase the ClientCursor // because it may now be out of sync with the client's iteration state. // SERVER-7952 // TODO Temporary code, see SERVER-4563 for a cleanup overview. ClientCursor::erase( cursorid ); } ex.reset( new AssertionException( e.getInfo().msg, e.getCode() ) ); ok = false; break; } pass++; if (msgdata == 0) { // this should only happen with QueryOption_AwaitData exhaust = false; massert(13073, "shutting down", !inShutdown() ); if (!isOplog) { if ( ! timer ) { timer.reset( new Timer() ); } else { if ( timer->seconds() >= 4 ) { // after about 4 seconds, return. pass stops at 1000 normally. // we want to return occasionally so slave can checkpoint. pass = 10000; } } if (debug) { sleepmillis(20); } else { sleepmillis(2); } } else { // in the case where we are the oplog, using // waitForDifferentMinLive is sufficient. That // waits for 4 seconds, as the timer above does. // So, we we don't need more than 2 passes. if (pass > 1) { pass = 10000; } } // should eventually clean this up a bit if (isOplog) { curop.setExpectedLatencyMs( 4100 ); } else { // not sure if this 1100 is still wise. 
curop.setExpectedLatencyMs( 1100 + timer->millis() ); } continue; } break; }; if (ex) { exhaust = false; BSONObjBuilder err; ex->getInfo().append( err ); BSONObj errObj = err.done(); if (!ex->interrupted()) { log() << errObj << endl; } curop.debug().exceptionInfo = ex->getInfo(); if (ex->getCode() == 13436) { replyToQuery(ResultFlag_ErrSet, m, dbresponse, errObj); curop.debug().responseLength = dbresponse.response->header()->dataLen(); curop.debug().nreturned = 1; return ok; } msgdata = emptyMoreResult(cursorid); } Message *resp = new Message(); resp->setData(msgdata, true); curop.debug().responseLength = resp->header()->dataLen(); curop.debug().nreturned = msgdata->nReturned; dbresponse.response = resp; dbresponse.responseTo = m.header()->id; if( exhaust ) { curop.debug().exhaust = true; dbresponse.exhaustNS = ns; } return ok; }