/**
 * Removes a single range of orphaned documents from 'ns'.  The range removed is the first
 * orphan range reported by the collection metadata for 'startingFromKeyConst'; when that key
 * is empty the search begins at the metadata's minimum key.
 *
 * @param txn                   operation context handed through to the range deleter
 * @param ns                    namespace to clean
 * @param startingFromKeyConst  key to start searching from, or an empty object
 * @param secondaryThrottle     write concern handed through to the range deleter
 * @param stoppedAtKey          receives the max key of the orphan range that was selected
 * @param errMsg                receives a description of the failure on CleanupResult_Error
 *
 * @return CleanupResult_Continue and 'stoppedAtKey' if an orphaned range was found and cleaned
 * @return CleanupResult_Done if no orphaned ranges remain, or the collection is not sharded
 * @return CleanupResult_Error and 'errMsg' if the start key is invalid or the delete failed
 */
CleanupResult cleanupOrphanedData( OperationContext* txn,
                                   const NamespaceString& ns,
                                   const BSONObj& startingFromKeyConst,
                                   const WriteConcernOptions& secondaryThrottle,
                                   BSONObj* stoppedAtKey,
                                   string* errMsg ) {

    const string nsString = ns.toString();

    CollectionMetadataPtr collMetadata = shardingState.getCollectionMetadata( nsString );

    // No metadata, or metadata without a key pattern, means there is nothing to clean.
    if ( !collMetadata || collMetadata->getKeyPattern().isEmpty() ) {
        warning() << "skipping orphaned data cleanup for " << nsString
                  << ", collection is not sharded" << endl;
        return CleanupResult_Done;
    }

    BSONObj shardKeyPattern = collMetadata->getKeyPattern();

    // Resolve the effective start key: an empty request means "start at the minimum key",
    // anything else has to be a valid key for this collection.
    BSONObj startKey = startingFromKeyConst;
    if ( startKey.isEmpty() ) {
        startKey = collMetadata->getMinKey();
    }
    else if ( !collMetadata->isValidKey( startKey ) ) {
        *errMsg = stream() << "could not cleanup orphaned data, start key "
                           << startKey
                           << " does not match shard key pattern " << shardKeyPattern;
        warning() << *errMsg << endl;
        return CleanupResult_Error;
    }

    KeyRange nextOrphan;
    const bool foundOrphan = collMetadata->getNextOrphanRange( startKey, &nextOrphan );
    if ( !foundOrphan ) {
        LOG( 1 ) << "orphaned data cleanup requested for " << nsString
                 << " starting from " << startKey
                 << ", no orphan ranges remain" << endl;
        return CleanupResult_Done;
    }

    // Report where this pass ends before attempting the delete, so the caller receives it
    // regardless of the delete's outcome.
    *stoppedAtKey = nextOrphan.maxKey;

    // The metadata snapshot is not needed past this point, whatever happens next.
    collMetadata.reset();

    LOG( 1 ) << "orphaned data cleanup requested for " << nsString
             << " starting from " << startKey
             << ", removing next orphan range"
             << " [" << nextOrphan.minKey << "," << nextOrphan.maxKey << ")"
             << endl;

    // Metadata snapshot may be stale now, but deleter checks metadata again in write lock
    // before delete.
    const bool deleted = getDeleter()->deleteNow( txn,
                                                  nsString,
                                                  nextOrphan.minKey,
                                                  nextOrphan.maxKey,
                                                  shardKeyPattern,
                                                  secondaryThrottle,
                                                  errMsg );
    if ( !deleted ) {
        warning() << *errMsg << endl;
        return CleanupResult_Error;
    }

    return CleanupResult_Continue;
}
/**
 * Produces a Runner able to answer 'rawCanonicalQuery' against 'collection' and stores it
 * in '*out'.  Depending on the collection, the plan cache and the planner output, the result
 * is an EOFRunner, an IDHackRunner, a CachedPlanRunner, a SingleSolutionRunner or a
 * MultiPlanRunner.
 *
 * Ownership: 'rawCanonicalQuery' is adopted immediately.  On success it is handed to the
 * returned runner; on an error return it is destroyed here.  '*out' is only written on
 * success, and the caller owns the runner stored there.
 *
 * @param collection         collection to query; may be NULL, in which case an EOFRunner
 *                           is returned
 * @param rawCanonicalQuery  the query; must not be NULL
 * @param out                receives the new runner
 * @param plannerOptions     initial QueryPlannerParams option bits; further bits are added
 *                           or cleared below
 * @return Status::OK() on success, otherwise a BadValue status describing the problem
 */
Status getRunner(Collection* collection,
                 CanonicalQuery* rawCanonicalQuery,
                 Runner** out,
                 size_t plannerOptions) {
    verify(rawCanonicalQuery);
    auto_ptr<CanonicalQuery> cq(rawCanonicalQuery);

    // Internal clients may call us without a collection.  There is nothing to read then.
    if (NULL == collection) {
        const string& ns = cq->ns();
        *out = new EOFRunner(cq.release(), ns);
        return Status::OK();
    }

    // With an _id index available, a qualifying query goes straight to the idhack runner.
    if (canUseIDHack(*cq) && collection->getIndexCatalog()->findIdIndex()) {
        *out = new IDHackRunner(collection, cq.release());
        return Status::OK();
    }

    // The collection exists, so it may have indices.  Describe each one to the planner.
    QueryPlannerParams params;
    for (IndexCatalog::IndexIterator indexIt =
             collection->getIndexCatalog()->getIndexIterator(false);
         indexIt.more(); ) {
        const IndexDescriptor* descriptor = indexIt.next();
        params.indices.push_back(IndexEntry(descriptor->keyPattern(),
                                            descriptor->isMultikey(),
                                            descriptor->isSparse(),
                                            descriptor->indexName(),
                                            descriptor->infoObj()));
    }

    // If an admin hint is registered for this query, restrict the candidate indices to the
    // allowed ones and tell the planner to ignore any application hint.
    QuerySettings* settings = collection->infoCache()->getQuerySettings();
    AllowedIndices* rawAllowed;
    if (settings->getAllowedIndices(*cq, &rawAllowed)) {
        boost::scoped_ptr<AllowedIndices> allowed(rawAllowed);
        filterAllowedIndexEntries(*allowed, &params.indices);
        params.adminHintApplied = true;
    }

    // Tailable cursors require a capped collection and, if sorted at all, a {$natural: 1}
    // sort.
    if (cq->getParsed().hasOption(QueryOption_CursorTailable)) {
        if (!collection->isCapped()) {
            return Status(ErrorCodes::BadValue,
                          "error processing query: " + cq->toString() +
                          " tailable cursor requested on non capped collection");
        }

        const BSONObj& requestedSort = cq->getParsed().getSort();
        if (!requestedSort.isEmpty()) {
            const BSONObj naturalSort = BSON("$natural" << 1);
            if (!(requestedSort == naturalSort)) {
                return Status(ErrorCodes::BadValue,
                              "error processing query: " + cq->toString() +
                              " invalid sort specified for tailable cursor: " +
                              requestedSort.toString());
            }
        }
    }

    // Start from the caller's planner options and adjust them.
    params.options = plannerOptions;

    if (storageGlobalParams.noTableScan) {
        const string& ns = cq->ns();
        // The no-table-scan restriction is waived for empty queries, for system
        // collections and for the local database.
        const bool enforce = !cq->getQueryObj().isEmpty()
                             && (string::npos == ns.find(".system."))
                             && (0 != ns.find("local."));
        if (enforce) {
            params.options |= QueryPlannerParams::NO_TABLE_SCAN;
        }
    }

    if (0 == (params.options & QueryPlannerParams::NO_TABLE_SCAN)) {
        params.options |= QueryPlannerParams::INCLUDE_COLLSCAN;
    }

    // A shard filter only makes sense when we actually have sharding metadata.
    if (params.options & QueryPlannerParams::INCLUDE_SHARD_FILTER) {
        CollectionMetadataPtr shardMetadata =
            shardingState.getCollectionMetadata(cq->ns());
        if (!shardMetadata) {
            // Without metadata the key pattern is unknown, so drop the shard filter request.
            params.options &= ~QueryPlannerParams::INCLUDE_SHARD_FILTER;
        }
        else {
            params.shardKey = shardMetadata->getKeyPattern();
        }
    }

    // Try to look up a cached solution for the query.
    //
    // Skip cache look up for non-cacheable queries.
    // See PlanCache::shouldCacheQuery()
    //
    // TODO: Can the cache have negative data about a solution?
    CachedSolution* rawCachedSolution;
    if (PlanCache::shouldCacheQuery(*cq)
        && collection->infoCache()->getPlanCache()->get(*cq, &rawCachedSolution).isOK()) {
        // A cached entry exists.  Ask the planner to turn it back into a QuerySolution.
        boost::scoped_ptr<CachedSolution> cachedSolution(rawCachedSolution);
        QuerySolution* qs;
        QuerySolution* backupQs;
        Status cacheStatus = QueryPlanner::planFromCache(*cq, params, *cachedSolution,
                                                         &qs, &backupQs);
        if (cacheStatus.isOK()) {
            WorkingSet* ws;
            PlanStage* root;
            verify(StageBuilder::build(*qs, &root, &ws));
            CachedPlanRunner* cachedRunner =
                new CachedPlanRunner(cq.release(), qs, root, ws);

            if (NULL != backupQs) {
                WorkingSet* backupWs;
                PlanStage* backupRoot;
                verify(StageBuilder::build(*backupQs, &backupRoot, &backupWs));
                cachedRunner->setBackupPlan(backupQs, backupRoot, backupWs);
            }

            *out = cachedRunner;
            return Status::OK();
        }
        // Planning from the cache failed: fall through and plan from scratch.
    }

    params.options |= QueryPlannerParams::INDEX_INTERSECTION;
    params.options |= QueryPlannerParams::KEEP_MUTATIONS;

    vector<QuerySolution*> solutions;
    Status planStatus = QueryPlanner::plan(*cq, params, &solutions);
    if (!planStatus.isOK()) {
        return Status(ErrorCodes::BadValue,
                      "error processing query: " + cq->toString() +
                      " planner returned error: " + planStatus.reason());
    }

    /*
    for (size_t i = 0; i < solutions.size(); ++i) {
        QLOG() << "solution " << i << " is " << solutions[i]->toString() << endl;
    }
    */

    const size_t numSolutions = solutions.size();

    // We cannot figure out how to answer the query.  Should this ever happen?
    if (0 == numSolutions) {
        return Status(ErrorCodes::BadValue,
                      "error processing query: " + cq->toString() +
                      " No query solutions");
    }

    if (1 == numSolutions) {
        // Exactly one candidate plan: build its stages and run it directly.
        WorkingSet* ws;
        PlanStage* root;
        verify(StageBuilder::build(*solutions[0], &root, &ws));

        *out = new SingleSolutionRunner(cq.release(), solutions[0], root, ws);
        return Status::OK();
    }

    // Several candidates.  Let the MultiPlanRunner pick the best, update the cache, and so
    // on.
    auto_ptr<MultiPlanRunner> multiRunner(new MultiPlanRunner(cq.release()));
    for (size_t i = 0; i < numSolutions; ++i) {
        QuerySolution* candidate = solutions[i];

        if (candidate->cacheData.get()) {
            candidate->cacheData->adminHintApplied = params.adminHintApplied;
        }

        WorkingSet* ws;
        PlanStage* root;
        verify(StageBuilder::build(*candidate, &root, &ws));

        // Takes ownership of all arguments.
        multiRunner->addPlan(candidate, root, ws);
    }

    *out = multiRunner.release();
    return Status::OK();
}
bool mergeChunks(OperationContext* txn, const NamespaceString& nss, const BSONObj& minKey, const BSONObj& maxKey, const OID& epoch, string* errMsg) { // // Get sharding state up-to-date // ConnectionString configLoc = ConnectionString::parse(shardingState.getConfigServer(), *errMsg); if (!configLoc.isValid()) { warning() << *errMsg << endl; return false; } // // Get the distributed lock // string whyMessage = stream() << "merging chunks in " << nss.ns() << " from " << minKey << " to " << maxKey; auto scopedDistLock = grid.catalogManager()->getDistLockManager()->lock(nss.ns(), whyMessage); if (!scopedDistLock.isOK()) { *errMsg = stream() << "could not acquire collection lock for " << nss.ns() << " to merge chunks in [" << minKey << "," << maxKey << ")" << causedBy(scopedDistLock.getStatus()); warning() << *errMsg << endl; return false; } // // We now have the collection lock, refresh metadata to latest version and sanity check // ChunkVersion shardVersion; Status status = shardingState.refreshMetadataNow(txn, nss.ns(), &shardVersion); if (!status.isOK()) { *errMsg = str::stream() << "could not merge chunks, failed to refresh metadata for " << nss.ns() << causedBy(status.reason()); warning() << *errMsg << endl; return false; } if (epoch.isSet() && shardVersion.epoch() != epoch) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has changed" << " since merge was sent" << "(sent epoch : " << epoch.toString() << ", current epoch : " << shardVersion.epoch().toString() << ")"; warning() << *errMsg << endl; return false; } CollectionMetadataPtr metadata = shardingState.getCollectionMetadata(nss.ns()); if (!metadata || metadata->getKeyPattern().isEmpty()) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " is not sharded"; warning() << *errMsg << endl; return false; } dassert(metadata->getShardVersion().equals(shardVersion)); if (!metadata->isValidKey(minKey) || !metadata->isValidKey(maxKey)) { *errMsg = stream() << 
"could not merge chunks, the range " << rangeToString(minKey, maxKey) << " is not valid" << " for collection " << nss.ns() << " with key pattern " << metadata->getKeyPattern(); warning() << *errMsg << endl; return false; } // // Get merged chunk information // ChunkVersion mergeVersion = metadata->getCollVersion(); mergeVersion.incMinor(); std::vector<ChunkType> chunksToMerge; ChunkType itChunk; itChunk.setMin(minKey); itChunk.setMax(minKey); itChunk.setNS(nss.ns()); itChunk.setShard(shardingState.getShardName()); while (itChunk.getMax().woCompare(maxKey) < 0 && metadata->getNextChunk(itChunk.getMax(), &itChunk)) { chunksToMerge.push_back(itChunk); } if (chunksToMerge.empty()) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " and ending at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } // // Validate the range starts and ends at chunks and has no holes, error if not valid // BSONObj firstDocMin = chunksToMerge.front().getMin(); BSONObj firstDocMax = chunksToMerge.front().getMax(); // minKey is inclusive bool minKeyInRange = rangeContains(firstDocMin, firstDocMax, minKey); if (!minKeyInRange) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range starting at " << minKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } BSONObj lastDocMin = chunksToMerge.back().getMin(); BSONObj lastDocMax = chunksToMerge.back().getMax(); // maxKey is exclusive bool maxKeyInRange = lastDocMin.woCompare(maxKey) < 0 && lastDocMax.woCompare(maxKey) >= 0; if (!maxKeyInRange) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " range ending at " << maxKey << " does not belong to shard " << shardingState.getShardName(); warning() << *errMsg << endl; return false; } bool validRangeStartKey = firstDocMin.woCompare(minKey) == 0; bool 
validRangeEndKey = lastDocMax.woCompare(maxKey) == 0; if (!validRangeStartKey || !validRangeEndKey) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " does not contain a chunk " << (!validRangeStartKey ? "starting at " + minKey.toString() : "") << (!validRangeStartKey && !validRangeEndKey ? " or " : "") << (!validRangeEndKey ? "ending at " + maxKey.toString() : ""); warning() << *errMsg << endl; return false; } if (chunksToMerge.size() == 1) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " already contains chunk for " << rangeToString(minKey, maxKey); warning() << *errMsg << endl; return false; } // Look for hole in range for (size_t i = 1; i < chunksToMerge.size(); ++i) { if (chunksToMerge[i - 1].getMax().woCompare(chunksToMerge[i].getMin()) != 0) { *errMsg = stream() << "could not merge chunks, collection " << nss.ns() << " has a hole in the range " << rangeToString(minKey, maxKey) << " at " << rangeToString(chunksToMerge[i - 1].getMax(), chunksToMerge[i].getMin()); warning() << *errMsg << endl; return false; } } // // Run apply ops command // Status applyOpsStatus = runApplyOpsCmd(chunksToMerge, shardVersion, mergeVersion); if (!applyOpsStatus.isOK()) { warning() << applyOpsStatus; return false; } // // Install merged chunk metadata // { ScopedTransaction transaction(txn, MODE_IX); Lock::DBLock writeLk(txn->lockState(), nss.db(), MODE_IX); Lock::CollectionLock collLock(txn->lockState(), nss.ns(), MODE_X); shardingState.mergeChunks(txn, nss.ns(), minKey, maxKey, mergeVersion); } // // Log change // BSONObj mergeLogEntry = buildMergeLogEntry(chunksToMerge, shardVersion, mergeVersion); grid.catalogManager()->logChange( txn->getClient()->clientAddress(true), "merge", nss.ns(), mergeLogEntry); return true; }
/**
 * This is called by db/ops/query.cpp.  This is the entry point for answering a query.
 *
 * Obtains a runner for 'q', pulls the first batch of results from it into 'result', and,
 * when more results may be wanted, parks the runner in a ClientCursor for later getMore
 * calls.
 *
 * @param m       the incoming message (not read by this function)
 * @param q       the parsed query message
 * @param curop   current operation; its debug info is filled in and its remaining time
 *                limit is rolled over to the saved cursor
 * @param result  receives the reply data and header
 * @return the query's namespace if the exhaust option was requested and a cursor was saved,
 *         otherwise an empty string
 *
 * Fails with uassert 17007 if no runner can be built, and throws SendStaleConfigException
 * if the sharding version of the namespace changed while the query ran.
 */
string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
    // This is a read lock.
    Client::ReadContext ctx(q.ns, dbpath);

    // Parse, canonicalize, plan, transcribe, and get a runner.
    Runner* rawRunner;
    Status status = getRunner(q, &rawRunner);
    if (!status.isOK()) {
        uasserted(17007, "Couldn't process query " + q.query.toString()
                         + " why: " + status.reason());
    }
    verify(NULL != rawRunner);
    auto_ptr<Runner> runner(rawRunner);

    // We freak out later if this changes before we're done with the query.
    const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns);

    // We use this a lot below.  It refers into the runner's query, so it is only valid
    // while the runner is alive.
    const ParsedQuery& pq = runner->getQuery().getParsed();

    // TODO: Document why we do this.
    replVerifyReadsOk(&pq);

    // If this exists, the collection is sharded.
    // If it doesn't exist, we can assume we're not sharded.
    // If we're sharded, we might encounter data that is not consistent with our sharding
    // state.  We must ignore this data.
    CollectionMetadataPtr collMetadata;
    if (!shardingState.needCollectionMetadata(pq.ns())) {
        collMetadata = CollectionMetadataPtr();
    }
    else {
        collMetadata = shardingState.getCollectionMetadata(pq.ns());
    }

    // Run the query.  Space for the reply header is reserved at the front of the buffer.
    BufBuilder bb(32768);
    bb.skip(sizeof(QueryResult));

    // How many results have we obtained from the runner?
    int numResults = 0;

    // If we're replaying the oplog, we save the last time that we read.
    OpTime slaveReadTill;

    // Do we save the Runner in a ClientCursor for getMore calls later?
    bool saveClientCursor = false;

    BSONObj obj;
    // TODO: Differentiate EOF from error.
    while (runner->getNext(&obj)) {
        // If we're sharded make sure that we don't return any data that hasn't been migrated
        // off of our shard yet.
        if (collMetadata) {
            // This information can change if we yield and as such we must make sure to
            // re-fetch it if we yield.
            KeyPattern kp(collMetadata->getKeyPattern());
            // This performs excessive BSONObj creation but that's OK for now.
            if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
        }

        // Add result to output buffer.
        bb.appendBuf((void*)obj.objdata(), obj.objsize());

        // Count the result.
        ++numResults;

        // Possibly note slave's position in the oplog.
        if (pq.hasOption(QueryOption_OplogReplay)) {
            BSONElement e = obj["ts"];
            if (Date == e.type() || Timestamp == e.type()) {
                slaveReadTill = e._opTime();
            }
        }

        // TODO: only one type of 2d search doesn't support this.  We need a way to pull it
        // out of CanonicalQuery. :(
        const bool supportsGetMore = true;
        const bool isExplain = pq.isExplain();
        if (isExplain && pq.enoughForExplain(numResults)) {
            break;
        }
        else if (!supportsGetMore && (pq.enough(numResults)
                                      || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
            break;
        }
        else if (pq.enoughForFirstBatch(numResults, bb.len())) {
            // If only one result requested assume it's a findOne() and don't save the cursor.
            if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                saveClientCursor = true;
            }
            break;
        }
    }

    // TODO: Stage creation can set tailable depending on what's in the parsed query.  We have
    // the full parsed query available during planning...set it there.
    //
    // TODO: If we're tailable we want to save the client cursor.  Make sure we do this later.
    //if (pq.hasOption(QueryOption_CursorTailable) && pq.getNumToReturn() != 1) { ... }

    // TODO(greg): This will go away soon.
    if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
        // if the version changed during the query we might be missing some data and its safe
        // to send this as mongos can resend at this point
        throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                       shardingVersionAtStart,
                                       shardingState.getVersion(pq.ns()));
    }

    long long ccId = 0;
    if (saveClientCursor) {
        // We won't use the runner until it's getMore'd.  Save its state while the auto_ptr
        // is still the only owner, so nothing runs on the runner while it has two owners.
        runner->saveState();

        // Allocate a new ClientCursor.
        ClientCursorHolder ccHolder;
        ccHolder.reset(new ClientCursor(runner.get()));

        // ClientCursor takes ownership of runner.  Release immediately to make sure it's
        // not deleted twice if anything below throws.
        runner.release();

        ccId = ccHolder->cursorid();

        if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
            ccHolder->slaveReadTill(slaveReadTill);
        }

        if (pq.hasOption(QueryOption_Exhaust)) {
            curop.debug().exhaust = true;
        }

        // Set attributes for getMore.
        ccHolder->setCollMetadata(collMetadata);
        ccHolder->setPos(numResults);

        // If the query had a time limit, remaining time is "rolled over" to the cursor (for
        // use by future getmore ops).
        ccHolder->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());

        // Give up our reference to the CC.
        ccHolder.release();
    }

    // Add the results from the query into the output buffer.  After decouple() the builder
    // no longer frees the buffer.
    result.appendData(bb.buf(), bb.len());
    bb.decouple();

    // Fill out the output buffer's header.
    QueryResult* qr = static_cast<QueryResult*>(result.header());
    qr->cursorId = ccId;
    curop.debug().cursorid = (0 == ccId ? -1 : ccId);
    qr->setResultFlagsToOk();
    qr->setOperation(opReply);
    qr->startingFrom = 0;
    qr->nReturned = numResults;

    // TODO: nscanned is bogus.
    // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
    curop.debug().ntoskip = pq.getSkip();
    curop.debug().nreturned = numResults;

    // curop.debug().exhaust is set above.
    return curop.debug().exhaust ? pq.ns() : "";
}
/**
 * Performs the single _id lookup this runner exists for.
 *
 * The first call looks the query's _id up directly in the _id index; every later call
 * reports RUNNER_EOF (or RUNNER_DEAD once the runner has been killed).
 *
 * @param objOut  if non-NULL, receives the matching document, with the query's projection
 *                applied when there is one
 * @param dlOut   if non-NULL, receives the location of the matching document
 * @return RUNNER_ADVANCED when a document was found, RUNNER_EOF when there is no _id index,
 *         no match, or the match does not belong to this shard, RUNNER_DEAD when the runner
 *         was killed before the call or during a yield
 */
Runner::RunnerState IDHackRunner::getNext(BSONObj* objOut, DiskLoc* dlOut) {
    if (_killed) { return Runner::RUNNER_DEAD; }
    if (_done) { return Runner::RUNNER_EOF; }

    // Locate the _id index through the index catalog.
    IndexCatalog* indexCatalog = _collection->getIndexCatalog();
    const IndexDescriptor* idIndex = indexCatalog->findIdIndex();
    if (NULL == idIndex) {
        _done = true;
        return Runner::RUNNER_EOF;
    }

    BtreeBasedAccessMethod* idAccess = indexCatalog->getBtreeBasedIndex( idIndex );

    // Go straight to the Btree with the query's _id value.
    BSONObj idKey = _query->getQueryObj()["_id"].wrap();
    DiskLoc loc = idAccess->findSingle( idKey );

    // Whatever the outcome, there is only ever one lookup.
    _done = true;

    // Key not found.
    if (loc.isNull()) {
        return Runner::RUNNER_EOF;
    }

    // The document itself is only touched when the caller asked for it.
    if (NULL != objOut) {
        Record* rec = loc.rec();

        // If the record isn't in memory and we're allowed to yield ourselves, yield while
        // it is fetched.
        const bool notInMemory = !Record::likelyInPhysicalMemory(rec->dataNoThrowing());
        if (notInMemory && Runner::YIELD_AUTO == _policy) {
            // Note what we're yielding to fetch so that we don't crash if the loc is
            // deleted during a yield.
            _locFetching = loc;

            // Yield.  TODO: Do we want to bother yielding if micros < 0?
            int micros = ClientCursor::suggestYieldMicros();
            ClientCursor::staticYield(micros, "", rec);

            // This can happen when we're yielded for various reasons (e.g. db/idx dropped).
            if (_killed) {
                return Runner::RUNNER_DEAD;
            }
        }

        // Either the data was in memory or we paged it in.
        *objOut = loc.obj();

        // If we're sharded make sure the key belongs to us.  We need the object to do this.
        CollectionMetadataPtr shardMetadata;
        if (shardingState.needCollectionMetadata(_query->ns())) {
            shardMetadata = shardingState.getCollectionMetadata(_query->ns());
        }
        if (shardMetadata) {
            KeyPattern shardKeyPattern(shardMetadata->getKeyPattern());
            const bool mine =
                shardMetadata->keyBelongsToMe( shardKeyPattern.extractSingleKey(*objOut));
            if (!mine) {
                // We have something with a matching _id but it doesn't belong to me.
                return Runner::RUNNER_EOF;
            }
        }

        // If there is a projection, build an executor for it and transform the document
        // in place.
        if (NULL != _query->getProj()) {
            ProjectionExec projection(_query->getParsed().getProj(), _query->root());
            projection.transform(*objOut, objOut);
        }
    }

    // Return the DiskLoc if the caller wants it.
    if (NULL != dlOut) {
        *dlOut = loc;
    }

    return Runner::RUNNER_ADVANCED;
}
/**
 * Also called by db/ops/query.cpp.  This is the new getMore entry point.
 *
 * Looks up the cursor 'cursorid', restores its runner, pulls the next batch of results and
 * returns them in a newly built QueryResult message.
 *
 * @param ns                  namespace named in the getMore; must match the cursor's
 *                            namespace (uassert 17011 otherwise)
 * @param ntoreturn           maximum number of documents wanted in this batch; zero places
 *                            no count limit on the batch, which is then bounded only by
 *                            MaxBytesToReturnToClientAtOnce
 * @param cursorid            id of the cursor to continue
 * @param curop               current operation; receives the cursor's leftover time limit
 * @param pass                the slave location is only updated when this is zero
 * @param exhaust             set to whether the cursor's query has the exhaust option
 * @param isCursorAuthorized  set to true once the cursor was found and its namespace
 *                            matched; left untouched otherwise
 * @return the reply message; the buffer is decoupled from its builder before returning, so
 *         the builder no longer frees it.  When the cursor does not exist the reply carries
 *         ResultFlag_CursorNotFound and a zero cursor id.
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;

    // Reserve room for the reply header plus a full batch.
    int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;

    BufBuilder bb(bufSize);
    bb.skip(sizeof(QueryResult));

    // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
    Client::ReadContext ctx(ns);

    // TODO: Document.
    replVerifyReadsOk();

    ClientCursorPin ccPin(cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;

    int numResults = 0;
    int startingResult = 0;

    if (NULL == cc) {
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Quote: check for spoofing of the ns such that it does not match the one originally
        // there for the cursor
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        // TODO:
        // curop.debug().query = BSONForQuery
        // curop.setQuery(curop.debug().query);

        // TODO: What is pass?
        if (0 == pass) { cc->updateSlaveLocation(curop); }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        startingResult = cc->pos();

        Runner* runner = cc->getRunner();
        const ParsedQuery& pq = runner->getQuery().getParsed();

        // Get results out of the runner.
        // TODO: There may be special handling required for tailable cursors?
        runner->restoreState();
        BSONObj obj;
        // TODO: Differentiate EOF from error.
        while (runner->getNext(&obj)) {
            // If we're sharded make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                KeyPattern kp(collMetadata->getKeyPattern());
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) { continue; }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (pq.hasOption(QueryOption_OplogReplay)) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // Stop once the requested count is reached or the buffer is full.  The count
            // check is guarded by 'ntoreturn' itself: a zero 'ntoreturn' must not end the
            // batch after the very first document.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        cc->incPos(numResults);
        runner->saveState();

        // Possibly note slave's position in the oplog.
        if (pq.hasOption(QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
            cc->slaveReadTill(slaveReadTill);
        }

        exhaust = pq.hasOption(QueryOption_Exhaust);

        // If the getmore had a time limit, remaining time is "rolled over" back to the
        // cursor (for use by future getmore ops).
        cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
    }

    // The reply header lives in the space skipped at the front of the buffer.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    bb.decouple();
    return qr;
}
/**
 * Entry point for answering a query (per the original note, invoked from db/ops/query.cpp).
 *
 * Obtains a Runner for 'q', drains it into a reply buffer until the first batch is complete
 * (or, for explain, until enough documents have been counted), and, when more results may
 * follow, parks the runner in a ClientCursor so that getMore can resume it.
 *
 * 'm' is not used by this function.  'result' receives the reply: the QueryResult header
 * followed either by the returned documents or by a single explain document.
 *
 * Returns the query namespace when curop.debug().exhaust is set, otherwise "".
 * uasserts when no runner can be obtained (17007) or the runner reports an error (17144), and
 * throws SendStaleConfigException when the shard version changed while the query was running.
 */
string newRunQuery(Message& m, QueryMessage& q, CurOp& curop, Message &result) {
    // Read lock on the namespace for the duration of the query.
    Client::ReadContext ctx(q.ns, dbpath);

    // Turn the raw query message into a canonical query plus a runner that can execute it.
    Runner* rawRunner;
    CanonicalQuery* cq;
    Status runnerStatus = getRunner(q, &rawRunner, &cq);
    if (!runnerStatus.isOK()) {
        uasserted(17007, "Couldn't process query " + q.query.toString()
                         + " why: " + runnerStatus.reason());
    }
    verify(NULL != rawRunner);
    auto_ptr<Runner> runner(rawRunner);

    log() << "Running query on new system: " << cq->toString();

    // Remember the shard version we started with; it is compared again once the batch is built.
    const ChunkVersion shardingVersionAtStart = shardingState.getVersion(q.ns);

    // Shorthand for the parsed form of the query, used throughout.
    const LiteParsedQuery& pq = cq->getParsed();
    const bool isOplogReplay = pq.hasOption(QueryOption_OplogReplay);

    // TODO: Remove once this is implemented.
    if (isOplogReplay) {
        warning() << "haven't implemented findingstartcursor yet\n";
    }

    // Apply $maxTimeMS (given in milliseconds) to the operation.
    curop.setMaxTimeMicros(static_cast<unsigned long long>(pq.getMaxTimeMS()) * 1000);
    killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

    // uasserts unless this node may serve the read (primary, or secondary with SlaveOk).
    replVerifyReadsOk(&pq);

    // Non-empty only when the collection needs sharding metadata.  When it is set, documents
    // whose shard key does not belong to this shard are filtered out of the results below.
    CollectionMetadataPtr collMetadata;
    if (shardingState.needCollectionMetadata(pq.ns())) {
        collMetadata = shardingState.getCollectionMetadata(pq.ns());
    }

    // Reply buffer.  It ends up holding either the result documents or the explain document,
    // never both.  Space for the QueryResult header is reserved up front.
    BufBuilder bb(32768);
    bb.skip(sizeof(QueryResult));

    // Number of documents obtained from the runner that count towards the reply.
    int numResults = 0;

    // Last oplog timestamp seen, when replaying the oplog.
    OpTime slaveReadTill;

    // Whether the runner is handed to a ClientCursor for later getMore calls.
    bool saveClientCursor = false;

    // Register the runner with ClientCursor's active-runner list and enable auto-yielding.
    // The guard object deregisters it again even if the code below throws.
    ClientCursor::registerRunner(runner.get());
    runner->setYieldPolicy(Runner::YIELD_AUTO);
    auto_ptr<DeregisterEvenIfUnderlyingCodeThrows> deregisterGuard(
        new DeregisterEvenIfUnderlyingCodeThrows(runner.get()));

    // Needed both inside the loop and afterwards when producing explain output.
    const bool isExplain = pq.isExplain();

    BSONObj doc;
    Runner::RunnerState state;

    while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&doc, NULL))) {
        // Sharded case: drop documents whose key is not owned by this shard.
        if (collMetadata) {
            // The key pattern is re-read on every iteration because the information may
            // change across a yield.  This creates more BSONObjs than strictly necessary.
            KeyPattern shardKeyPattern(collMetadata->getKeyPattern());
            if (!collMetadata->keyBelongsToMe(shardKeyPattern.extractSingleKey(doc))) {
                continue;
            }
        }

        // Explain only counts documents; a normal query copies them into the reply.
        if (!isExplain) {
            bb.appendBuf((void*)doc.objdata(), doc.objsize());
        }

        ++numResults;

        // When replaying the oplog, track the timestamp of the latest document read.
        if (isOplogReplay) {
            BSONElement tsElt = doc["ts"];
            if (Date == tsElt.type() || Timestamp == tsElt.type()) {
                slaveReadTill = tsElt._opTime();
            }
        }

        // TODO: only one type of 2d search lacks getMore support; this should eventually be
        // derived from the CanonicalQuery instead of being hard-coded.
        const bool supportsGetMore = true;

        if (isExplain) {
            if (enoughForExplain(pq, numResults)) {
                break;
            }
        }
        else if (!supportsGetMore && (enough(pq, numResults)
                                      || bb.len() >= MaxBytesToReturnToClientAtOnce)) {
            break;
        }
        else if (enoughForFirstBatch(pq, numResults, bb.len())) {
            // A request for exactly one result is treated as a findOne(): no cursor is kept.
            if (pq.wantMore() && 1 != pq.getNumToReturn()) {
                saveClientCursor = true;
            }
            break;
        }
    }

    // The runner is deregistered in every case: a cached runner is notified through the
    // ClientCursor that holds it, and an uncached runner is about to be deleted.
    deregisterGuard.reset();

    // Callers expect an exception for runner errors, e.g. an in-memory sort exceeding its
    // RAM limit.
    if (Runner::RUNNER_ERROR == state) {
        uasserted(17144, "Runner error, memory limit for sort probably exceeded");
    }

    if (Runner::RUNNER_DEAD == state) {
        // There is no point in keeping a dead runner around.
        saveClientCursor = false;
    }
    else if (pq.hasOption(QueryOption_CursorTailable) && (1 != pq.getNumToReturn())) {
        // Tailable cursors are kept; the planner only produces a tailable collscan for them.
        saveClientCursor = true;
    }

    // TODO(greg): This will go away soon.
    if (!shardingState.getVersion(pq.ns()).isWriteCompatibleWith(shardingVersionAtStart)) {
        // The version moved while we were reading, so data may be missing.  It is safe to
        // report this now because mongos can still resend the query.
        throw SendStaleConfigException(pq.ns(), "version changed during initial query",
                                       shardingVersionAtStart,
                                       shardingState.getVersion(pq.ns()));
    }

    long long ccId = 0;
    if (saveClientCursor) {
        // The runner stays idle until the next getMore.
        runner->saveState();

        // The ClientCursor constructor registers the new object in a global map, so the raw
        // pointer is not leaked.
        ClientCursor* cursor = new ClientCursor(runner.get(), pq.getOptions(), pq.getFilter());
        ccId = cursor->cursorid();

        log() << "caching runner with cursorid " << ccId
              << " after returning " << numResults << " results" << endl;

        // Ownership of the runner now lies with the ClientCursor.
        runner.release();

        // Carry the oplog position over to the cursor.
        if (isOplogReplay && !slaveReadTill.isNull()) {
            cursor->slaveReadTill(slaveReadTill);
        }

        // Record that the client asked for exhaust mode.
        if (pq.hasOption(QueryOption_Exhaust)) {
            curop.debug().exhaust = true;
        }

        // State that getMore needs in order to continue.
        cursor->setCollMetadata(collMetadata);
        cursor->setPos(numResults);

        // Unused time budget of this query is rolled over to the cursor for later getMores.
        cursor->setLeftoverMaxTimeMicros(curop.getRemainingMaxTimeMicros());
    }

    // For explain, the reply consists of one document carrying the result count.
    if (isExplain) {
        BSONObjBuilder explainBuilder;
        explainBuilder.append("n", numResults);
        BSONObj explainObj = explainBuilder.done();
        bb.appendBuf((void*)explainObj.objdata(), explainObj.objsize());
        // The explain document itself is the single result.
        numResults = 1;
    }

    // Hand the buffer contents over to the outgoing message.
    result.appendData(bb.buf(), bb.len());
    bb.decouple();

    // Fill in the reply header.
    QueryResult* qr = static_cast<QueryResult*>(result.header());
    qr->cursorId = ccId;
    curop.debug().cursorid = (0 == ccId ? -1 : ccId);
    qr->setResultFlagsToOk();
    qr->setOperation(opReply);
    qr->startingFrom = 0;
    qr->nReturned = numResults;

    // TODO: nscanned is bogus.
    // curop.debug().nscanned = ( cursor ? cursor->nscanned() : 0LL );
    curop.debug().ntoskip = pq.getSkip();
    curop.debug().nreturned = numResults;

    // curop.debug().exhaust was possibly set above when the cursor was saved.
    if (curop.debug().exhaust) {
        return pq.ns();
    }
    return "";
}
/**
 * getMore entry point of the new query framework (per the original note, also called from
 * db/ops/query.cpp).
 *
 * Looks up the ClientCursor for 'cursorid', resumes its runner and copies up to 'ntoreturn'
 * documents (0 means "no explicit limit"; the batch is then bounded only by
 * MaxBytesToReturnToClientAtOnce) into a freshly built reply.
 *
 * @param ns                 namespace named by the client; must equal the cursor's namespace
 * @param ntoreturn          maximum number of documents for this batch, 0 for no limit
 * @param cursorid           id of the cursor to continue
 * @param curop              current operation; receives the cursor's leftover time limit
 * @param pass               0 on the first attempt; 'pass < 1000' bounds the awaitData retries
 *                           (presumably incremented by the caller on each retry -- confirm)
 * @param exhaust            out: set when the cursor carries QueryOption_Exhaust and is kept
 * @param isCursorAuthorized out: set to true once the namespace check has passed
 *
 * @return the reply buffer, decoupled from its builder (so presumably owned by the caller), or
 *         0 when a tailable awaitData cursor is at EOF without results and 'pass' is below
 *         1000.  If the cursor does not exist the reply carries ResultFlag_CursorNotFound and
 *         a cursor id of 0.
 */
QueryResult* newGetMore(const char* ns, int ntoreturn, long long cursorid, CurOp& curop,
                        int pass, bool& exhaust, bool* isCursorAuthorized) {
    exhaust = false;

    int bufSize = 512 + sizeof(QueryResult) + MaxBytesToReturnToClientAtOnce;
    BufBuilder bb(bufSize);
    bb.skip(sizeof(QueryResult));

    // This is a read lock.  TODO: There is a cursor flag for not needing this.  Do we care?
    Client::ReadContext ctx(ns);

    //log() << "running getMore in new system, cursorid " << cursorid << endl;

    // Checks that reads are allowed on this node.  No query object is passed, so the SlaveOK
    // option cannot be consulted; per the original note reads are then only allowed on a
    // PRIMARY (or master in master/slave).  uasserts if reads are not okay.
    replVerifyReadsOk();

    // The pin looks up the ClientCursor and, if found, marks it as in use so that it neither
    // times out nor gets deleted while we hold it.
    ClientCursorPin ccPin(cursorid);
    ClientCursor* cc = ccPin.c();

    // These are set in the QueryResult msg we return.
    int resultFlags = ResultFlag_AwaitCapable;
    int numResults = 0;
    int startingResult = 0;

    if (NULL == cc) {
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Guard against a spoofed namespace that differs from the one the cursor was
        // created for.
        uassert(17011, "auth error", str::equals(ns, cc->ns().c_str()));
        *isCursorAuthorized = true;

        // TODO: fail point?

        // If the operation that spawned this cursor had a time limit set, apply leftover
        // time to this getmore.
        curop.setMaxTimeMicros(cc->getLeftoverMaxTimeMicros());
        killCurrentOp.checkForInterrupt(); // May trigger maxTimeAlwaysTimeOut fail point.

        // TODO:
        // curop.debug().query = BSONForQuery
        // curop.setQuery(curop.debug().query);

        // Only the first pass updates the slave location.
        if (0 == pass) {
            cc->updateSlaveLocation(curop);
        }

        CollectionMetadataPtr collMetadata = cc->getCollMetadata();

        // If we're replaying the oplog, we save the last time that we read.
        OpTime slaveReadTill;

        // What number result are we starting at?  Used to fill out the reply.
        startingResult = cc->pos();

        // What gives us results.
        Runner* runner = cc->getRunner();
        const int queryOptions = cc->queryOptions();

        // Get results out of the runner.
        runner->restoreState();

        BSONObj obj;
        Runner::RunnerState state;
        while (Runner::RUNNER_ADVANCED == (state = runner->getNext(&obj, NULL))) {
            // If we're sharded make sure that we don't return any data that hasn't been
            // migrated off of our shard yet.
            if (collMetadata) {
                KeyPattern kp(collMetadata->getKeyPattern());
                if (!collMetadata->keyBelongsToMe(kp.extractSingleKey(obj))) {
                    continue;
                }
            }

            // Add result to output buffer.
            bb.appendBuf((void*)obj.objdata(), obj.objsize());

            // Count the result.
            ++numResults;

            // Possibly note slave's position in the oplog.
            if (queryOptions & QueryOption_OplogReplay) {
                BSONElement e = obj["ts"];
                if (Date == e.type() || Timestamp == e.type()) {
                    slaveReadTill = e._opTime();
                }
            }

            // Stop once the requested count is reached or the reply is large enough.  The
            // count limit only applies when the client asked for one: guarding on
            // 'numResults' (always >= 1 here) instead of 'ntoreturn' would end every
            // unlimited (ntoreturn == 0) batch after a single document.
            if ((ntoreturn && numResults >= ntoreturn)
                || bb.len() > MaxBytesToReturnToClientAtOnce) {
                break;
            }
        }

        if (Runner::RUNNER_EOF == state && 0 == numResults
            && (queryOptions & QueryOption_CursorTailable)
            && (queryOptions & QueryOption_AwaitData) && (pass < 1000)) {
            // If the cursor is tailable we don't kill it if it's eof.  We let it try to get
            // data some # of times first.
            return 0;
        }
        else if (Runner::RUNNER_DEAD == state || Runner::RUNNER_EOF == state) {
            ccPin.free();
            // cc is now invalid, as is the runner
            cursorid = 0;
            cc = NULL;
        }
        else {
            // Continue caching the ClientCursor.
            cc->incPos(numResults);
            runner->saveState();

            // Possibly note slave's position in the oplog.
            if ((queryOptions & QueryOption_OplogReplay) && !slaveReadTill.isNull()) {
                cc->slaveReadTill(slaveReadTill);
            }

            exhaust = (queryOptions & QueryOption_Exhaust);

            // If the getmore had a time limit, remaining time is "rolled over" back to the
            // cursor (for use by future getmore ops).
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the header at the front of the buffer and hand the buffer to the caller.
    QueryResult* qr = reinterpret_cast<QueryResult*>(bb.buf());
    qr->len = bb.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = startingResult;
    qr->nReturned = numResults;
    bb.decouple();
    return qr;
}
/**
 * Builds a runner for an already-canonicalized query and stores it in '*out'.
 *
 * Takes ownership of 'rawCanonicalQuery' in every case: it is either handed to the runner that
 * is created or destroyed when this function returns with an error.
 *
 * Depending on what is found, the runner is a CachedPlanRunner (plan cache hit), an EOFRunner
 * (collection does not exist), an IDHackRunner (_id lookup), a SingleSolutionRunner (exactly
 * one plan) or a MultiPlanRunner (several candidate plans).
 *
 * 'plannerOptions' is the initial set of QueryPlannerParams option bits; it is adjusted here
 * for the noTableScan setting and for sharding.
 *
 * Returns Status::OK() on success, or a BadValue status when a tailable cursor is requested
 * improperly or when the planner produces no solution.
 */
Status getRunner(CanonicalQuery* rawCanonicalQuery, Runner** out, size_t plannerOptions) {
    verify(rawCanonicalQuery);
    auto_ptr<CanonicalQuery> canonicalQuery(rawCanonicalQuery);

    // First choice: a solution remembered by the plan cache.
    // TODO: Can the cache have negative data about a solution?
    PlanCache* localCache = PlanCache::get(canonicalQuery->ns());
    CachedSolution* cachedSoln = (NULL != localCache) ? localCache->get(*canonicalQuery) : NULL;
    if (NULL != cachedSoln) {
        // The cached plan runner takes ownership of both the canonical query and the cached
        // solution.
        WorkingSet* cachedWs;
        PlanStage* cachedRoot;
        verify(StageBuilder::build(*cachedSoln->solution, &cachedRoot, &cachedWs));
        *out = new CachedPlanRunner(canonicalQuery.release(), cachedSoln, cachedRoot, cachedWs);
        return Status::OK();
    }

    // Nothing cached, so the query has to be planned here.  Start with the collection.
    Database* db = cc().database();
    verify( db );
    Collection* collection = db->getCollection( canonicalQuery->ns() );

    // A missing collection is legal (internal clients call this too) and yields no results.
    if (NULL == collection) {
        const string& ns = canonicalQuery->ns();
        *out = new EOFRunner(canonicalQuery.release(), ns);
        return Status::OK();
    }

    // Queries eligible for the idhack runner use it, provided an _id index exists.
    if (canUseIDHack(*canonicalQuery) && collection->getIndexCatalog()->findIdIndex()) {
        *out = new IDHackRunner(collection, canonicalQuery.release());
        return Status::OK();
    }

    // Describe every ready index to the planner.
    QueryPlannerParams plannerParams;
    for (int idx = 0; idx < collection->getIndexCatalog()->numIndexesReady(); ++idx) {
        IndexDescriptor* desc = collection->getIndexCatalog()->getDescriptor( idx );
        plannerParams.indices.push_back(IndexEntry(desc->keyPattern(),
                                                   desc->isMultikey(),
                                                   desc->isSparse(),
                                                   desc->indexName()));
    }

    // Tailable cursors require a capped collection and, if sorted at all, {$natural: 1}.
    if (canonicalQuery->getParsed().hasOption(QueryOption_CursorTailable)) {
        if (!collection->isCapped()) {
            return Status(ErrorCodes::BadValue,
                          "tailable cursor requested on non capped collection");
        }

        const BSONObj expectedSort = BSON("$natural" << 1);
        const BSONObj& actualSort = canonicalQuery->getParsed().getSort();
        const bool sortIsAcceptable = actualSort.isEmpty() || (actualSort == expectedSort);
        if (!sortIsAcceptable) {
            return Status(ErrorCodes::BadValue,
                          "invalid sort specified for tailable cursor: "
                          + actualSort.toString());
        }
    }

    // Start from the caller's planner options and refine them.
    plannerParams.options = plannerOptions;

    if (storageGlobalParams.noTableScan) {
        const string& ns = canonicalQuery->ns();
        // The noTableScan restriction is waived for empty queries, for ".system." namespaces
        // and for namespaces that start with "local.".
        const bool restrictionApplies = !canonicalQuery->getQueryObj().isEmpty()
                                        && (string::npos == ns.find(".system."))
                                        && (0 != ns.find("local."));
        if (restrictionApplies) {
            plannerParams.options |= QueryPlannerParams::NO_TABLE_SCAN;
        }
    }

    // Unless table scans are forbidden, a collection scan is always a candidate.
    if (!(plannerParams.options & QueryPlannerParams::NO_TABLE_SCAN)) {
        plannerParams.options |= QueryPlannerParams::INCLUDE_COLLSCAN;
    }

    // A shard filter only makes sense if sharding metadata (and thus a shard key) exists.
    if (plannerParams.options & QueryPlannerParams::INCLUDE_SHARD_FILTER) {
        CollectionMetadataPtr collMetadata =
            shardingState.getCollectionMetadata(canonicalQuery->ns());

        if (collMetadata) {
            plannerParams.shardKey = collMetadata->getKeyPattern();
        }
        else {
            // Without metadata the key pattern is unknown, so the filter is dropped.
            plannerParams.options &= ~QueryPlannerParams::INCLUDE_SHARD_FILTER;
        }
    }

    vector<QuerySolution*> solutions;
    QueryPlanner::plan(*canonicalQuery, plannerParams, &solutions);

    /*
    for (size_t i = 0; i < solutions.size(); ++i) {
        QLOG() << "solution " << i << " is " << solutions[i]->toString() << endl;
    }
    */

    // The planner could not come up with any way to answer the query.  Should this ever happen?
    if (solutions.empty()) {
        return Status(ErrorCodes::BadValue, "No query solutions");
    }

    if (1 == solutions.size()) {
        // A single candidate: build its stages and run it directly.
        WorkingSet* onlyWs;
        PlanStage* onlyRoot;
        verify(StageBuilder::build(*solutions[0], &onlyRoot, &onlyWs));

        *out = new SingleSolutionRunner(canonicalQuery.release(), solutions[0], onlyRoot, onlyWs);
        return Status::OK();
    }

    // Several candidates: the MultiPlanRunner picks the best, updates the cache, and so on.
    auto_ptr<MultiPlanRunner> multiPlanRunner(new MultiPlanRunner(canonicalQuery.release()));
    for (size_t solnIdx = 0; solnIdx < solutions.size(); ++solnIdx) {
        WorkingSet* candidateWs;
        PlanStage* candidateRoot;
        verify(StageBuilder::build(*solutions[solnIdx], &candidateRoot, &candidateWs));
        // addPlan takes ownership of all of its arguments.
        multiPlanRunner->addPlan(solutions[solnIdx], candidateRoot, candidateWs);
    }
    *out = multiPlanRunner.release();
    return Status::OK();
}
/**
 * Serves a getMore request.
 *
 * When the new query framework is enabled, or when the cursor turns out to wrap a Runner, the
 * request is forwarded to newGetMore().  Otherwise the legacy Cursor held by the ClientCursor
 * is advanced and matching documents are appended to the reply until 'ntoreturn' documents
 * have been produced (0 means no count limit) or the reply exceeds
 * MaxBytesToReturnToClientAtOnce.
 *
 * 'exhaust' is set from the cursor's QueryOption_Exhaust flag when the cursor survives the
 * call; '*isCursorAuthorized' is set to true once the namespace check has passed.
 *
 * Returns the reply buffer (decoupled from its builder), or 0 when a tailable awaitData cursor
 * has produced nothing and 'pass' is below 1000.  A missing cursor yields a reply flagged
 * ResultFlag_CursorNotFound with cursor id 0.
 */
QueryResult* processGetMore(const char* ns,
                            int ntoreturn,
                            long long cursorid,
                            CurOp& curop,
                            int pass,
                            bool& exhaust,
                            bool* isCursorAuthorized ) {
    if (isNewQueryFrameworkEnabled()) {
        return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust, isCursorAuthorized);
    }

    exhaust = false;

    // Reply buffer with room reserved for the QueryResult header.
    int bufSize = 512 + sizeof( QueryResult ) + MaxBytesToReturnToClientAtOnce;
    BufBuilder b( bufSize );
    b.skip(sizeof(QueryResult));

    int resultFlags = ResultFlag_AwaitCapable;
    int start = 0;
    int n = 0;

    scoped_ptr<Client::ReadContext> ctx(new Client::ReadContext(ns));
    // Checked while read-locked so that the state cannot change underneath us.
    replVerifyReadsOk();

    ClientCursorPin p(cursorid);
    ClientCursor* cc = p.c();

    if ( unlikely(!cc) ) {
        LOGSOME << "getMore: cursorid not found " << ns << " " << cursorid << endl;
        cursorid = 0;
        resultFlags = ResultFlag_CursorNotFound;
    }
    else {
        // Some internal users create a ClientCursor around a Runner.  Rather than crash,
        // release our pin and let the new framework serve the request.
        if (NULL != cc->getRunner()) {
            p.release();
            return newGetMore(ns, ntoreturn, cursorid, curop, pass, exhaust,
                              isCursorAuthorized);
        }

        // Reject a namespace that does not match the one the cursor was created for.
        uassert(14833, "auth error", str::equals(ns, cc->ns().c_str()));

        *isCursorAuthorized = true;

        // Must come after the auth check so that cleanup happens properly.
        uassert(16951, "failing getmore due to set failpoint",
                !MONGO_FAIL_POINT(getMoreError));

        // Time left over from the operation that created the cursor limits this getmore.
        curop.setMaxTimeMicros( cc->getLeftoverMaxTimeMicros() );

        if ( 0 == pass ) {
            cc->updateSlaveLocation( curop );
        }

        int queryOptions = cc->queryOptions();

        curop.debug().query = cc->query();
        curop.setQuery( cc->query() );

        start = cc->pos();
        Cursor* c = cc->c();

        if (!c->requiresLock()) {
            // Make sure the cursor cannot be destroyed under us, then drop the read lock.
            fassert(16952, !c->shouldDestroyOnNSDeletion());
            fassert(16953, !c->supportYields());
            ctx.reset(); // unlocks
        }

        c->recoverFromYield();
        DiskLoc last;

        // Possibly stale, but this is the chunk layout as of the cursor's creation.
        CollectionMetadataPtr metadata = cc->getCollMetadata();
        KeyPattern keyPattern( metadata ? metadata->getKeyPattern() : BSONObj() );

        for (;;) {
            if ( !c->ok() ) {
                if ( !c->tailable() ) {
                    // An ordinary cursor that ran out of data is destroyed.
                    p.release();
                    bool erased = ClientCursor::erase(cursorid);
                    verify(erased);
                    cursorid = 0;
                    cc = 0;
                    break;
                }

                // A tailable cursor at "EOF" reports !ok() and has no current document, but
                // advance() may be retried as a reactivation attempt; it returns true once
                // new data has arrived.
                if ( c->advance() ) {
                    continue;
                }

                if ( 0 == n && (queryOptions & QueryOption_AwaitData) && pass < 1000 ) {
                    return 0;
                }
                break;
            }

            MatchDetails details;
            if ( cc->fields && cc->fields->getArrayOpType() == Projection::ARRAY_OP_POSITIONAL ) {
                // The projection contains a positional array operator.
                details.requestElemMatchKey();
            }

            // In some cases (clone collection) there won't be a matcher.
            if ( c->currentMatches( &details ) ) {
                if ( metadata && !metadata->keyBelongsToMe( extractKey(c, keyPattern ) ) ) {
                    LOG(2) << "cursor skipping document in un-owned chunk: " << c->current()
                           << endl;
                }
                else if ( !c->getsetdup(c->currLoc()) ) {
                    // A matching document that has not been seen before.
                    last = c->currLoc();
                    n++;

                    // Emit the fields requested by the query.
                    const Projection::KeyOnly* keyOnlyProj = c->keyFieldsOnly();
                    if ( keyOnlyProj ) {
                        fillQueryResultFromObj( b, 0, keyOnlyProj->hydrate( c->currKey() ),
                                                &details );
                    }
                    else {
                        DiskLoc loc = c->currLoc();
                        fillQueryResultFromObj( b, cc->fields.get(), c->current(), &details,
                                                ( ( cc->pq.get() && cc->pq->showDiskLoc() )
                                                  ? &loc : 0 ) );
                    }

                    // Batch complete: step past the document just returned and stop.
                    if ( ( ntoreturn && n >= ntoreturn )
                         || b.len() > MaxBytesToReturnToClientAtOnce ) {
                        c->advance();
                        cc->incPos( n );
                        break;
                    }
                }
            }

            c->advance();

            // Give the cursor a chance to yield; a failed yield ends the getMore with the
            // cursor erased.
            if ( ! cc->yieldSometimes( ( c->ok() && c->keyFieldsOnly() ) ?
                                       ClientCursor::DontNeed : ClientCursor::WillNeed ) ) {
                ClientCursor::erase(cursorid);
                cursorid = 0;
                cc = 0;
                break;
            }
        }

        if ( cc ) {
            // The cursor lives on: save its position for the next getMore.
            if ( c->supportYields() ) {
                ClientCursor::YieldData data;
                verify( cc->prepareToYield( data ) );
            }
            else {
                cc->c()->noteLocation();
            }
            cc->storeOpForSlave( last );
            exhaust = cc->queryOptions() & QueryOption_Exhaust;

            // Remaining time of this getmore is rolled back over to the cursor for use by
            // future getmore operations.
            cc->setLeftoverMaxTimeMicros( curop.getRemainingMaxTimeMicros() );
        }
    }

    // Fill in the header at the front of the buffer and release the buffer to the caller.
    QueryResult* qr = reinterpret_cast<QueryResult*>(b.buf());
    qr->len = b.len();
    qr->setOperation(opReply);
    qr->_resultFlags() = resultFlags;
    qr->cursorId = cursorid;
    qr->startingFrom = start;
    qr->nReturned = n;
    b.decouple();

    return qr;
}