bool MultiPlanRunner::workAllPlans() { for (size_t i = 0; i < _candidates.size(); ++i) { CandidatePlan& candidate = _candidates[i]; WorkingSetID id; PlanStage::StageState state = candidate.root->work(&id); if (PlanStage::ADVANCED == state) { // Save result for later. candidate.results.push(id); } else if (PlanStage::NEED_TIME == state) { // Nothing to do here. } else if (PlanStage::NEED_FETCH == state) { // XXX: We can yield to do this. We have to deal with synchronization issues with // regards to the working set and invalidation. What if another thread invalidates // the thing we're fetching? The loc could vanish between hasLoc() and the actual // fetch... // id has a loc and refers to an obj we need to fetch. WorkingSetMember* member = candidate.ws->get(id); // This must be true for somebody to request a fetch and can only change when an // invalidation happens, which is when we give up a lock. Don't give up the // lock between receiving the NEED_FETCH and actually fetching(?). verify(member->hasLoc()); // Actually bring record into memory. Record* record = member->loc.rec(); record->touch(); // Record should be in memory now. Log if it's not. if (!Record::likelyInPhysicalMemory(record->dataNoThrowing())) { OCCASIONALLY { warning() << "Record wasn't in memory immediately after fetch: " << member->loc.toString() << endl; } } // Note that we're not freeing id. Fetch semantics say that we shouldn't. }
/**
 * Builds the sort key for 'member' and stores it in '*objOut'.
 *
 * The btree-style key is always computed first; a failure there is returned unchanged.  When
 * the sort has no metadata component that key is the result.  Otherwise the raw sort spec is
 * walked in order: numeric elements consume the next btree key element, and text-score meta
 * elements contribute the member's computed text score (0.0 when none was computed).
 */
Status SortStageKeyGenerator::getSortKey(const WorkingSetMember& member, BSONObj* objOut) const {
    BSONObj indexKey;
    const Status indexKeyStatus = getBtreeKey(member.obj, &indexKey);
    if (!indexKeyStatus.isOK()) {
        return indexKeyStatus;
    }

    if (_sortHasMeta) {
        // Interleave the btree key elements with the metadata values, following the order of
        // the elements in the raw sort spec.
        BSONObjBuilder keyBuilder;
        BSONObjIterator indexKeyIt(indexKey);
        for (BSONObjIterator specIt(_rawSortSpec); specIt.more();) {
            const BSONElement specElt = specIt.next();
            if (specElt.isNumber()) {
                // Ordinary sort field: take the next element of the btree key.
                keyBuilder.append(indexKeyIt.next());
                continue;
            }
            if (!LiteParsedQuery::isTextScoreMeta(specElt)) {
                // Neither a plain field nor text-score metadata: contributes nothing.
                continue;
            }

            // Text-score metadata: use the computed score when present, otherwise zero.
            double textScore = 0.0;
            if (member.hasComputed(WSM_COMPUTED_TEXT_SCORE)) {
                textScore = static_cast<const TextScoreComputedData*>(
                                member.getComputed(WSM_COMPUTED_TEXT_SCORE))
                                ->getScore();
            }
            keyBuilder.append("$metaTextScore", textScore);
        }
        *objOut = keyBuilder.obj();
    } else {
        *objOut = indexKey;
    }

    return Status::OK();
}
/**
 * Drains the working set's buffer of yield-sensitive ids.  When document-level locking is
 * supported, every still-allocated member that is in the RID_AND_IDX state is additionally
 * flagged as suspicious.
 */
void WorkingSetCommon::prepareForSnapshotChange(WorkingSet* workingSet) {
    const bool docLocking = supportsDocLocking();

    // The buffer is cleared in both modes: without doc locking invalidations are used instead,
    // so the ids need no inspection, but clearing keeps memory utilization in check.
    auto&& yieldSensitiveIds = workingSet->getAndClearYieldSensitiveIds();
    if (!docLocking) {
        return;
    }

    // The same member may appear more than once, so the work done here must be idempotent.
    for (auto memberId : yieldSensitiveIds) {
        if (!workingSet->isFree(memberId)) {
            WorkingSetMember* candidate = workingSet->get(memberId);
            if (WorkingSetMember::RID_AND_IDX == candidate->getState()) {
                candidate->isSuspicious = true;
            }
        }
    }
}
/**
 * Runs a non-tailable collection scan over 'collection' in the given 'direction' and appends
 * the RecordId of every result the scan produces to '*out'.
 */
void getRecordIds(Collection* collection,
                  CollectionScanParams::Direction direction,
                  vector<RecordId>* out) {
    CollectionScanParams scanParams;
    scanParams.collection = collection;
    scanParams.direction = direction;
    scanParams.tailable = false;

    WorkingSet workingSet;
    unique_ptr<CollectionScan> collScan(new CollectionScan(&_txn, scanParams, &workingSet, NULL));

    for (;;) {
        if (collScan->isEOF()) {
            break;
        }

        WorkingSetID resultId = WorkingSet::INVALID_ID;
        if (PlanStage::ADVANCED != collScan->work(&resultId)) {
            // Nothing was produced on this call; keep working the scan.
            continue;
        }

        WorkingSetMember* result = workingSet.get(resultId);
        verify(result->hasRecordId());
        out->push_back(result->recordId);
    }
}
void MergeSortStage::invalidate(const DiskLoc& dl, InvalidationType type) { ++_commonStats.invalidates; for (size_t i = 0; i < _children.size(); ++i) { _children[i]->invalidate(dl, type); } // Go through our data and see if we're holding on to the invalidated loc. for (list<StageWithValue>::iterator valueIt = _mergingData.begin(); valueIt != _mergingData.end(); valueIt++) { WorkingSetMember* member = _ws->get(valueIt->id); if (member->hasLoc() && (dl == member->loc)) { // Force a fetch and flag. We could possibly merge this result back in later. WorkingSetCommon::fetchAndInvalidateLoc(member, _collection); _ws->flagForReview(valueIt->id); ++_specificStats.forcedFetches; } } // If we see DL again it is not the same record as it once was so we still want to // return it. if (_dedup) { _seen.erase(dl); } }
/**
 * Handles invalidation of the RecordId 'dl': every buffered merge candidate that refers to
 * it is force-fetched, and on deletion (when deduplicating) the id is removed from the set
 * of already-seen ids.
 */
void MergeSortStage::doInvalidate(OperationContext* txn,
                                  const RecordId& dl,
                                  InvalidationType type) {
    // Scan the buffered merge candidates for the invalidated RecordId.
    for (const auto& buffered : _mergingData) {
        WorkingSetMember* bufferedMember = _ws->get(buffered.id);
        const bool refersToInvalidated =
            bufferedMember->hasRecordId() && (dl == bufferedMember->recordId);
        if (!refersToInvalidated) {
            continue;
        }

        // Fetch the result that is about to be mutated.
        WorkingSetCommon::fetchAndInvalidateRecordId(txn, bufferedMember, _collection);
        ++_specificStats.forcedFetches;
    }

    // A deleted RecordId that shows up again is not the same record as before, so it should
    // still be returned: drop it from the dedup set.
    const bool forgetSeen = _dedup && INVALIDATION_DELETION == type;
    if (forgetSeen) {
        _seen.erase(dl);
    }
}
void MultiPlanRunner::invalidate(const DiskLoc& dl, InvalidationType type) { if (_failure || _killed) { return; } if (NULL != _bestPlan) { _bestPlan->invalidate(dl, type); for (list<WorkingSetID>::iterator it = _alreadyProduced.begin(); it != _alreadyProduced.end();) { WorkingSetMember* member = _bestPlan->getWorkingSet()->get(*it); if (member->hasLoc() && member->loc == dl) { list<WorkingSetID>::iterator next = it; next++; WorkingSetCommon::fetchAndInvalidateLoc(member); _bestPlan->getWorkingSet()->flagForReview(*it); _alreadyProduced.erase(it); it = next; } else { it++; } } if (NULL != _backupPlan) { _backupPlan->invalidate(dl, type); for (list<WorkingSetID>::iterator it = _backupAlreadyProduced.begin(); it != _backupAlreadyProduced.end();) { WorkingSetMember* member = _backupPlan->getWorkingSet()->get(*it); if (member->hasLoc() && member->loc == dl) { list<WorkingSetID>::iterator next = it; next++; WorkingSetCommon::fetchAndInvalidateLoc(member); _backupPlan->getWorkingSet()->flagForReview(*it); _backupAlreadyProduced.erase(it); it = next; } else { it++; } } } } else { for (size_t i = 0; i < _candidates.size(); ++i) { _candidates[i].root->invalidate(dl, type); for (list<WorkingSetID>::iterator it = _candidates[i].results.begin(); it != _candidates[i].results.end();) { WorkingSetMember* member = _candidates[i].ws->get(*it); if (member->hasLoc() && member->loc == dl) { list<WorkingSetID>::iterator next = it; next++; WorkingSetCommon::fetchAndInvalidateLoc(member); _candidates[i].ws->flagForReview(*it); _candidates[i].results.erase(it); it = next; } else { it++; } } } } }
/**
 * Produces the next record from the remaining iterators.
 *
 * Iterators are consumed from the back of '_iterators'; an exhausted iterator is popped and
 * the next one is tried.  Returns:
 *  - DEAD (with a status member in '*out') when the collection pointer is null;
 *  - NEED_YIELD with '_wsidForFetch' when the current iterator hands back a fetcher;
 *  - NEED_YIELD with INVALID_ID when a WriteConflictException is thrown;
 *  - IS_EOF when no iterator has anything left;
 *  - ADVANCED with a newly allocated member holding the record otherwise.
 */
PlanStage::StageState MultiIteratorStage::work(WorkingSetID* out) {
    if (NULL == _collection) {
        Status nullCollStatus(ErrorCodes::InternalError,
                              "MultiIteratorStage died on null collection");
        *out = WorkingSetCommon::allocateStatusMember(_ws, nullCollStatus);
        return PlanStage::DEAD;
    }

    boost::optional<Record> nextRecord;
    try {
        // Each pass looks at the back iterator; it is only popped once it yields nothing.
        for (; !_iterators.empty(); _iterators.pop_back()) {
            auto& current = _iterators.back();

            if (auto fetcher = current->fetcherForNext()) {
                // Pass the RecordFetcher off up.
                WorkingSetMember* fetchMember = _ws->get(_wsidForFetch);
                fetchMember->setFetcher(fetcher.release());
                *out = _wsidForFetch;
                return PlanStage::NEED_YIELD;
            }

            nextRecord = current->next();
            if (nextRecord) {
                break;
            }
        }
    } catch (const WriteConflictException&) {
        // If advancing throws a WCE we shouldn't have moved.
        invariant(!_iterators.empty());
        *out = WorkingSet::INVALID_ID;
        return PlanStage::NEED_YIELD;
    }

    if (!nextRecord) {
        return PlanStage::IS_EOF;
    }

    const WorkingSetID resultId = _ws->allocate();
    *out = resultId;
    WorkingSetMember* result = _ws->get(resultId);
    result->loc = nextRecord->id;
    result->obj = {_txn->recoveryUnit()->getSnapshotId(), nextRecord->data.releaseToBson()};
    result->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ;
    return PlanStage::ADVANCED;
}
void run() { // Various variables we'll need. OldClientWriteContext ctx(&_txn, nss.ns()); Collection* coll = ctx.getCollection(); const BSONObj query = BSONObj(); const auto ws = make_unique<WorkingSet>(); const unique_ptr<CanonicalQuery> cq(canonicalize(query)); // Configure a QueuedDataStage to pass an OWNED_OBJ to the delete stage. auto qds = make_unique<QueuedDataStage>(&_txn, ws.get()); { WorkingSetID id = ws->allocate(); WorkingSetMember* member = ws->get(id); member->obj = Snapshotted<BSONObj>(SnapshotId(), fromjson("{x: 1}")); member->transitionToOwnedObj(); qds->pushBack(id); } // Configure the delete. DeleteStageParams deleteParams; deleteParams.isMulti = false; deleteParams.canonicalQuery = cq.get(); const auto deleteStage = make_unique<DeleteStage>(&_txn, deleteParams, ws.get(), coll, qds.release()); const DeleteStats* stats = static_cast<const DeleteStats*>(deleteStage->getSpecificStats()); // Call work, passing the set up member to the delete stage. WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState state = deleteStage->work(&id); // Should return NEED_TIME, not deleting anything. ASSERT_EQUALS(PlanStage::NEED_TIME, state); ASSERT_EQUALS(stats->docsDeleted, 0U); id = WorkingSet::INVALID_ID; state = deleteStage->work(&id); ASSERT_EQUALS(PlanStage::IS_EOF, state); }
/**
 * Works the child once and, when it advances, attaches the generated sort key to the
 * returned member as computed data.
 *
 * The very first call only constructs the sort key generator and returns NEED_TIME.  If key
 * generation fails, '*out' is replaced by a status member and FAILURE is returned.  Any other
 * child state is passed through after updating the common stats.
 */
PlanStage::StageState SortKeyGeneratorStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (!_sortKeyGen) {
        // Lazily build the generator; no child work is done on this call.
        _sortKeyGen = stdx::make_unique<SortKeyGenerator>(_collection, _sortSpec, _query);
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    const auto childState = child()->work(out);
    switch (childState) {
        case PlanStage::ADVANCED: {
            WorkingSetMember* advanced = _ws->get(*out);

            BSONObj generatedKey;
            const Status keyStatus = _sortKeyGen->getSortKey(*advanced, &generatedKey);
            if (!keyStatus.isOK()) {
                *out = WorkingSetCommon::allocateStatusMember(_ws, keyStatus);
                return PlanStage::FAILURE;
            }

            // Add the sort key to the WSM as computed data.
            advanced->addComputed(new SortKeyComputedData(generatedKey));
            return PlanStage::ADVANCED;
        }
        case PlanStage::IS_EOF:
            _commonStats.isEOF = true;
            break;
        case PlanStage::NEED_TIME:
            ++_commonStats.needTime;
            break;
        case PlanStage::NEED_YIELD:
            ++_commonStats.needYield;
            break;
        default:
            break;
    }

    return childState;
}
/**
 * Handles invalidation of the RecordId 'dl': forwards it to the covering stage of every
 * child interval, then force-fetches the buffered result (if any) recorded for 'dl' in
 * '_nextIntervalSeen' and removes its entry from that map.
 */
void NearStage::invalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) {
    ++_stats->common.invalidates;

    size_t intervalIdx = 0;
    while (intervalIdx < _childrenIntervals.size()) {
        _childrenIntervals[intervalIdx]->covering->invalidate(txn, dl, type);
        intervalIdx++;
    }

    // A result in _resultBuffer that has a RecordId is also in _nextIntervalSeen.  It is safe
    // to return the result without the RecordId, so fetching it is all that is needed.
    typedef unordered_map<RecordId, WorkingSetID, RecordId::Hasher>::iterator SeenIterator;
    SeenIterator found = _nextIntervalSeen.find(dl);
    if (found == _nextIntervalSeen.end()) {
        return;
    }

    WorkingSetMember* seenMember = _workingSet->get(found->second);
    verify(seenMember->hasLoc());
    WorkingSetCommon::fetchAndInvalidateLoc(txn, seenMember, _collection);
    verify(!seenMember->hasLoc());

    // There is no valid RecordId anymore, so the entry must not stay in the seen map.
    _nextIntervalSeen.erase(found);
}
void NearStage::invalidate(const DiskLoc& dl, InvalidationType type) { ++_stats->common.invalidates; if (_nextInterval) { _nextInterval->covering->invalidate(dl, type); } // If a result is in _resultBuffer and has a DiskLoc it will be in _nextIntervalSeen as // well. It's safe to return the result w/o the DiskLoc, so just fetch the result. unordered_map<DiskLoc, WorkingSetID, DiskLoc::Hasher>::iterator seenIt = _nextIntervalSeen .find(dl); if (seenIt != _nextIntervalSeen.end()) { WorkingSetMember* member = _workingSet->get(seenIt->second); verify(member->hasLoc()); WorkingSetCommon::fetchAndInvalidateLoc(member, _collection); verify(!member->hasLoc()); // Don't keep it around in the seen map since there's no valid DiskLoc anymore _nextIntervalSeen.erase(seenIt); } }
/**
 * Returns the next scored record, one entry of '_scores' per call.
 *
 * When the score iterator is exhausted the stage moves to the done state and reports IS_EOF.
 * Entries with a negative score are skipped with NEED_TIME.  Otherwise the text score is
 * attached to the entry's working set member as computed data and that member is returned
 * with ADVANCED.
 */
PlanStage::StageState TextOrStage::returnResults(WorkingSetID* out) {
    if (_scores.end() == _scoreIterator) {
        _internalState = State::kDone;
        return PlanStage::IS_EOF;
    }

    // Take a copy of the current entry and step the iterator past it.
    const TextRecordData current = _scoreIterator->second;
    ++_scoreIterator;

    if (current.score < 0) {
        // Non-matched document: nothing to return for it.
        invariant(current.wsid == WorkingSet::INVALID_ID);
        return PlanStage::NEED_TIME;
    }

    // Attach the text score to the member and hand the member back.
    WorkingSetMember* scored = _ws->get(current.wsid);
    scored->addComputed(new TextScoreComputedData(current.score));
    *out = current.wsid;
    return PlanStage::ADVANCED;
}
/**
 * Returns the next document as an owned-object working set member.
 *
 * Documents held in '_stash' are returned first, taken from the back; once the stash is
 * empty, documents come from getNextBson().  Returns FAILURE when 'out' is null and IS_EOF
 * when there is nothing left to return.
 */
PlanStage::StageState PipelineProxyStage::doWork(WorkingSetID* out) {
    if (!out) {
        return PlanStage::FAILURE;
    }

    // Pick the source of the next document: the stash has priority.
    const bool fromStash = !_stash.empty();
    boost::optional<BSONObj> nextDoc;
    if (fromStash) {
        nextDoc = _stash.back();
    } else {
        nextDoc = getNextBson();
    }

    if (!nextDoc) {
        return PlanStage::IS_EOF;
    }

    *out = _ws->allocate();
    WorkingSetMember* result = _ws->get(*out);
    result->obj = Snapshotted<BSONObj>(SnapshotId(), *nextDoc);
    if (fromStash) {
        // The stashed entry is only dropped once the member refers to the document.
        _stash.pop_back();
    }
    result->transitionToOwnedObj();
    return PlanStage::ADVANCED;
}
bool run(OperationContext* txn, const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result) { BSONElement first = cmdObj.firstElement(); uassert(28528, str::stream() << "Argument to listIndexes must be of type String, not " << typeName(first.type()), first.type() == String); StringData collectionName = first.valueStringData(); uassert(28529, str::stream() << "Argument to listIndexes must be a collection name, " << "not the empty string", !collectionName.empty()); const NamespaceString ns(dbname, collectionName); const long long defaultBatchSize = std::numeric_limits<long long>::max(); long long batchSize; Status parseCursorStatus = parseCommandCursorOptions(cmdObj, defaultBatchSize, &batchSize); if (!parseCursorStatus.isOK()) { return appendCommandStatus(result, parseCursorStatus); } AutoGetCollectionForRead autoColl(txn, ns); if (!autoColl.getDb()) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no database")); } const Collection* collection = autoColl.getCollection(); if (!collection) { return appendCommandStatus(result, Status(ErrorCodes::NamespaceNotFound, "no collection")); } const CollectionCatalogEntry* cce = collection->getCatalogEntry(); invariant(cce); vector<string> indexNames; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexNames.clear(); cce->getAllIndexes(txn, &indexNames); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); std::unique_ptr<WorkingSet> ws(new WorkingSet()); std::unique_ptr<QueuedDataStage> root(new QueuedDataStage(ws.get())); for (size_t i = 0; i < indexNames.size(); i++) { BSONObj indexSpec; MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN { indexSpec = cce->getIndexSpec(txn, indexNames[i]); } MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "listIndexes", ns.ns()); WorkingSetID id = ws->allocate(); WorkingSetMember* member = ws->get(id); member->keyData.clear(); member->loc = RecordId(); member->obj = Snapshotted<BSONObj>(SnapshotId(), indexSpec.getOwned()); 
member->transitionToOwnedObj(); root->pushBack(id); } std::string cursorNamespace = str::stream() << dbname << ".$cmd." << name << "." << ns.coll(); dassert(NamespaceString(cursorNamespace).isValid()); dassert(NamespaceString(cursorNamespace).isListIndexesCursorNS()); dassert(ns == NamespaceString(cursorNamespace).getTargetNSForListIndexes()); auto statusWithPlanExecutor = PlanExecutor::make( txn, std::move(ws), std::move(root), cursorNamespace, PlanExecutor::YIELD_MANUAL); if (!statusWithPlanExecutor.isOK()) { return appendCommandStatus(result, statusWithPlanExecutor.getStatus()); } std::unique_ptr<PlanExecutor> exec = std::move(statusWithPlanExecutor.getValue()); BSONArrayBuilder firstBatch; const int byteLimit = MaxBytesToReturnToClientAtOnce; for (long long objCount = 0; objCount < batchSize && firstBatch.len() < byteLimit; objCount++) { BSONObj next; PlanExecutor::ExecState state = exec->getNext(&next, NULL); if (state == PlanExecutor::IS_EOF) { break; } invariant(state == PlanExecutor::ADVANCED); firstBatch.append(next); } CursorId cursorId = 0LL; if (!exec->isEOF()) { exec->saveState(); ClientCursor* cursor = new ClientCursor( CursorManager::getGlobalCursorManager(), exec.release(), cursorNamespace); cursorId = cursor->cursorid(); } appendCursorResponseObject(cursorId, cursorNamespace, firstBatch.arr(), &result); return true; }
void run() { Client::WriteContext ctx(&_txn, ns()); Database* db = ctx.ctx().db(); Collection* coll = db->getCollection(&_txn, ns()); if (!coll) { coll = db->createCollection(&_txn, ns()); } WorkingSet ws; // Sort by foo:1 MergeSortStageParams msparams; msparams.pattern = BSON("foo" << 1); auto_ptr<MergeSortStage> ms(new MergeSortStage(msparams, &ws, coll)); IndexScanParams params; params.bounds.isSimpleRange = true; params.bounds.startKey = objWithMinKey(1); params.bounds.endKey = objWithMaxKey(1); params.bounds.endKeyInclusive = true; params.direction = 1; // Index 'a'+i has foo equal to 'i'. int numIndices = 20; for (int i = 0; i < numIndices; ++i) { // 'a', 'b', ... string index(1, 'a' + i); insert(BSON(index << 1 << "foo" << i)); BSONObj indexSpec = BSON(index << 1 << "foo" << 1); addIndex(indexSpec); params.descriptor = getIndex(indexSpec, coll); ms->addChild(new IndexScan(&_txn, params, &ws, NULL)); } set<DiskLoc> locs; getLocs(&locs, coll); set<DiskLoc>::iterator it = locs.begin(); ctx.commit(); // Get 10 results. Should be getting results in order of 'locs'. int count = 0; while (!ms->isEOF() && count < 10) { WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState status = ms->work(&id); if (PlanStage::ADVANCED != status) { continue; } WorkingSetMember* member = ws.get(id); ASSERT_EQUALS(member->loc, *it); BSONElement elt; string index(1, 'a' + count); ASSERT(member->getFieldDotted(index, &elt)); ASSERT_EQUALS(1, elt.numberInt()); ASSERT(member->getFieldDotted("foo", &elt)); ASSERT_EQUALS(count, elt.numberInt()); ++count; ++it; } // Invalidate locs[11]. Should force a fetch. We don't get it back. ms->prepareToYield(); ms->invalidate(*it, INVALIDATION_DELETION); ms->recoverFromYield(&_txn); // Make sure locs[11] was fetched for us. { // TODO: If we have "return upon invalidation" ever triggerable, do the following test. 
/* WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState status; do { status = ms->work(&id); } while (PlanStage::ADVANCED != status); WorkingSetMember* member = ws.get(id); ASSERT(!member->hasLoc()); ASSERT(member->hasObj()); string index(1, 'a' + count); BSONElement elt; ASSERT_TRUE(member->getFieldDotted(index, &elt)); ASSERT_EQUALS(1, elt.numberInt()); ASSERT(member->getFieldDotted("foo", &elt)); ASSERT_EQUALS(count, elt.numberInt()); */ ++it; ++count; } // And get the rest. while (!ms->isEOF()) { WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState status = ms->work(&id); if (PlanStage::ADVANCED != status) { continue; } WorkingSetMember* member = ws.get(id); ASSERT_EQUALS(member->loc, *it); BSONElement elt; string index(1, 'a' + count); ASSERT_TRUE(member->getFieldDotted(index, &elt)); ASSERT_EQUALS(1, elt.numberInt()); ASSERT(member->getFieldDotted("foo", &elt)); ASSERT_EQUALS(count, elt.numberInt()); ++count; ++it; } }
void run() { // Populate the collection. for (int i = 0; i < 50; ++i) { insert(BSON("_id" << i << "foo" << i)); } ASSERT_EQUALS(50U, count(BSONObj())); // Various variables we'll need. dbtests::WriteContextForTests ctx(&_opCtx, nss.ns()); OpDebug* opDebug = &CurOp::get(_opCtx)->debug(); Collection* coll = ctx.getCollection(); ASSERT(coll); UpdateRequest request(nss); const CollatorInterface* collator = nullptr; UpdateDriver driver(new ExpressionContext(&_opCtx, collator)); const int targetDocIndex = 10; const BSONObj query = BSON("foo" << BSON("$gte" << targetDocIndex)); const auto ws = make_unique<WorkingSet>(); const unique_ptr<CanonicalQuery> cq(canonicalize(query)); // Get the RecordIds that would be returned by an in-order scan. vector<RecordId> recordIds; getRecordIds(coll, CollectionScanParams::FORWARD, &recordIds); // Populate the request. request.setQuery(query); request.setUpdates(fromjson("{$set: {x: 0}}")); request.setSort(BSONObj()); request.setMulti(false); request.setReturnDocs(UpdateRequest::RETURN_NEW); const std::map<StringData, std::unique_ptr<ExpressionWithPlaceholder>> arrayFilters; ASSERT_DOES_NOT_THROW(driver.parse(request.getUpdates(), arrayFilters, request.isMulti())); // Configure a QueuedDataStage to pass the first object in the collection back in a // RID_AND_OBJ state. auto qds = make_unique<QueuedDataStage>(&_opCtx, ws.get()); WorkingSetID id = ws->allocate(); WorkingSetMember* member = ws->get(id); member->recordId = recordIds[targetDocIndex]; const BSONObj oldDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex); member->obj = Snapshotted<BSONObj>(SnapshotId(), oldDoc); ws->transitionToRecordIdAndObj(id); qds->pushBack(id); // Configure the update. UpdateStageParams updateParams(&request, &driver, opDebug); updateParams.canonicalQuery = cq.get(); auto updateStage = make_unique<UpdateStage>(&_opCtx, updateParams, ws.get(), coll, qds.release()); // Should return advanced. 
id = WorkingSet::INVALID_ID; PlanStage::StageState state = updateStage->work(&id); ASSERT_EQUALS(PlanStage::ADVANCED, state); // Make sure the returned value is what we expect it to be. // Should give us back a valid id. ASSERT_TRUE(WorkingSet::INVALID_ID != id); WorkingSetMember* resultMember = ws->get(id); // With an owned copy of the object, with no RecordId. ASSERT_TRUE(resultMember->hasOwnedObj()); ASSERT_FALSE(resultMember->hasRecordId()); ASSERT_EQUALS(resultMember->getState(), WorkingSetMember::OWNED_OBJ); ASSERT_TRUE(resultMember->obj.value().isOwned()); // Should be the new value. BSONObj newDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex << "x" << 0); ASSERT_BSONOBJ_EQ(resultMember->obj.value(), newDoc); // Should have done the update. vector<BSONObj> objs; getCollContents(coll, &objs); ASSERT_BSONOBJ_EQ(objs[targetDocIndex], newDoc); // That should be it. id = WorkingSet::INVALID_ID; ASSERT_EQUALS(PlanStage::IS_EOF, updateStage->work(&id)); }
/**
 * Works the child once and, if it advances, decides whether the returned document belongs
 * in the current annulus.
 *
 * When the child reaches EOF, the child and key filter are released and the radius increment
 * is adjusted based on how many results are buffered; NEED_TIME is returned so a new scan
 * can be made next time.  Non-ADVANCED child states are passed through.  For an advanced
 * member, the minimum distance from the query centroid to any of its geometries is computed;
 * if it falls within [_innerRadius, _outerRadius] (outer bound inclusive only when
 * _outerRadiusInclusive is set) the member is buffered in '_results', annotated with the
 * requested metadata, and recorded in '_invalidationMap'.  Returns FAILURE with a status
 * member when a geometry element read from the child is not an object; otherwise NEED_TIME.
 */
PlanStage::StageState S2NearStage::addResultToQueue(WorkingSetID* out) {
    const PlanStage::StageState childState = _child->work(out);

    if (PlanStage::IS_EOF == childState) {
        // All done reading from _child.
        _child.reset();
        _keyGeoFilter.reset();

        // Adjust the annulus size depending on how many results we got: grow it when there
        // are fewer than 300 (which includes none at all), shrink it when there are more
        // than 600.
        if (_results.size() < 300) {
            _radiusIncrement *= 2;
        } else if (_results.size() > 600) {
            _radiusIncrement /= 2;
        }

        // Make a new ixscan next time.
        return PlanStage::NEED_TIME;
    }

    if (PlanStage::ADVANCED != childState) {
        // Nothing to do unless the child advanced.
        return childState;
    }

    WorkingSetMember* member = _ws->get(*out);

    // Geometry can only be extracted from an object.
    verify(member->hasObj());

    const bool memberHasLoc = member->hasLoc();

    // The scans do not dedup, so it is done here.  A loc only enters _seenInScan once it is
    // known for sure whether or not it is returned in this annulus.
    if (memberHasLoc && _seenInScan.find(member->loc) != _seenInScan.end()) {
        return PlanStage::NEED_TIME;
    }

    // Collect every value of the queried field from the document.
    BSONElementSet geom;
    member->obj.getFieldsDotted(_params.nearQuery.field, geom, false);
    if (geom.empty()) {
        return PlanStage::NEED_TIME;
    }

    // Start from a value that any computable distance will be less than.
    double closestDistance = numeric_limits<double>::max();
    BSONObj closestGeometry;
    for (BSONElementSet::iterator geomIt = geom.begin(); geomIt != geom.end(); ++geomIt) {
        if (!geomIt->isABSONObj()) {
            mongoutils::str::stream ss;
            ss << "s2near stage read invalid geometry element " << *geomIt << " from child";
            Status status(ErrorCodes::InternalError, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            return PlanStage::FAILURE;
        }

        BSONObj geometry = geomIt->Obj();
        double distance;
        if (!S2SearchUtil::distanceBetween(
                _params.nearQuery.centroid.point, geometry, &distance)) {
            warning() << "unknown geometry: " << geometry.toString();
            continue;
        }

        if (distance < closestDistance) {
            closestDistance = distance;
            closestGeometry = geometry;
        }
    }

    // From here on the doc is either included in this annulus or rejected, so it is safe to
    // ignore it if it pops up again in this annulus.
    if (memberHasLoc) {
        _seenInScan.insert(member->loc);
    }

    // Only documents whose distance satisfies the annulus bounds are buffered.
    const bool inAnnulus = closestDistance >= _innerRadius &&
        (_outerRadiusInclusive ? closestDistance <= _outerRadius
                               : closestDistance < _outerRadius);
    if (!inAnnulus) {
        return PlanStage::NEED_TIME;
    }

    _results.push(Result(*out, closestDistance));

    if (_params.addDistMeta) {
        // For a FLAT centroid the reported distance is divided by kRadiusOfEarthInMeters.
        // NOTE(review): this looks like a meters-to-radians conversion, although the previous
        // comment here described it as radians-to-meters -- confirm the intended units.
        if (FLAT == _params.nearQuery.centroid.crs) {
            member->addComputed(
                new GeoDistanceComputedData(closestDistance / kRadiusOfEarthInMeters));
        } else {
            member->addComputed(new GeoDistanceComputedData(closestDistance));
        }
    }

    if (_params.addPointMeta) {
        member->addComputed(new GeoNearPointComputedData(closestGeometry));
    }

    if (memberHasLoc) {
        _invalidationMap[member->loc] = *out;
    }

    return PlanStage::NEED_TIME;
}
/**
 * Performs one unit of work for the collection scan.
 *
 * Returns:
 *  - DEAD (with a status member in '*out') when the scan is dead, or becomes dead because a
 *    tailable cursor position could not be restored;
 *  - IS_EOF when the scan is exhausted or the maxScan budget has been used up;
 *  - NEED_TIME on the call that creates the cursor;
 *  - NEED_YIELD with '_wsidForFetch' when the cursor hands back a fetcher, or with INVALID_ID
 *    when a WriteConflictException is thrown;
 *  - otherwise whatever returnIfMatches() returns for the newly read record.
 */
PlanStage::StageState CollectionScan::doWork(WorkingSetID* out) {
    if (_isDead) {
        Status deadStatus(
            ErrorCodes::CappedPositionLost,
            str::stream()
                << "CollectionScan died due to position in capped collection being deleted. "
                << "Last seen record id: "
                << _lastSeenId);
        *out = WorkingSetCommon::allocateStatusMember(_workingSet, deadStatus);
        return PlanStage::DEAD;
    }

    // A non-zero maxScan caps the number of documents tested.
    const bool maxScanReached =
        (0 != _params.maxScan) && (_specificStats.docsTested >= _params.maxScan);
    if (maxScanReached) {
        _commonStats.isEOF = true;
    }

    if (_commonStats.isEOF) {
        return PlanStage::IS_EOF;
    }

    boost::optional<Record> fetched;
    const bool cursorWasMissing = !_cursor;
    try {
        if (cursorWasMissing) {
            const bool scanForward = CollectionScanParams::FORWARD == _params.direction;
            _cursor = _params.collection->getCursor(getOpCtx(), scanForward);

            if (_lastSeenId.isNull()) {
                return PlanStage::NEED_TIME;
            }

            invariant(_params.tailable);

            // Seek to where we were last time.  If that record no longer exists, mark the
            // scan as dead: an error is signalled rather than silently dropping data from
            // the stream.  This is related to the _lastSeenId handling in invalidate.  The
            // record *after* this one is the next to return, since this one has already been
            // returned.  Needing a new cursor after a record was already read is only
            // possible in the tailing case.
            if (_cursor->seekExact(_lastSeenId)) {
                return PlanStage::NEED_TIME;
            }

            _isDead = true;
            Status lostStatus(ErrorCodes::CappedPositionLost,
                              str::stream() << "CollectionScan died due to failure to restore "
                                            << "tailable cursor position. "
                                            << "Last seen record id: "
                                            << _lastSeenId);
            *out = WorkingSetCommon::allocateStatusMember(_workingSet, lostStatus);
            return PlanStage::DEAD;
        }

        const bool seekToStart = _lastSeenId.isNull() && !_params.start.isNull();
        if (seekToStart) {
            fetched = _cursor->seekExact(_params.start);
        } else {
            // See if the record about to be accessed is in memory.  If not, pass a fetch
            // request up.
            if (auto fetcher = _cursor->fetcherForNext()) {
                // Pass the RecordFetcher up.
                WorkingSetMember* fetchMember = _workingSet->get(_wsidForFetch);
                fetchMember->setFetcher(fetcher.release());
                *out = _wsidForFetch;
                return PlanStage::NEED_YIELD;
            }

            fetched = _cursor->next();
        }
    } catch (const WriteConflictException&) {
        // Leave the scan in a state where it can try again next time.
        if (cursorWasMissing) {
            _cursor.reset();
        }
        *out = WorkingSet::INVALID_ID;
        return PlanStage::NEED_YIELD;
    }

    if (!fetched) {
        // EOF was just hit.  A tailable scan that has already returned data stays in a state
        // from which the next call to work() can pick up where it left off.  Otherwise EOF is
        // permanent.
        const bool canResumeLater = _params.tailable && !_lastSeenId.isNull();
        if (canResumeLater) {
            _cursor.reset();
        } else {
            _commonStats.isEOF = true;
        }
        return PlanStage::IS_EOF;
    }

    _lastSeenId = fetched->id;

    const WorkingSetID memberId = _workingSet->allocate();
    WorkingSetMember* scanned = _workingSet->get(memberId);
    scanned->recordId = fetched->id;
    scanned->obj = {getOpCtx()->recoveryUnit()->getSnapshotId(),
                    fetched->data.releaseToBson()};
    _workingSet->transitionToRecordIdAndObj(memberId);

    return returnIfMatches(scanned, memberId, out);
}
/** * addToBuffer() and sortBuffer() work differently based on the * configured limit. addToBuffer() is also responsible for * performing some accounting on the overall memory usage to * make sure we're not using too much memory. * * limit == 0: * addToBuffer() - Adds item to vector. * sortBuffer() - Sorts vector. * limit == 1: * addToBuffer() - Replaces first item in vector with max of * current and new item. * Updates memory usage if item was replaced. * sortBuffer() - Does nothing. * limit > 1: * addToBuffer() - Does not update vector. Adds item to set. * If size of set exceeds limit, remove item from set * with lowest key. Updates memory usage accordingly. * sortBuffer() - Copies items from set to vectors. */ void SortStage::addToBuffer(const SortableDataItem& item) { // Holds ID of working set member to be freed at end of this function. WorkingSetID wsidToFree = WorkingSet::INVALID_ID; WorkingSetMember* member = _ws->get(item.wsid); if (_limit == 0) { // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield. member->makeObjOwnedIfNeeded(); _data.push_back(item); _memUsage += member->getMemUsage(); } else if (_limit == 1) { if (_data.empty()) { member->makeObjOwnedIfNeeded(); _data.push_back(item); _memUsage = member->getMemUsage(); return; } wsidToFree = item.wsid; const WorkingSetComparator& cmp = *_sortKeyComparator; // Compare new item with existing item in vector. if (cmp(item, _data[0])) { wsidToFree = _data[0].wsid; member->makeObjOwnedIfNeeded(); _data[0] = item; _memUsage = member->getMemUsage(); } } else { // Update data item set instead of vector // Limit not reached - insert and return vector<SortableDataItem>::size_type limit(_limit); if (_dataSet->size() < limit) { member->makeObjOwnedIfNeeded(); _dataSet->insert(item); _memUsage += member->getMemUsage(); return; } // Limit will be exceeded - compare with item with lowest key // If new item does not have a lower key value than last item, // do nothing. 
wsidToFree = item.wsid; SortableDataItemSet::const_iterator lastItemIt = --(_dataSet->end()); const SortableDataItem& lastItem = *lastItemIt; const WorkingSetComparator& cmp = *_sortKeyComparator; if (cmp(item, lastItem)) { _memUsage -= _ws->get(lastItem.wsid)->getMemUsage(); _memUsage += member->getMemUsage(); wsidToFree = lastItem.wsid; // According to std::set iterator validity rules, // it does not matter which of erase()/insert() happens first. // Here, we choose to erase first to release potential resources // used by the last item and to keep the scope of the iterator to a minimum. _dataSet->erase(lastItemIt); member->makeObjOwnedIfNeeded(); _dataSet->insert(item); } } // If the working set ID is valid, remove from // RecordId invalidation map and free from working set. if (wsidToFree != WorkingSet::INVALID_ID) { WorkingSetMember* member = _ws->get(wsidToFree); if (member->hasLoc()) { _wsidByDiskLoc.erase(member->loc); } _ws->free(wsidToFree); } }
// Performs one unit of work for the delete stage.
//
// A call does one of the following:
//   - hands back a document whose delete already committed on an earlier call
//     ('_idReturning'), or
//   - takes one WorkingSetMember (the one saved in '_idRetrying', or a fresh one from the
//     child), re-validates it if the snapshot changed, deletes the record, and optionally
//     returns the deleted document.
//
// 'out' receives the deleted document's member id on ADVANCED, a status member id on
// FAILURE/DEAD, the child's id when the child asks to yield, and WorkingSet::INVALID_ID when
// this stage itself requests a yield after a write conflict.
PlanStage::StageState DeleteStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // isEOF() returning false implies that there is a collection to delete from.
    invariant(_collection);

    // A previous call may have committed a delete and then hit a WriteConflictException before
    // it could return ADVANCED with the old version of the document. Return that document now.
    if (WorkingSet::INVALID_ID != _idReturning) {
        // Only reachable if we were asked to return the deleted document.
        invariant(_params.returnDeleted);

        WorkingSetMember* pendingMember = _ws->get(_idReturning);
        invariant(pendingMember->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Pick the member to work on: the one we failed to delete last time, or a new one from
    // the child.
    WorkingSetID id;
    if (WorkingSet::INVALID_ID != _idRetrying) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        const auto childState = child()->work(&id);

        if (PlanStage::FAILURE == childState || PlanStage::DEAD == childState) {
            *out = id;
            // A failing stage may have allocated a status member describing the failure, in
            // which case 'id' is valid. Otherwise we produce our own error message.
            if (WorkingSet::INVALID_ID == id) {
                const std::string errmsg = "delete stage failed to read in results from child";
                *out = WorkingSetCommon::allocateStatusMember(
                    _ws, Status(ErrorCodes::InternalError, errmsg));
            }
            return childState;
        }

        if (PlanStage::NEED_YIELD == childState) {
            *out = id;
            return childState;
        }

        if (PlanStage::NEED_TIME == childState || PlanStage::IS_EOF == childState) {
            return childState;
        }

        if (PlanStage::ADVANCED != childState) {
            MONGO_UNREACHABLE;
        }
    }

    // Either the child advanced or we are retrying; 'id' names the member to delete.
    WorkingSetMember* member = _ws->get(id);

    // Free the member on every exit path unless a path explicitly dismisses the guard because
    // the member has to outlive this call.
    ScopeGuard freeMemberOnExit = MakeGuard(&WorkingSet::free, _ws, id);

    if (!member->hasRecordId()) {
        // Expected when an invalidation caused a force-fetch.
        ++_specificStats.nInvalidateSkips;
        return PlanStage::NEED_TIME;
    }
    RecordId recordId = member->recordId;

    // Deletes can't have projections, so covering analysis always adds a fetch: we must have
    // fetched data here, never just key data.
    invariant(member->hasObj());

    try {
        // If the snapshot changed, re-fetch the document and re-check that it still matches.
        std::unique_ptr<SeekableRecordCursor> cursor;
        const bool snapshotChanged =
            getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId();
        if (snapshotChanged) {
            cursor = _collection->getCursor(getOpCtx());

            if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                // The document is already gone. Nothing more to do.
                return PlanStage::NEED_TIME;
            }

            if (_params.canonicalQuery &&
                !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), nullptr)) {
                // The re-fetched document no longer matches the predicate.
                return PlanStage::NEED_TIME;
            }
        }

        // saveState() is allowed to free the memory backing the document, so take an owned
        // copy if we have to return it. The member stays in its current state in case the
        // delete needs to be retried.
        if (_params.returnDeleted) {
            const BSONObj ownedDoc = member->obj.value().getOwned();
            member->obj.setValue(ownedDoc);
        }

        // TODO: Do we want to buffer docs and delete them in a group rather than
        // saving/restoring state repeatedly?
        try {
            WorkingSetCommon::prepareForSnapshotChange(_ws);
            child()->saveState();
        } catch (const WriteConflictException&) {
            std::terminate();
        }

        // Perform the write, except when this is an explain.
        if (!_params.isExplain) {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(), recordId, _params.fromMigrate);
            wunit.commit();
        }

        ++_specificStats.docsDeleted;
    } catch (const WriteConflictException&) {
        // A findAndModify with a sort has a limit of 1 on the sort, so it produces no further
        // results even if another document matches. Re-throw so that such operations get
        // another chance to find a matching document; the findAndModify command should retry
        // automatically on a WCE.
        // TODO: this is not necessary if there was no sort specified.
        if (_params.returnDeleted) {
            throw;
        }

        // Keep the member alive so the delete can be retried on the next call.
        _idRetrying = id;
        freeMemberOnExit.Dismiss();
        *out = WorkingSet::INVALID_ID;
        return PlanStage::NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // The RecordId is no longer valid once the document is deleted, so strip it from the
        // member before the member is returned.
        member->recordId = RecordId();
        member->transitionToOwnedObj();
    }

    // restoreState() may recreate cursors, and cursors are tied to the transaction in which
    // they are created. A WriteUnitOfWork is a transaction, so restore outside of it.
    try {
        child()->restoreState();
    } catch (const WriteConflictException&) {
        // The delete already committed, so nothing needs to be retried. The deleted document
        // still has to be returned if it was requested.
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            // Keep the member alive so it can be returned by the next work() call.
            _idReturning = id;
            freeMemberOnExit.Dismiss();
        }
        *out = WorkingSet::INVALID_ID;
        return PlanStage::NEED_YIELD;
    }

    if (!_params.returnDeleted) {
        return PlanStage::NEED_TIME;
    }

    // member->obj should refer to the deleted document.
    invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

    // Keep the member alive: ownership passes to the caller through 'out'.
    freeMemberOnExit.Dismiss();
    *out = id;
    return PlanStage::ADVANCED;
}
// Returns one 2d-near result per call.
//
// The first call runs the entire geo search and buffers every result in '_results', recording
// each (DiskLoc, WorkingSetID) pair in '_invalidationMap'. Every call (including the first)
// then pops one buffered result, drops its invalidation-map entry, and returns it via 'out'.
PlanStage::StageState TwoDNear::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Accounts the time spent in work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (!_initted) {
        _initted = true;

        if (!_params.collection) {
            return PlanStage::IS_EOF;
        }

        IndexCatalog* catalog = _params.collection->getIndexCatalog();
        IndexDescriptor* descriptor = catalog->findIndexByKeyPattern(_params.indexKeyPattern);
        if (NULL == descriptor) {
            return PlanStage::IS_EOF;
        }

        TwoDAccessMethod* accessMethod =
            static_cast<TwoDAccessMethod*>(catalog->getIndex(descriptor));

        auto_ptr<twod_exec::GeoSearch> search(
            new twod_exec::GeoSearch(_params.collection,
                                     accessMethod,
                                     _params.nearQuery.centroid.oldPoint,
                                     _params.numWanted,
                                     _params.filter,
                                     _params.nearQuery.maxDistance,
                                     _params.nearQuery.isNearSphere ? twod_exec::GEO_SPHERE
                                                                    : twod_exec::GEO_PLANE));

        // This is where all the work is done. :(
        search->exec();

        _specificStats.objectsLoaded = search->_objectsLoaded;
        _specificStats.nscanned = search->_lookedAt;

        // Turn every point found by the search into a working set member.
        for (twod_exec::GeoHopper::Holder::iterator point = search->_points.begin();
             point != search->_points.end();
             ++point) {
            WorkingSetID id = _workingSet->allocate();
            WorkingSetMember* member = _workingSet->get(id);

            member->loc = point->_loc;
            member->obj = _params.collection->docFor(member->loc);
            member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ;

            if (_params.addDistMeta) {
                member->addComputed(new GeoDistanceComputedData(point->_distance));
            }
            if (_params.addPointMeta) {
                member->addComputed(new GeoNearPointComputedData(point->_pt));
            }

            _results.push(Result(id, point->_distance));
            _invalidationMap.insert(std::pair<DiskLoc, WorkingSetID>(point->_loc, id));
        }
    }

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    *out = _results.top().id;
    _results.pop();

    // The result is leaving this stage, so stop tracking it for invalidation. The member may
    // have been mutated or deleted in the meantime, in which case it has no loc to look up.
    WorkingSetMember* member = _workingSet->get(*out);
    if (member->hasLoc()) {
        typedef multimap<DiskLoc, WorkingSetID>::iterator MapIt;
        pair<MapIt, MapIt> sameLoc = _invalidationMap.equal_range(member->loc);

        MapIt entry = sameLoc.first;
        while (entry != sameLoc.second && entry->second != *out) {
            ++entry;
        }
        if (entry != sameLoc.second) {
            _invalidationMap.erase(entry);
        }
    }

    ++_commonStats.advanced;
    return PlanStage::ADVANCED;
}
// Works the current child once and forwards its result.
//
// ADVANCED results are optionally de-duplicated by DiskLoc and run through '_filter'; results
// that are dropped are freed and reported as NEED_TIME. When the current child hits EOF the
// stage moves on to the next child. All other child states are passed up unchanged.
PlanStage::StageState OrStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Lazily size the per-child match counters.
    if (_specificStats.matchTested.empty()) {
        _specificStats.matchTested.assign(_children.size(), 0);
    }

    WorkingSetID id;
    const StageState childStatus = _children[_currentChild]->work(&id);

    if (PlanStage::IS_EOF == childStatus) {
        // Finished with the current child; advance to the next one.
        ++_currentChild;

        // There may be no children left.
        if (isEOF()) {
            return PlanStage::IS_EOF;
        }

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    if (PlanStage::ADVANCED != childStatus) {
        // NEED_TIME, ERROR, NEED_YIELD and NEED_FETCH are passed up as they are.
        if (PlanStage::NEED_FETCH == childStatus) {
            *out = id;
            ++_commonStats.needFetch;
        } else if (PlanStage::NEED_TIME == childStatus) {
            ++_commonStats.needTime;
        }
        return childStatus;
    }

    // The child produced a result.
    WorkingSetMember* member = _ws->get(id);
    verify(member->hasLoc());

    if (_dedup) {
        ++_specificStats.dupsTested;

        // insert() reports whether the DiskLoc was new; if it was not, we've already
        // returned this document and drop the duplicate.
        const bool firstTimeSeen = _seen.insert(member->loc).second;
        if (!firstTimeSeen) {
            ++_specificStats.dupsDropped;
            _ws->free(id);
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
    }

    if (!Filter::passes(member, _filter)) {
        // Not a match; the caller should try again.
        _ws->free(id);
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    if (NULL != _filter) {
        ++_specificStats.matchTested[_currentChild];
    }

    // Match! Return it.
    *out = id;
    ++_commonStats.advanced;
    return PlanStage::ADVANCED;
}
// static Status WorkingSetCommon::getMemberStatus(const WorkingSetMember& member) { invariant(member.hasObj()); return getMemberObjectStatus(member.obj.value()); }
void run() { Lock::DBLock lk(&_opCtx, nsToDatabaseSubstring(ns()), MODE_X); OldClientContext ctx(&_opCtx, ns()); Database* db = ctx.db(); Collection* coll = db->getCollection(&_opCtx, ns()); if (!coll) { WriteUnitOfWork wuow(&_opCtx); coll = db->createCollection(&_opCtx, ns()); wuow.commit(); } WorkingSet ws; // Add an object to the DB. insert(BSON("foo" << 5)); set<RecordId> recordIds; getRecordIds(&recordIds, coll); ASSERT_EQUALS(size_t(1), recordIds.size()); // Create a mock stage that returns the WSM. auto mockStage = make_unique<QueuedDataStage>(&_opCtx, &ws); // Mock data. { WorkingSetID id = ws.allocate(); WorkingSetMember* mockMember = ws.get(id); mockMember->recordId = *recordIds.begin(); ws.transitionToRecordIdAndIdx(id); // State is RecordId and index, shouldn't be able to get the foo data inside. BSONElement elt; ASSERT_FALSE(mockMember->getFieldDotted("foo", &elt)); mockStage->pushBack(id); } // Make the filter. BSONObj filterObj = BSON("foo" << 6); const CollatorInterface* collator = nullptr; const boost::intrusive_ptr<ExpressionContext> expCtx( new ExpressionContext(&_opCtx, collator)); StatusWithMatchExpression statusWithMatcher = MatchExpressionParser::parse(filterObj, expCtx); verify(statusWithMatcher.isOK()); unique_ptr<MatchExpression> filterExpr = std::move(statusWithMatcher.getValue()); // Matcher requires that foo==6 but we only have data with foo==5. unique_ptr<FetchStage> fetchStage( new FetchStage(&_opCtx, &ws, mockStage.release(), filterExpr.get(), coll)); // First call should return a fetch request as it's not in memory. WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState state; // Normally we'd return the object but we have a filter that prevents it. state = fetchStage->work(&id); ASSERT_EQUALS(PlanStage::NEED_TIME, state); // No more data to fetch, so, EOF. state = fetchStage->work(&id); ASSERT_EQUALS(PlanStage::IS_EOF, state); }
bool MultiPlanStage::workAllPlans(size_t numResults, PlanYieldPolicy* yieldPolicy) { bool doneWorking = false; for (size_t ix = 0; ix < _candidates.size(); ++ix) { CandidatePlan& candidate = _candidates[ix]; if (candidate.failed) { continue; } // Might need to yield between calls to work due to the timer elapsing. if (!(tryYield(yieldPolicy)).isOK()) { return false; } WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState state = candidate.root->work(&id); if (PlanStage::ADVANCED == state) { // Save result for later. candidate.results.push_back(id); // Once a plan returns enough results, stop working. if (candidate.results.size() >= numResults) { doneWorking = true; } } else if (PlanStage::IS_EOF == state) { // First plan to hit EOF wins automatically. Stop evaluating other plans. // Assumes that the ranking will pick this plan. doneWorking = true; } else if (PlanStage::NEED_YIELD == state) { if (id == WorkingSet::INVALID_ID) { if (!yieldPolicy->allowedToYield()) throw WriteConflictException(); } else { WorkingSetMember* member = candidate.ws->get(id); invariant(member->hasFetcher()); // Transfer ownership of the fetcher and yield. _fetcher.reset(member->releaseFetcher()); } if (yieldPolicy->allowedToYield()) { yieldPolicy->forceYield(); } if (!(tryYield(yieldPolicy)).isOK()) { return false; } } else if (PlanStage::NEED_TIME != state) { // FAILURE or DEAD. Do we want to just tank that plan and try the rest? We // probably want to fail globally as this shouldn't happen anyway. candidate.failed = true; ++_failureCount; // Propagate most recent seen failure to parent. if (PlanStage::FAILURE == state) { _statusMemberId = id; } if (_failureCount == _candidates.size()) { _failure = true; return false; } } } return !doneWorking; }
// Runs the cached plan for a trial period and decides whether to keep it.
//
// The plan is kept (and the plan cache updated with this run's stats) if it produces enough
// results or reaches EOF within the work budget. It is replanned without caching on FAILURE,
// and replanned with caching if the budget is exhausted. A DEAD child, or a failed yield,
// ends the trial with the corresponding non-OK status.
//
// Throws WriteConflictException if the child requests a yield without a fetcher and the yield
// policy does not allow yielding.
Status CachedPlanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) {
    // A lot of execution work happens in here, so account the time spent to
    // executionTimeMillis for the time accounting to make sense.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    // Work budget for the trial: reaching it means the query is replanned from scratch.
    const size_t maxWorksBeforeReplan =
        static_cast<size_t>(internalQueryCacheEvictionRatio * _decisionWorks);

    // Number of results that ends the trial period without replanning.
    const size_t numResults = MultiPlanStage::getTrialPeriodNumToReturn(*_canonicalQuery);

    for (size_t worksDone = 0; worksDone < maxWorksBeforeReplan; ++worksDone) {
        // The yield timer may have elapsed between calls to work().
        Status timerYieldStatus = tryYield(yieldPolicy);
        if (!timerYieldStatus.isOK()) {
            return timerYieldStatus;
        }

        WorkingSetID id = WorkingSet::INVALID_ID;
        const PlanStage::StageState state = child()->work(&id);

        switch (state) {
            case PlanStage::ADVANCED: {
                // Buffer the result. Its BSONObj must be owned in case we yield.
                WorkingSetMember* member = _ws->get(id);
                member->makeObjOwnedIfNeeded();
                _results.push_back(id);

                if (_results.size() >= numResults) {
                    // The plan returned enough results: stop working, record this run's
                    // stats in the cache, and return.
                    updatePlanCache();
                    return Status::OK();
                }
                break;
            }

            case PlanStage::IS_EOF: {
                // The cached plan reached EOF quickly enough, so there is no need to replan.
                // Record this run's stats in the cache and return.
                updatePlanCache();
                return Status::OK();
            }

            case PlanStage::NEED_YIELD: {
                if (WorkingSet::INVALID_ID == id) {
                    if (!yieldPolicy->allowedToYield()) {
                        throw WriteConflictException();
                    }
                } else {
                    WorkingSetMember* member = _ws->get(id);
                    invariant(member->hasFetcher());
                    // Take ownership of the fetcher before yielding.
                    _fetcher.reset(member->releaseFetcher());
                }

                if (yieldPolicy->allowedToYield()) {
                    yieldPolicy->forceYield();
                }

                Status forcedYieldStatus = tryYield(yieldPolicy);
                if (!forcedYieldStatus.isOK()) {
                    return forcedYieldStatus;
                }
                break;
            }

            case PlanStage::FAILURE: {
                // Fall back to replanning the whole query. The existing cache entry is not
                // evicted and the result of replanning is not cached.
                BSONObj statusObj;
                WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj);

                LOG(1) << "Execution of cached plan failed, falling back to replan."
                       << " query: " << _canonicalQuery->toStringShort()
                       << " planSummary: " << Explain::getPlanSummary(child().get())
                       << " status: " << statusObj;

                const bool shouldCache = false;
                return replan(yieldPolicy, shouldCache);
            }

            case PlanStage::DEAD: {
                BSONObj statusObj;
                WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj);

                LOG(1) << "Execution of cached plan failed: PlanStage died"
                       << ", query: " << _canonicalQuery->toStringShort()
                       << " planSummary: " << Explain::getPlanSummary(child().get())
                       << " status: " << statusObj;

                return WorkingSetCommon::getMemberObjectStatus(statusObj);
            }

            default: {
                invariant(PlanStage::NEED_TIME == state);
                break;
            }
        }
    }

    // The trial period used up all 'maxWorksBeforeReplan' work cycles. The plan is taking too
    // long, so replan from scratch.
    LOG(1) << "Execution of cached plan required " << maxWorksBeforeReplan
           << " works, but was originally cached with only " << _decisionWorks
           << " works. Evicting cache entry and replanning query: "
           << _canonicalQuery->toStringShort()
           << " plan summary before replan: " << Explain::getPlanSummary(child().get());

    const bool shouldCache = true;
    return replan(yieldPolicy, shouldCache);
}
// Returns at most one index entry per call.
//
// Reads the (key, DiskLoc) pair the cursor currently points at, advances the cursor, and then
// applies optional de-duplication and the key filter. A surviving entry is copied into a newly
// allocated working set member in the LOC_AND_IDX state and returned via 'out'.
PlanStage::StageState IndexScan::work(WorkingSetID* out) {
    ++_commonStats.works;

    if (NULL == _indexCursor.get()) {
        // First call to work(): perform the possibly heavy initialization.
        initIndexScan();
        checkEnd();
    } else if (_yieldMovedCursor) {
        // Recovering from the yield already positioned us on the next entry, so there is no
        // next() call to make here.
        _yieldMovedCursor = false;
    }

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Read the current (key, value) pair from the index.
    BSONObj keyObj = _indexCursor->getKey();
    DiskLoc loc = _indexCursor->getValue();

    // Advance right away: the underlying cursor always points at the *next* thing to return,
    // so that a scan feeding deletes does not keep clobbering the entry it is positioned on.
    _indexCursor->next();
    checkEnd();

    if (_shouldDedup) {
        ++_specificStats.dupsTested;

        // insert() reports whether this DiskLoc had not been returned before.
        const bool firstTimeReturned = _returned.insert(loc).second;
        if (!firstTimeReturned) {
            ++_specificStats.dupsDropped;
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
    }

    if (!Filter::passes(keyObj, _keyPattern, _filter)) {
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    if (NULL != _filter) {
        ++_specificStats.matchTested;
    }

    // The on-disk key can mutate while this query executes, so work with a private copy.
    BSONObj ownedKeyObj = keyObj.getOwned();

    // Populate a new working set member with the location and the key.
    WorkingSetID id = _workingSet->allocate();
    WorkingSetMember* member = _workingSet->get(id);
    member->loc = loc;
    member->keyData.push_back(IndexKeyDatum(_keyPattern, ownedKeyObj));
    member->state = WorkingSetMember::LOC_AND_IDX;

    if (_params.addKeyMetadata) {
        BSONObjBuilder keyMetaBuilder;
        keyMetaBuilder.appendKeys(_keyPattern, ownedKeyObj);
        member->addComputed(new IndexKeyComputedData(keyMetaBuilder.obj()));
    }

    *out = id;
    ++_commonStats.advanced;
    return PlanStage::ADVANCED;
}
// Performs one unit of work for the group stage.
//
// The first call initializes group scripting. Subsequent calls read one result at a time from
// the child and feed it to processObject(). When the child reaches EOF the results are
// finalized and returned as a single owned object, after which the stage is done.
//
// 'out' receives:
//   - the id of the member holding the final result on ADVANCED,
//   - the id of a status member on FAILURE or DEAD (the child's, or one allocated here if the
//     child did not supply one),
//   - the child's id on NEED_YIELD.
PlanStage::StageState GroupStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Accounts the time spent in work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // On the first call to work(), call initGroupScripting().
    if (_groupState == GroupState_Initializing) {
        Status status = initGroupScripting();
        if (!status.isOK()) {
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            return PlanStage::FAILURE;
        }
        _groupState = GroupState_ReadingFromChild;
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    // Otherwise, read from our child.
    invariant(_groupState == GroupState_ReadingFromChild);
    WorkingSetID id = WorkingSet::INVALID_ID;
    StageState state = child()->work(&id);

    if (PlanStage::NEED_TIME == state) {
        ++_commonStats.needTime;
        return state;
    } else if (PlanStage::NEED_YIELD == state) {
        ++_commonStats.needYield;
        *out = id;
        return state;
    } else if (PlanStage::FAILURE == state || PlanStage::DEAD == state) {
        // Both error states must hand a status member to the caller; a caller that receives
        // DEAD reads the member named by 'out' just as it does for FAILURE.
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which
        // case 'id' is valid. If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "group stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(
                _ws, Status(ErrorCodes::InternalError, errmsg));
        }
        return state;
    } else if (PlanStage::ADVANCED == state) {
        WorkingSetMember* member = _ws->get(id);
        // Group queries can't have projections. This means that covering analysis will always
        // add a fetch. We should always get fetched data, and never just key data.
        invariant(member->hasObj());

        Status status = processObject(member->obj.value());
        if (!status.isOK()) {
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            return PlanStage::FAILURE;
        }

        // The document has been folded into the group state; the member is no longer needed.
        _ws->free(id);

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else {
        // We're done reading from our child.
        invariant(PlanStage::IS_EOF == state);

        auto results = finalizeResults();
        if (!results.isOK()) {
            *out = WorkingSetCommon::allocateStatusMember(_ws, results.getStatus());
            return PlanStage::FAILURE;
        }

        // Transition to state "done." Future calls to work() will return IS_EOF.
        _groupState = GroupState_Done;

        *out = _ws->allocate();
        WorkingSetMember* member = _ws->get(*out);
        member->obj = Snapshotted<BSONObj>(SnapshotId(), results.getValue());
        member->transitionToOwnedObj();

        ++_commonStats.advanced;
        return PlanStage::ADVANCED;
    }
}
// Performs one unit of work for the blocking sort stage.
//
// Until the child reaches EOF, each call pulls one result from the child and buffers it.
// When the child reports EOF the buffer is sorted. After that, each call returns the next
// buffered result in sorted order via 'out'.
//
// Fails with OperationFailed as soon as the buffered data exceeds
// 'internalQueryExecMaxBlockingSortBytes'.
PlanStage::StageState SortStage::doWork(WorkingSetID* out) {
    const size_t maxBytes = static_cast<size_t>(internalQueryExecMaxBlockingSortBytes);
    if (_memUsage > maxBytes) {
        mongoutils::str::stream ss;
        ss << "Sort operation used more than the maximum " << maxBytes
           << " bytes of RAM. Add an index, or specify a smaller limit.";
        Status status(ErrorCodes::OperationFailed, ss);
        *out = WorkingSetCommon::allocateStatusMember(_ws, status);
        return PlanStage::FAILURE;
    }

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    if (_sorted) {
        // Returning results.
        verify(_resultIterator != _data.end());
        verify(_sorted);

        *out = _resultIterator->wsid;
        ++_resultIterator;

        // This result is leaving the stage, so remove it from the DL -> WSID map; future
        // invalidations must not make us act on a DL we are done with.
        WorkingSetMember* returned = _ws->get(*out);
        if (returned->hasLoc()) {
            _wsidByDiskLoc.erase(returned->loc);
        }
        return PlanStage::ADVANCED;
    }

    // Still reading in results to sort.
    WorkingSetID id = WorkingSet::INVALID_ID;
    const StageState code = child()->work(&id);

    if (PlanStage::ADVANCED == code) {
        WorkingSetMember* member = _ws->get(id);

        // Planner must put a fetch before we get here.
        verify(member->hasObj());

        // A RecordId may be invalidated at any time (during a yield), so index the member by
        // its location for quick invalidation. It may have no location if it was already
        // invalidated at some point.
        if (member->hasLoc()) {
            _wsidByDiskLoc[member->loc] = id;
        }

        SortableDataItem item;
        item.wsid = id;

        // The sort key is read from the member's computed data, which must have been
        // generated by a SortKeyGeneratorStage descendent in the execution tree.
        auto sortKeyComputedData =
            static_cast<const SortKeyComputedData*>(member->getComputed(WSM_SORT_KEY));
        item.sortKey = sortKeyComputedData->getSortKey();

        // The RecordId breaks ties when two members have the same sort key.
        if (member->hasLoc()) {
            item.loc = member->loc;
        }

        addToBuffer(item);
        return PlanStage::NEED_TIME;
    }

    if (PlanStage::IS_EOF == code) {
        // TODO: We don't need the lock for this. We could ask for a yield and do this work
        // unlocked. Also, this is performing a lot of work for one call to work(...)
        sortBuffer();
        _resultIterator = _data.begin();
        _sorted = true;
        return PlanStage::NEED_TIME;
    }

    if (PlanStage::FAILURE == code || PlanStage::DEAD == code) {
        *out = id;
        // A failing stage may have created a status WSM saying why it failed, in which case
        // 'id' is valid. If it is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            mongoutils::str::stream ss;
            ss << "sort stage failed to read in results to sort from child";
            Status status(ErrorCodes::InternalError, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
        }
        return code;
    }

    if (PlanStage::NEED_YIELD == code) {
        *out = id;
    }

    // NEED_YIELD, NEED_TIME and anything else are passed up unchanged.
    return code;
}