// static
bool WorkingSetCommon::fetch(OperationContext* txn,
                             WorkingSet* workingSet,
                             WorkingSetID id,
                             unowned_ptr<SeekableRecordCursor> cursor) {
    WorkingSetMember* member = workingSet->get(id);

    // The RecordFetcher should already have been transferred out of the WSM and used.
    invariant(!member->hasFetcher());

    // We should have a RecordId but need to retrieve the obj. Get the obj now and reset all WSM
    // state appropriately.
    invariant(member->hasRecordId());
    member->obj.reset();
    auto record = cursor->seekExact(member->recordId);
    if (!record) {
        return false;
    }

    member->obj = {txn->recoveryUnit()->getSnapshotId(), record->data.releaseToBson()};

    if (member->isSuspicious) {
        // Make sure that all of the keyData is still valid for this copy of the document.
        // This ensures both that index-provided filters and sort orders still hold.
        // TODO provide a way for the query planner to opt out of this checking if it is
        // unneeded due to the structure of the plan.
        invariant(!member->keyData.empty());
        for (size_t i = 0; i < member->keyData.size(); i++) {
            BSONObjSet keys;
            // There's no need to compute the prefixes of the indexed fields that cause the
            // index to be multikey when ensuring the keyData is still valid.
            MultikeyPaths* multikeyPaths = nullptr;
            member->keyData[i].index->getKeys(member->obj.value(), &keys, multikeyPaths);
            if (!keys.count(member->keyData[i].keyData)) {
                // document would no longer be at this position in the index.
                return false;
            }
        }

        member->isSuspicious = false;
    }

    member->keyData.clear();
    workingSet->transitionToRecordIdAndObj(id);
    return true;
}
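
// A minimal sketch (not from the MongoDB tree) of how a stage might consume
// WorkingSetCommon::fetch() above, assuming the usual stage members (_txn, _ws,
// _cursor). 'HypotheticalFetchStage' and 'completeFetch' are invented names.
// The key point: a false return means the record vanished, or its index entry
// no longer matches, while we were yielded; the caller frees the member and
// keeps working rather than surfacing an error.
PlanStage::StageState HypotheticalFetchStage::completeFetch(WorkingSetID id, WorkingSetID* out) {
    if (!WorkingSetCommon::fetch(_txn, _ws, id, _cursor)) {
        // The document disappeared (or moved in the index) during the yield.
        // Drop it and ask the parent for more time; this is not an error.
        _ws->free(id);
        return PlanStage::NEED_TIME;
    }
    // fetch() has reset the member into the RecordId-and-obj state; surface it.
    *out = id;
    return PlanStage::ADVANCED;
}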
Status CachedPlanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) {
    // Adds the amount of time taken by pickBestPlan() to executionTimeMillis. There's lots of
    // execution work that happens here, so this is needed for the time accounting to
    // make sense.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    // If we work this many times during the trial period, then we will replan the
    // query from scratch.
    size_t maxWorksBeforeReplan =
        static_cast<size_t>(internalQueryCacheEvictionRatio * _decisionWorks);

    // The trial period ends without replanning if the cached plan produces this many results.
    size_t numResults = MultiPlanStage::getTrialPeriodNumToReturn(*_canonicalQuery);

    for (size_t i = 0; i < maxWorksBeforeReplan; ++i) {
        // Might need to yield between calls to work due to the timer elapsing.
        Status yieldStatus = tryYield(yieldPolicy);
        if (!yieldStatus.isOK()) {
            return yieldStatus;
        }

        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState state = child()->work(&id);

        if (PlanStage::ADVANCED == state) {
            // Save result for later.
            WorkingSetMember* member = _ws->get(id);
            // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield.
            member->makeObjOwnedIfNeeded();
            _results.push_back(id);

            if (_results.size() >= numResults) {
                // Once a plan returns enough results, stop working. Update cache with stats
                // from this run and return.
                updatePlanCache();
                return Status::OK();
            }
        } else if (PlanStage::IS_EOF == state) {
            // Cached plan hit EOF quickly enough. No need to replan. Update cache with stats
            // from this run and return.
            updatePlanCache();
            return Status::OK();
        } else if (PlanStage::NEED_YIELD == state) {
            if (id == WorkingSet::INVALID_ID) {
                if (!yieldPolicy->allowedToYield()) {
                    throw WriteConflictException();
                }
            } else {
                WorkingSetMember* member = _ws->get(id);
                invariant(member->hasFetcher());
                // Transfer ownership of the fetcher and yield.
                _fetcher.reset(member->releaseFetcher());
            }

            if (yieldPolicy->allowedToYield()) {
                yieldPolicy->forceYield();
            }

            Status yieldStatus = tryYield(yieldPolicy);
            if (!yieldStatus.isOK()) {
                return yieldStatus;
            }
        } else if (PlanStage::FAILURE == state) {
            // On failure, fall back to replanning the whole query. We neither evict the
            // existing cache entry nor cache the result of replanning.
            BSONObj statusObj;
            WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj);

            LOG(1) << "Execution of cached plan failed, falling back to replan."
                   << " query: " << _canonicalQuery->toStringShort()
                   << " planSummary: " << Explain::getPlanSummary(child().get())
                   << " status: " << statusObj;

            const bool shouldCache = false;
            return replan(yieldPolicy, shouldCache);
        } else if (PlanStage::DEAD == state) {
            BSONObj statusObj;
            WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj);

            LOG(1) << "Execution of cached plan failed: PlanStage died"
                   << ", query: " << _canonicalQuery->toStringShort()
                   << " planSummary: " << Explain::getPlanSummary(child().get())
                   << " status: " << statusObj;

            return WorkingSetCommon::getMemberObjectStatus(statusObj);
        } else {
            invariant(PlanStage::NEED_TIME == state);
        }
    }

    // If we're here, the trial period took more than 'maxWorksBeforeReplan' work cycles. This
    // plan is taking too long, so we replan from scratch.
    LOG(1) << "Execution of cached plan required " << maxWorksBeforeReplan
           << " works, but was originally cached with only " << _decisionWorks
           << " works. Evicting cache entry and replanning query: "
           << _canonicalQuery->toStringShort()
           << " plan summary before replan: " << Explain::getPlanSummary(child().get());

    const bool shouldCache = true;
    return replan(yieldPolicy, shouldCache);
}
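
// Worked example of the replan trigger in pickBestPlan() above: the trial
// budget is internalQueryCacheEvictionRatio * _decisionWorks, so a plan cached
// after 100 works gets (with an assumed ratio of 10.0) a budget of 1000 works
// before the entry is evicted and the query replanned. A standalone sketch:
#include <cassert>
#include <cstddef>

static std::size_t trialBudgetSketch(double evictionRatio, std::size_t decisionWorks) {
    // Mirrors: static_cast<size_t>(internalQueryCacheEvictionRatio * _decisionWorks)
    return static_cast<std::size_t>(evictionRatio * decisionWorks);
}

int main() {
    assert(trialBudgetSketch(10.0, 100) == 1000);  // cheap original plan, generous budget
    assert(trialBudgetSketch(10.0, 7) == 70);      // tiny plans get tiny budgets
    return 0;
}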
bool MultiPlanStage::workAllPlans(size_t numResults, PlanYieldPolicy* yieldPolicy) {
    bool doneWorking = false;

    for (size_t ix = 0; ix < _candidates.size(); ++ix) {
        CandidatePlan& candidate = _candidates[ix];
        if (candidate.failed) {
            continue;
        }

        // Might need to yield between calls to work due to the timer elapsing.
        if (!(tryYield(yieldPolicy)).isOK()) {
            return false;
        }

        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState state = candidate.root->work(&id);

        if (PlanStage::ADVANCED == state) {
            // Save result for later.
            candidate.results.push_back(id);

            // Once a plan returns enough results, stop working.
            if (candidate.results.size() >= numResults) {
                doneWorking = true;
            }
        } else if (PlanStage::IS_EOF == state) {
            // First plan to hit EOF wins automatically. Stop evaluating other plans.
            // Assumes that the ranking will pick this plan.
            doneWorking = true;
        } else if (PlanStage::NEED_YIELD == state) {
            if (id == WorkingSet::INVALID_ID) {
                if (!yieldPolicy->allowedToYield())
                    throw WriteConflictException();
            } else {
                WorkingSetMember* member = candidate.ws->get(id);
                invariant(member->hasFetcher());
                // Transfer ownership of the fetcher and yield.
                _fetcher.reset(member->releaseFetcher());
            }

            if (yieldPolicy->allowedToYield()) {
                yieldPolicy->forceYield();
            }

            if (!(tryYield(yieldPolicy)).isOK()) {
                return false;
            }
        } else if (PlanStage::NEED_TIME != state) {
            // FAILURE or DEAD. Do we want to just tank that plan and try the rest? We
            // probably want to fail globally as this shouldn't happen anyway.
            candidate.failed = true;
            ++_failureCount;

            // Propagate most recent seen failure to parent.
            if (PlanStage::FAILURE == state) {
                _statusMemberId = id;
            }

            if (_failureCount == _candidates.size()) {
                _failure = true;
                return false;
            }
        }
    }

    return !doneWorking;
}
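
// A condensed, hypothetical rendering (not verbatim source) of the loop that
// drives workAllPlans() above, in the spirit of MultiPlanStage::pickBestPlan.
// It assumes the getTrialPeriodWorks()/getTrialPeriodNumToReturn() helpers from
// this revision; 'pickBestPlanSketch' is an invented name. Each pass gives
// every surviving candidate exactly one work() call, so plans compete on equal
// footing until one produces numResults, hits EOF, or all of them fail.
Status MultiPlanStage::pickBestPlanSketch(PlanYieldPolicy* yieldPolicy) {
    size_t numWorks = getTrialPeriodWorks(getOpCtx(), _collection);  // work budget per plan
    size_t numResults = getTrialPeriodNumToReturn(*_query);          // result target

    for (size_t ix = 0; ix < numWorks; ++ix) {
        // workAllPlans() returns false once the trial can stop early (enough
        // results, an EOF winner, a yield error, or universal failure).
        if (!workAllPlans(numResults, yieldPolicy)) {
            break;
        }
    }

    // Ranking of the candidates' results happens after the trial loop.
    return Status::OK();
}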
PlanExecutor::ExecState PlanExecutor::getNextImpl(Snapshotted<BSONObj>* objOut, RecordId* dlOut) {
    if (MONGO_FAIL_POINT(planExecutorAlwaysFails)) {
        Status status(ErrorCodes::OperationFailed,
                      str::stream() << "PlanExecutor hit planExecutorAlwaysFails fail point");
        *objOut =
            Snapshotted<BSONObj>(SnapshotId(), WorkingSetCommon::buildMemberStatusObject(status));

        return PlanExecutor::FAILURE;
    }

    invariant(_currentState == kUsable);
    if (isMarkedAsKilled()) {
        if (NULL != objOut) {
            Status status(ErrorCodes::OperationFailed,
                          str::stream() << "Operation aborted because: " << *_killReason);
            *objOut = Snapshotted<BSONObj>(SnapshotId(),
                                           WorkingSetCommon::buildMemberStatusObject(status));
        }
        return PlanExecutor::DEAD;
    }

    if (!_stash.empty()) {
        invariant(objOut && !dlOut);
        *objOut = {SnapshotId(), _stash.front()};
        _stash.pop();
        return PlanExecutor::ADVANCED;
    }

    // When a stage requests a yield for document fetch, it gives us back a RecordFetcher*
    // to use to pull the record into memory. We take ownership of the RecordFetcher here,
    // deleting it after we've had a chance to do the fetch. For timing-based yields, we
    // just pass a NULL fetcher.
    unique_ptr<RecordFetcher> fetcher;

    // Incremented on every writeConflict, reset to 0 on any successful call to _root->work.
    size_t writeConflictsInARow = 0;

    for (;;) {
        // These are the conditions which can cause us to yield:
        // 1) The yield policy's timer elapsed, or
        // 2) some stage requested a yield due to a document fetch, or
        // 3) we need to yield and retry due to a WriteConflictException.
        // In all cases, the actual yielding happens here.
        if (_yieldPolicy->shouldYield()) {
            if (!_yieldPolicy->yield(fetcher.get())) {
                // A return of false from a yield should only happen if we've been killed
                // during the yield.
                invariant(isMarkedAsKilled());

                if (NULL != objOut) {
                    Status status(ErrorCodes::OperationFailed,
                                  str::stream() << "Operation aborted because: " << *_killReason);
                    *objOut = Snapshotted<BSONObj>(
                        SnapshotId(), WorkingSetCommon::buildMemberStatusObject(status));
                }
                return PlanExecutor::DEAD;
            }
        }

        // We're done using the fetcher, so it should be freed. We don't want to
        // use the same RecordFetcher twice.
        fetcher.reset();

        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState code = _root->work(&id);

        if (code != PlanStage::NEED_YIELD)
            writeConflictsInARow = 0;

        if (PlanStage::ADVANCED == code) {
            WorkingSetMember* member = _workingSet->get(id);
            bool hasRequestedData = true;

            if (NULL != objOut) {
                if (WorkingSetMember::RID_AND_IDX == member->getState()) {
                    if (1 != member->keyData.size()) {
                        _workingSet->free(id);
                        hasRequestedData = false;
                    } else {
                        // TODO: currently snapshot ids are only associated with documents, and
                        // not with index keys.
                        *objOut = Snapshotted<BSONObj>(SnapshotId(), member->keyData[0].keyData);
                    }
                } else if (member->hasObj()) {
                    *objOut = member->obj;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (NULL != dlOut) {
                if (member->hasRecordId()) {
                    *dlOut = member->recordId;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (hasRequestedData) {
                _workingSet->free(id);
                return PlanExecutor::ADVANCED;
            }
            // This result didn't have the data the caller wanted, try again.
        } else if (PlanStage::NEED_YIELD == code) {
            if (id == WorkingSet::INVALID_ID) {
                if (!_yieldPolicy->canAutoYield())
                    throw WriteConflictException();

                CurOp::get(_opCtx)->debug().writeConflicts++;
                writeConflictsInARow++;
                WriteConflictException::logAndBackoff(
                    writeConflictsInARow, "plan execution", _nss.ns());
            } else {
                WorkingSetMember* member = _workingSet->get(id);
                invariant(member->hasFetcher());
                // Transfer ownership of the fetcher. Next time around the loop a yield will
                // happen.
                fetcher.reset(member->releaseFetcher());
            }

            // If we're allowed to, we will yield next time through the loop.
            if (_yieldPolicy->canAutoYield())
                _yieldPolicy->forceYield();
        } else if (PlanStage::NEED_TIME == code) {
            // Fall through to yield check at end of large conditional.
        } else if (PlanStage::IS_EOF == code) {
            if (shouldWaitForInserts()) {
                const bool locksReacquiredAfterYield = waitForInserts();
                if (locksReacquiredAfterYield) {
                    // There may be more results, try to get more data.
                    continue;
                }
                invariant(isMarkedAsKilled());
                if (objOut) {
                    Status status(ErrorCodes::OperationFailed,
                                  str::stream() << "Operation aborted because: " << *_killReason);
                    *objOut = Snapshotted<BSONObj>(
                        SnapshotId(), WorkingSetCommon::buildMemberStatusObject(status));
                }
                return PlanExecutor::DEAD;
            } else {
                return PlanExecutor::IS_EOF;
            }
        } else {
            invariant(PlanStage::DEAD == code || PlanStage::FAILURE == code);

            if (NULL != objOut) {
                BSONObj statusObj;
                WorkingSetCommon::getStatusMemberObject(*_workingSet, id, &statusObj);
                *objOut = Snapshotted<BSONObj>(SnapshotId(), statusObj);
            }

            return (PlanStage::DEAD == code) ? PlanExecutor::DEAD : PlanExecutor::FAILURE;
        }
    }
}
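
// The NEED_YIELD branch with an INVALID_ID above is the query layer's half of
// the server's write-conflict retry convention. The same shape in isolation,
// as a minimal sketch (assumes the surrounding codebase for
// WriteConflictException; the function name and the "sketch"/"test.coll"
// strings are invented): retry the unit of work with logged backoff, where any
// success resets the consecutive-conflict count.
template <typename Work>
void retryOnWriteConflictSketch(Work work) {
    size_t attempts = 0;
    for (;;) {
        try {
            work();
            return;  // success ends the conflict streak
        } catch (const WriteConflictException&) {
            ++attempts;
            // Log and sleep with increasing backoff before retrying, just as
            // getNextImpl() does via logAndBackoff(writeConflictsInARow, ...).
            WriteConflictException::logAndBackoff(attempts, "sketch", "test.coll");
        }
    }
}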
PlanExecutor::ExecState PlanExecutor::getNext(BSONObj* objOut, DiskLoc* dlOut) {
    if (_killed) {
        return PlanExecutor::DEAD;
    }

    // When a stage requests a yield for document fetch, it gives us back a RecordFetcher*
    // to use to pull the record into memory. We take ownership of the RecordFetcher here,
    // deleting it after we've had a chance to do the fetch. For timing-based yields, we
    // just pass a NULL fetcher.
    boost::scoped_ptr<RecordFetcher> fetcher;

    for (;;) {
        // There are two conditions which cause us to yield if we have a YIELD_AUTO
        // policy:
        // 1) The yield policy's timer elapsed, or
        // 2) some stage requested a yield due to a document fetch (NEED_FETCH).
        // In both cases, the actual yielding happens here.
        if (NULL != _yieldPolicy.get() && (_yieldPolicy->shouldYield() || NULL != fetcher.get())) {
            // Here's where we yield.
            _yieldPolicy->yield(fetcher.get());

            if (_killed) {
                return PlanExecutor::DEAD;
            }
        }

        // We're done using the fetcher, so it should be freed. We don't want to
        // use the same RecordFetcher twice.
        fetcher.reset();

        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState code = _root->work(&id);

        if (PlanStage::ADVANCED == code) {
            // Fast count.
            if (WorkingSet::INVALID_ID == id) {
                invariant(NULL == objOut);
                invariant(NULL == dlOut);
                return PlanExecutor::ADVANCED;
            }

            WorkingSetMember* member = _workingSet->get(id);
            bool hasRequestedData = true;

            if (NULL != objOut) {
                if (WorkingSetMember::LOC_AND_IDX == member->state) {
                    if (1 != member->keyData.size()) {
                        _workingSet->free(id);
                        hasRequestedData = false;
                    } else {
                        *objOut = member->keyData[0].keyData;
                    }
                } else if (member->hasObj()) {
                    *objOut = member->obj;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (NULL != dlOut) {
                if (member->hasLoc()) {
                    *dlOut = member->loc;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (hasRequestedData) {
                _workingSet->free(id);
                return PlanExecutor::ADVANCED;
            }
            // This result didn't have the data the caller wanted, try again.
        } else if (PlanStage::NEED_FETCH == code) {
            // Yielding on a NEED_FETCH is handled above, so there's not much to do here.
            // Just verify that the NEED_FETCH gave us back a WSM that is actually fetchable.
            WorkingSetMember* member = _workingSet->get(id);
            invariant(member->hasFetcher());
            // Transfer ownership of the fetcher. Next time around the loop a yield will happen.
            fetcher.reset(member->releaseFetcher());
        } else if (PlanStage::NEED_TIME == code) {
            // Fall through to yield check at end of large conditional.
        } else if (PlanStage::IS_EOF == code) {
            return PlanExecutor::IS_EOF;
        } else if (PlanStage::DEAD == code) {
            return PlanExecutor::DEAD;
        } else {
            verify(PlanStage::FAILURE == code);

            if (NULL != objOut) {
                WorkingSetCommon::getStatusMemberObject(*_workingSet, id, objOut);
            }
            return PlanExecutor::EXEC_ERROR;
        }
    }
}
PlanExecutor::ExecState PlanExecutor::getNextSnapshotted(Snapshotted<BSONObj>* objOut,
                                                         RecordId* dlOut) {
    if (_killed) {
        return PlanExecutor::DEAD;
    }

    // When a stage requests a yield for document fetch, it gives us back a RecordFetcher*
    // to use to pull the record into memory. We take ownership of the RecordFetcher here,
    // deleting it after we've had a chance to do the fetch. For timing-based yields, we
    // just pass a NULL fetcher.
    boost::scoped_ptr<RecordFetcher> fetcher;

    // Incremented on every writeConflict, reset to 0 on any successful call to _root->work.
    size_t writeConflictsInARow = 0;

    for (;;) {
        // These are the conditions which can cause us to yield:
        // 1) The yield policy's timer elapsed, or
        // 2) some stage requested a yield due to a document fetch, or
        // 3) we need to yield and retry due to a WriteConflictException.
        // In all cases, the actual yielding happens here.
        if (_yieldPolicy->shouldYield()) {
            _yieldPolicy->yield(fetcher.get());

            if (_killed) {
                return PlanExecutor::DEAD;
            }
        }

        // We're done using the fetcher, so it should be freed. We don't want to
        // use the same RecordFetcher twice.
        fetcher.reset();

        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState code = _root->work(&id);

        if (code != PlanStage::NEED_YIELD)
            writeConflictsInARow = 0;

        if (PlanStage::ADVANCED == code) {
            // Fast count.
            if (WorkingSet::INVALID_ID == id) {
                invariant(NULL == objOut);
                invariant(NULL == dlOut);
                return PlanExecutor::ADVANCED;
            }

            WorkingSetMember* member = _workingSet->get(id);
            bool hasRequestedData = true;

            if (NULL != objOut) {
                if (WorkingSetMember::LOC_AND_IDX == member->state) {
                    if (1 != member->keyData.size()) {
                        _workingSet->free(id);
                        hasRequestedData = false;
                    } else {
                        // TODO: currently snapshot ids are only associated with documents, and
                        // not with index keys.
                        *objOut = Snapshotted<BSONObj>(SnapshotId(), member->keyData[0].keyData);
                    }
                } else if (member->hasObj()) {
                    *objOut = member->obj;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (NULL != dlOut) {
                if (member->hasLoc()) {
                    *dlOut = member->loc;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (hasRequestedData) {
                _workingSet->free(id);
                return PlanExecutor::ADVANCED;
            }
            // This result didn't have the data the caller wanted, try again.
        } else if (PlanStage::NEED_YIELD == code) {
            if (id == WorkingSet::INVALID_ID) {
                if (!_yieldPolicy->allowedToYield())
                    throw WriteConflictException();

                _opCtx->getCurOp()->debug().writeConflicts++;
                writeConflictsInARow++;
                WriteConflictException::logAndBackoff(
                    writeConflictsInARow, "plan execution", _collection->ns().ns());
            } else {
                WorkingSetMember* member = _workingSet->get(id);
                invariant(member->hasFetcher());
                // Transfer ownership of the fetcher. Next time around the loop a yield will
                // happen.
                fetcher.reset(member->releaseFetcher());
            }

            // If we're allowed to, we will yield next time through the loop.
            if (_yieldPolicy->allowedToYield())
                _yieldPolicy->forceYield();
        } else if (PlanStage::NEED_TIME == code) {
            // Fall through to yield check at end of large conditional.
        } else if (PlanStage::IS_EOF == code) {
            return PlanExecutor::IS_EOF;
        } else if (PlanStage::DEAD == code) {
            return PlanExecutor::DEAD;
        } else {
            verify(PlanStage::FAILURE == code);

            if (NULL != objOut) {
                BSONObj statusObj;
                WorkingSetCommon::getStatusMemberObject(*_workingSet, id, &statusObj);
                *objOut = Snapshotted<BSONObj>(SnapshotId(), statusObj);
            }

            return PlanExecutor::FAILURE;
        }
    }
}
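
// A hedged sketch of the consumer side of the getNext variants above, written
// against the getNext(BSONObj*, ...) signature ('drainExecutorSketch' and the
// handleDoc callback are invented; the second argument type differs across
// revisions, DiskLoc* vs RecordId*, but NULL is accepted by all of them).
// Callers loop while ADVANCED comes back and treat anything else as terminal:
// IS_EOF is the only clean exit, while DEAD and FAILURE leave an error document
// retrievable via WorkingSetCommon::getStatusMemberObject, as seen above.
template <typename Callback>
PlanExecutor::ExecState drainExecutorSketch(PlanExecutor* exec, Callback handleDoc) {
    BSONObj obj;
    PlanExecutor::ExecState state;
    while (PlanExecutor::ADVANCED == (state = exec->getNext(&obj, NULL))) {
        handleDoc(obj);  // caller-supplied per-document handler
    }
    return state;
}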