PlanStage::StageState AndHashStage::readFirstChild(WorkingSetID* out) { verify(_currentChild == 0); WorkingSetID id = WorkingSet::INVALID_ID; StageState childStatus = workChild(0, &id); if (PlanStage::ADVANCED == childStatus) { WorkingSetMember* member = _ws->get(id); // The child must give us a WorkingSetMember with a record id, since we intersect index keys // based on the record id. The planner ensures that the child stage can never produce an WSM // with no record id. invariant(member->hasRecordId()); if (!_dataMap.insert(std::make_pair(member->recordId, id)).second) { // Didn't insert because we already had this RecordId inside the map. This should only // happen if we're seeing a newer copy of the same doc in a more recent snapshot. // Throw out the newer copy of the doc. _ws->free(id); return PlanStage::NEED_TIME; } // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield. member->makeObjOwnedIfNeeded(); // Update memory stats. _memUsage += member->getMemUsage(); return PlanStage::NEED_TIME; } else if (PlanStage::IS_EOF == childStatus) { // Done reading child 0. _currentChild = 1; // If our first child was empty, don't scan any others, no possible results. if (_dataMap.empty()) { _hashingChildren = false; return PlanStage::IS_EOF; } _specificStats.mapAfterChild.push_back(_dataMap.size()); return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { // The stage which produces a failure is responsible for allocating a working set member // with error details. invariant(WorkingSet::INVALID_ID != id); *out = id; return childStatus; } else { if (PlanStage::NEED_YIELD == childStatus) { *out = id; } return childStatus; } }
BSONObj getObj() const { if (!WorkingSetCommon::fetchIfUnfetched(_opCtx, _ws, _id, _recordCursor)) throw DocumentDeletedException(); WorkingSetMember* member = _ws->get(_id); // Make it owned since we are buffering results. member->makeObjOwnedIfNeeded(); return member->obj.value(); }
Status CachedPlanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) { // Adds the amount of time taken by pickBestPlan() to executionTimeMillis. There's lots of // execution work that happens here, so this is needed for the time accounting to // make sense. ScopedTimer timer(&_commonStats.executionTimeMillis); // If we work this many times during the trial period, then we will replan the // query from scratch. size_t maxWorksBeforeReplan = static_cast<size_t>(internalQueryCacheEvictionRatio * _decisionWorks); // The trial period ends without replanning if the cached plan produces this many results. size_t numResults = MultiPlanStage::getTrialPeriodNumToReturn(*_canonicalQuery); for (size_t i = 0; i < maxWorksBeforeReplan; ++i) { // Might need to yield between calls to work due to the timer elapsing. Status yieldStatus = tryYield(yieldPolicy); if (!yieldStatus.isOK()) { return yieldStatus; } WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState state = child()->work(&id); if (PlanStage::ADVANCED == state) { // Save result for later. WorkingSetMember* member = _ws->get(id); // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield. member->makeObjOwnedIfNeeded(); _results.push_back(id); if (_results.size() >= numResults) { // Once a plan returns enough results, stop working. Update cache with stats // from this run and return. updatePlanCache(); return Status::OK(); } } else if (PlanStage::IS_EOF == state) { // Cached plan hit EOF quickly enough. No need to replan. Update cache with stats // from this run and return. updatePlanCache(); return Status::OK(); } else if (PlanStage::NEED_YIELD == state) { if (id == WorkingSet::INVALID_ID) { if (!yieldPolicy->allowedToYield()) { throw WriteConflictException(); } } else { WorkingSetMember* member = _ws->get(id); invariant(member->hasFetcher()); // Transfer ownership of the fetcher and yield. _fetcher.reset(member->releaseFetcher()); } if (yieldPolicy->allowedToYield()) { yieldPolicy->forceYield(); } Status yieldStatus = tryYield(yieldPolicy); if (!yieldStatus.isOK()) { return yieldStatus; } } else if (PlanStage::FAILURE == state) { // On failure, fall back to replanning the whole query. We neither evict the // existing cache entry nor cache the result of replanning. BSONObj statusObj; WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj); LOG(1) << "Execution of cached plan failed, falling back to replan." << " query: " << _canonicalQuery->toStringShort() << " planSummary: " << Explain::getPlanSummary(child().get()) << " status: " << statusObj; const bool shouldCache = false; return replan(yieldPolicy, shouldCache); } else if (PlanStage::DEAD == state) { BSONObj statusObj; WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj); LOG(1) << "Execution of cached plan failed: PlanStage died" << ", query: " << _canonicalQuery->toStringShort() << " planSummary: " << Explain::getPlanSummary(child().get()) << " status: " << statusObj; return WorkingSetCommon::getMemberObjectStatus(statusObj); } else { invariant(PlanStage::NEED_TIME == state); } } // If we're here, the trial period took more than 'maxWorksBeforeReplan' work cycles. This // plan is taking too long, so we replan from scratch. LOG(1) << "Execution of cached plan required " << maxWorksBeforeReplan << " works, but was originally cached with only " << _decisionWorks << " works. Evicting cache entry and replanning query: " << _canonicalQuery->toStringShort() << " plan summary before replan: " << Explain::getPlanSummary(child().get()); const bool shouldCache = true; return replan(yieldPolicy, shouldCache); }
/** * addToBuffer() and sortBuffer() work differently based on the * configured limit. addToBuffer() is also responsible for * performing some accounting on the overall memory usage to * make sure we're not using too much memory. * * limit == 0: * addToBuffer() - Adds item to vector. * sortBuffer() - Sorts vector. * limit == 1: * addToBuffer() - Replaces first item in vector with max of * current and new item. * Updates memory usage if item was replaced. * sortBuffer() - Does nothing. * limit > 1: * addToBuffer() - Does not update vector. Adds item to set. * If size of set exceeds limit, remove item from set * with lowest key. Updates memory usage accordingly. * sortBuffer() - Copies items from set to vectors. */ void SortStage::addToBuffer(const SortableDataItem& item) { // Holds ID of working set member to be freed at end of this function. WorkingSetID wsidToFree = WorkingSet::INVALID_ID; WorkingSetMember* member = _ws->get(item.wsid); if (_limit == 0) { // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield. member->makeObjOwnedIfNeeded(); _data.push_back(item); _memUsage += member->getMemUsage(); } else if (_limit == 1) { if (_data.empty()) { member->makeObjOwnedIfNeeded(); _data.push_back(item); _memUsage = member->getMemUsage(); return; } wsidToFree = item.wsid; const WorkingSetComparator& cmp = *_sortKeyComparator; // Compare new item with existing item in vector. if (cmp(item, _data[0])) { wsidToFree = _data[0].wsid; member->makeObjOwnedIfNeeded(); _data[0] = item; _memUsage = member->getMemUsage(); } } else { // Update data item set instead of vector // Limit not reached - insert and return vector<SortableDataItem>::size_type limit(_limit); if (_dataSet->size() < limit) { member->makeObjOwnedIfNeeded(); _dataSet->insert(item); _memUsage += member->getMemUsage(); return; } // Limit will be exceeded - compare with item with lowest key // If new item does not have a lower key value than last item, // do nothing. wsidToFree = item.wsid; SortableDataItemSet::const_iterator lastItemIt = --(_dataSet->end()); const SortableDataItem& lastItem = *lastItemIt; const WorkingSetComparator& cmp = *_sortKeyComparator; if (cmp(item, lastItem)) { _memUsage -= _ws->get(lastItem.wsid)->getMemUsage(); _memUsage += member->getMemUsage(); wsidToFree = lastItem.wsid; // According to std::set iterator validity rules, // it does not matter which of erase()/insert() happens first. // Here, we choose to erase first to release potential resources // used by the last item and to keep the scope of the iterator to a minimum. _dataSet->erase(lastItemIt); member->makeObjOwnedIfNeeded(); _dataSet->insert(item); } } // If the working set ID is valid, remove from // RecordId invalidation map and free from working set. if (wsidToFree != WorkingSet::INVALID_ID) { WorkingSetMember* member = _ws->get(wsidToFree); if (member->hasLoc()) { _wsidByDiskLoc.erase(member->loc); } _ws->free(wsidToFree); } }
bool MultiPlanStage::workAllPlans(size_t numResults, PlanYieldPolicy* yieldPolicy) { bool doneWorking = false; for (size_t ix = 0; ix < _candidates.size(); ++ix) { CandidatePlan& candidate = _candidates[ix]; if (candidate.failed) { continue; } // Might need to yield between calls to work due to the timer elapsing. if (!(tryYield(yieldPolicy)).isOK()) { return false; } WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState state = candidate.root->work(&id); if (PlanStage::ADVANCED == state) { // Save result for later. WorkingSetMember* member = candidate.ws->get(id); // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we choose to // return the results from the 'candidate' plan. member->makeObjOwnedIfNeeded(); candidate.results.push(id); // Once a plan returns enough results, stop working. if (candidate.results.size() >= numResults) { doneWorking = true; } } else if (PlanStage::IS_EOF == state) { // First plan to hit EOF wins automatically. Stop evaluating other plans. // Assumes that the ranking will pick this plan. doneWorking = true; } else if (PlanStage::NEED_YIELD == state) { invariant(id == WorkingSet::INVALID_ID); if (!yieldPolicy->canAutoYield()) { throw WriteConflictException(); } if (yieldPolicy->canAutoYield()) { yieldPolicy->forceYield(); } if (!(tryYield(yieldPolicy)).isOK()) { return false; } } else if (PlanStage::NEED_TIME != state) { // FAILURE or DEAD. Do we want to just tank that plan and try the rest? We // probably want to fail globally as this shouldn't happen anyway. candidate.failed = true; ++_failureCount; // Propagate most recent seen failure to parent. if (PlanStage::FAILURE == state) { _statusMemberId = id; } if (_failureCount == _candidates.size()) { _failure = true; return false; } } } return !doneWorking; }
// Set "toReturn" when NEED_YIELD. PlanStage::StageState NearStage::bufferNext(WorkingSetID* toReturn, Status* error) { // // Try to retrieve the next covered member // if (!_nextInterval) { StatusWith<CoveredInterval*> intervalStatus = nextInterval(getOpCtx(), _workingSet, _collection); if (!intervalStatus.isOK()) { _searchState = SearchState_Finished; *error = intervalStatus.getStatus(); return PlanStage::FAILURE; } if (NULL == intervalStatus.getValue()) { _searchState = SearchState_Finished; return PlanStage::IS_EOF; } // CoveredInterval and its child stage are owned by _childrenIntervals _childrenIntervals.push_back(intervalStatus.getValue()); _nextInterval = _childrenIntervals.back(); _specificStats.intervalStats.emplace_back(); _nextIntervalStats = &_specificStats.intervalStats.back(); _nextIntervalStats->minDistanceAllowed = _nextInterval->minDistance; _nextIntervalStats->maxDistanceAllowed = _nextInterval->maxDistance; _nextIntervalStats->inclusiveMaxDistanceAllowed = _nextInterval->inclusiveMax; } WorkingSetID nextMemberID; PlanStage::StageState intervalState = _nextInterval->covering->work(&nextMemberID); if (PlanStage::IS_EOF == intervalState) { _searchState = SearchState_Advancing; return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == intervalState) { *error = WorkingSetCommon::getMemberStatus(*_workingSet->get(nextMemberID)); return intervalState; } else if (PlanStage::NEED_YIELD == intervalState) { *toReturn = nextMemberID; return intervalState; } else if (PlanStage::ADVANCED != intervalState) { return intervalState; } // // Try to buffer the next covered member // WorkingSetMember* nextMember = _workingSet->get(nextMemberID); // The child stage may not dedup so we must dedup them ourselves. if (_nextInterval->dedupCovering && nextMember->hasLoc()) { if (_seenDocuments.end() != _seenDocuments.find(nextMember->loc)) { _workingSet->free(nextMemberID); return PlanStage::NEED_TIME; } } ++_nextIntervalStats->numResultsBuffered; StatusWith<double> distanceStatus = computeDistance(nextMember); if (!distanceStatus.isOK()) { _searchState = SearchState_Finished; *error = distanceStatus.getStatus(); return PlanStage::FAILURE; } // If the member's distance is in the current distance interval, add it to our buffered // results. double memberDistance = distanceStatus.getValue(); // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield. nextMember->makeObjOwnedIfNeeded(); _resultBuffer.push(SearchResult(nextMemberID, memberDistance)); // Store the member's RecordId, if available, for quick invalidation if (nextMember->hasLoc()) { _seenDocuments.insert(std::make_pair(nextMember->loc, nextMemberID)); } return PlanStage::NEED_TIME; }
PlanStage::StageState DeleteStage::work(WorkingSetID* out) { ++_commonStats.works; // Adds the amount of time taken by work() to executionTimeMillis. ScopedTimer timer(&_commonStats.executionTimeMillis); if (isEOF()) { return PlanStage::IS_EOF; } invariant(_collection); // If isEOF() returns false, we must have a collection. // It is possible that after a delete was executed, a WriteConflictException occurred // and prevented us from returning ADVANCED with the old version of the document. if (_idReturning != WorkingSet::INVALID_ID) { // We should only get here if we were trying to return something before. invariant(_params.returnDeleted); WorkingSetMember* member = _ws->get(_idReturning); invariant(member->getState() == WorkingSetMember::OWNED_OBJ); *out = _idReturning; _idReturning = WorkingSet::INVALID_ID; ++_commonStats.advanced; return PlanStage::ADVANCED; } // Either retry the last WSM we worked on or get a new one from our child. WorkingSetID id; StageState status; if (_idRetrying == WorkingSet::INVALID_ID) { status = child()->work(&id); } else { status = ADVANCED; id = _idRetrying; _idRetrying = WorkingSet::INVALID_ID; } if (PlanStage::ADVANCED == status) { WorkingSetMember* member = _ws->get(id); // We want to free this member when we return, unless we need to retry it. ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id); if (!member->hasLoc()) { // We expect to be here because of an invalidation causing a force-fetch, and // doc-locking storage engines do not issue invalidations. ++_specificStats.nInvalidateSkips; ++_commonStats.needTime; return PlanStage::NEED_TIME; } RecordId rloc = member->loc; // Deletes can't have projections. This means that covering analysis will always add // a fetch. We should always get fetched data, and never just key data. invariant(member->hasObj()); try { // If the snapshot changed, then we have to make sure we have the latest copy of the // doc and that it still matches. std::unique_ptr<RecordCursor> cursor; if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) { cursor = _collection->getCursor(getOpCtx()); if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) { // Doc is already deleted. Nothing more to do. ++_commonStats.needTime; return PlanStage::NEED_TIME; } // Make sure the re-fetched doc still matches the predicate. if (_params.canonicalQuery && !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) { // Doesn't match. ++_commonStats.needTime; return PlanStage::NEED_TIME; } } // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState() // is allowed to free the memory. if (_params.returnDeleted) { member->makeObjOwnedIfNeeded(); } // TODO: Do we want to buffer docs and delete them in a group rather than // saving/restoring state repeatedly? try { if (supportsDocLocking()) { // Doc-locking engines require this before saveState() since they don't use // invalidations. WorkingSetCommon::prepareForSnapshotChange(_ws); } child()->saveState(); } catch (const WriteConflictException& wce) { std::terminate(); } if (_params.returnDeleted) { // Save a copy of the document that is about to get deleted. BSONObj deletedDoc = member->obj.value(); member->obj.setValue(deletedDoc.getOwned()); member->loc = RecordId(); member->transitionToOwnedObj(); } // Do the write, unless this is an explain. if (!_params.isExplain) { WriteUnitOfWork wunit(getOpCtx()); _collection->deleteDocument(getOpCtx(), rloc); wunit.commit(); } ++_specificStats.docsDeleted; } catch (const WriteConflictException& wce) { // Ensure that the BSONObj underlying the WorkingSetMember is owned because it may be // freed when we yield. member->makeObjOwnedIfNeeded(); _idRetrying = id; memberFreer.Dismiss(); // Keep this member around so we can retry deleting it. *out = WorkingSet::INVALID_ID; _commonStats.needYield++; return NEED_YIELD; } // As restoreState may restore (recreate) cursors, cursors are tied to the // transaction in which they are created, and a WriteUnitOfWork is a // transaction, make sure to restore the state outside of the WritUnitOfWork. try { child()->restoreState(); } catch (const WriteConflictException& wce) { // Note we don't need to retry anything in this case since the delete already // was committed. However, we still need to return the deleted document // (if it was requested). if (_params.returnDeleted) { // member->obj should refer to the deleted document. invariant(member->getState() == WorkingSetMember::OWNED_OBJ); _idReturning = id; // Keep this member around so that we can return it on the next work() call. memberFreer.Dismiss(); } *out = WorkingSet::INVALID_ID; _commonStats.needYield++; return NEED_YIELD; } if (_params.returnDeleted) { // member->obj should refer to the deleted document. invariant(member->getState() == WorkingSetMember::OWNED_OBJ); memberFreer.Dismiss(); // Keep this member around so we can return it. *out = id; ++_commonStats.advanced; return PlanStage::ADVANCED; } ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { *out = id; // If a stage fails, it may create a status WSM to indicate why it failed, in which case // 'id' is valid. If ID is invalid, we create our own error message. if (WorkingSet::INVALID_ID == id) { const std::string errmsg = "delete stage failed to read in results from child"; *out = WorkingSetCommon::allocateStatusMember( _ws, Status(ErrorCodes::InternalError, errmsg)); } return status; } else if (PlanStage::NEED_TIME == status) { ++_commonStats.needTime; } else if (PlanStage::NEED_YIELD == status) { *out = id; ++_commonStats.needYield; } return status; }
PlanStage::StageState TextOrStage::addTerm(WorkingSetID wsid, WorkingSetID* out) { WorkingSetMember* wsm = _ws->get(wsid); invariant(wsm->getState() == WorkingSetMember::RID_AND_IDX); invariant(1 == wsm->keyData.size()); const IndexKeyDatum newKeyData = wsm->keyData.back(); // copy to keep it around. TextRecordData* textRecordData = &_scores[wsm->recordId]; if (textRecordData->score < 0) { // We have already rejected this document for not matching the filter. invariant(WorkingSet::INVALID_ID == textRecordData->wsid); _ws->free(wsid); return NEED_TIME; } if (WorkingSet::INVALID_ID == textRecordData->wsid) { // We haven't seen this RecordId before. invariant(textRecordData->score == 0); bool shouldKeep = true; if (_filter) { // We have not seen this document before and need to apply a filter. bool wasDeleted = false; try { TextMatchableDocument tdoc(getOpCtx(), newKeyData.indexKeyPattern, newKeyData.keyData, _ws, wsid, _recordCursor); shouldKeep = _filter->matches(&tdoc); } catch (const WriteConflictException& wce) { // Ensure that the BSONObj underlying the WorkingSetMember is owned because it may // be freed when we yield. wsm->makeObjOwnedIfNeeded(); _idRetrying = wsid; *out = WorkingSet::INVALID_ID; return NEED_YIELD; } catch (const TextMatchableDocument::DocumentDeletedException&) { // We attempted to fetch the document but decided it should be excluded from the // result set. shouldKeep = false; wasDeleted = true; } if (wasDeleted || wsm->hasObj()) { ++_specificStats.fetches; } } if (shouldKeep && !wsm->hasObj()) { // Our parent expects RID_AND_OBJ members, so we fetch the document here if we haven't // already. try { shouldKeep = WorkingSetCommon::fetch(getOpCtx(), _ws, wsid, _recordCursor); ++_specificStats.fetches; } catch (const WriteConflictException& wce) { wsm->makeObjOwnedIfNeeded(); _idRetrying = wsid; *out = WorkingSet::INVALID_ID; return NEED_YIELD; } } if (!shouldKeep) { _ws->free(wsid); textRecordData->score = -1; return NEED_TIME; } textRecordData->wsid = wsid; // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield. wsm->makeObjOwnedIfNeeded(); } else { // We already have a working set member for this RecordId. Free the new WSM and retrieve the // old one. Note that since we don't keep all index keys, we could get a score that doesn't // match the document, but this has always been a problem. // TODO something to improve the situation. invariant(wsid != textRecordData->wsid); _ws->free(wsid); wsm = _ws->get(textRecordData->wsid); } // Locate score within possibly compound key: {prefix,term,score,suffix}. BSONObjIterator keyIt(newKeyData.keyData); for (unsigned i = 0; i < _ftsSpec.numExtraBefore(); i++) { keyIt.next(); } keyIt.next(); // Skip past 'term'. BSONElement scoreElement = keyIt.next(); double documentTermScore = scoreElement.number(); // Aggregate relevance score, term keys. textRecordData->score += documentTermScore; return NEED_TIME; }
PlanStage::StageState UpdateStage::doWork(WorkingSetID* out) { if (isEOF()) { return PlanStage::IS_EOF; } if (doneUpdating()) { // Even if we're done updating, we may have some inserting left to do. if (needInsert()) { // TODO we may want to handle WriteConflictException here. Currently we bounce it // out to a higher level since if this WCEs it is likely that we raced with another // upsert that may have matched our query, and therefore this may need to perform an // update rather than an insert. Bouncing to the higher level allows restarting the // query in this case. doInsert(); invariant(isEOF()); if (_params.request->shouldReturnNewDocs()) { // Want to return the document we just inserted, create it as a WorkingSetMember // so that we can return it. BSONObj newObj = _specificStats.objInserted; *out = _ws->allocate(); WorkingSetMember* member = _ws->get(*out); member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(), newObj.getOwned()); member->transitionToOwnedObj(); return PlanStage::ADVANCED; } } // At this point either we're done updating and there was no insert to do, // or we're done updating and we're done inserting. Either way, we're EOF. invariant(isEOF()); return PlanStage::IS_EOF; } // If we're here, then we still have to ask for results from the child and apply // updates to them. We should only get here if the collection exists. invariant(_collection); // It is possible that after an update was applied, a WriteConflictException // occurred and prevented us from returning ADVANCED with the requested version // of the document. if (_idReturning != WorkingSet::INVALID_ID) { // We should only get here if we were trying to return something before. invariant(_params.request->shouldReturnAnyDocs()); WorkingSetMember* member = _ws->get(_idReturning); invariant(member->getState() == WorkingSetMember::OWNED_OBJ); *out = _idReturning; _idReturning = WorkingSet::INVALID_ID; return PlanStage::ADVANCED; } // Either retry the last WSM we worked on or get a new one from our child. WorkingSetID id; StageState status; if (_idRetrying == WorkingSet::INVALID_ID) { status = child()->work(&id); } else { status = ADVANCED; id = _idRetrying; _idRetrying = WorkingSet::INVALID_ID; } if (PlanStage::ADVANCED == status) { // Need to get these things from the result returned by the child. RecordId recordId; WorkingSetMember* member = _ws->get(id); // We want to free this member when we return, unless we need to retry updating or returning // it. ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id); if (!member->hasRecordId()) { // We expect to be here because of an invalidation causing a force-fetch. ++_specificStats.nInvalidateSkips; return PlanStage::NEED_TIME; } recordId = member->recordId; // Updates can't have projections. This means that covering analysis will always add // a fetch. We should always get fetched data, and never just key data. invariant(member->hasObj()); // We fill this with the new RecordIds of moved doc so we don't double-update. if (_updatedRecordIds && _updatedRecordIds->count(recordId) > 0) { // Found a RecordId that refers to a document we had already updated. Note that // we can never remove from _updatedRecordIds because updates by other clients // could cause us to encounter a document again later. return PlanStage::NEED_TIME; } bool docStillMatches; try { docStillMatches = write_stage_common::ensureStillMatches( _collection, getOpCtx(), _ws, id, _params.canonicalQuery); } catch (const WriteConflictException&) { // There was a problem trying to detect if the document still exists, so retry. memberFreer.Dismiss(); return prepareToRetryWSM(id, out); } if (!docStillMatches) { // Either the document has been deleted, or it has been updated such that it no longer // matches the predicate. if (shouldRestartUpdateIfNoLongerMatches(_params)) { throw WriteConflictException(); } return PlanStage::NEED_TIME; } // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState() // is allowed to free the memory. member->makeObjOwnedIfNeeded(); // Save state before making changes WorkingSetCommon::prepareForSnapshotChange(_ws); try { child()->saveState(); } catch (const WriteConflictException&) { std::terminate(); } // If we care about the pre-updated version of the doc, save it out here. BSONObj oldObj; if (_params.request->shouldReturnOldDocs()) { oldObj = member->obj.value().getOwned(); } BSONObj newObj; try { // Do the update, get us the new version of the doc. newObj = transformAndUpdate(member->obj, recordId); } catch (const WriteConflictException&) { memberFreer.Dismiss(); // Keep this member around so we can retry updating it. return prepareToRetryWSM(id, out); } // Set member's obj to be the doc we want to return. if (_params.request->shouldReturnAnyDocs()) { if (_params.request->shouldReturnNewDocs()) { member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(), newObj.getOwned()); } else { invariant(_params.request->shouldReturnOldDocs()); member->obj.setValue(oldObj); } member->recordId = RecordId(); member->transitionToOwnedObj(); } // This should be after transformAndUpdate to make sure we actually updated this doc. ++_specificStats.nMatched; // Restore state after modification // As restoreState may restore (recreate) cursors, make sure to restore the // state outside of the WritUnitOfWork. try { child()->restoreState(); } catch (const WriteConflictException&) { // Note we don't need to retry updating anything in this case since the update // already was committed. However, we still need to return the updated document // (if it was requested). if (_params.request->shouldReturnAnyDocs()) { // member->obj should refer to the document we want to return. invariant(member->getState() == WorkingSetMember::OWNED_OBJ); _idReturning = id; // Keep this member around so that we can return it on the next work() call. memberFreer.Dismiss(); } *out = WorkingSet::INVALID_ID; return NEED_YIELD; } if (_params.request->shouldReturnAnyDocs()) { // member->obj should refer to the document we want to return. invariant(member->getState() == WorkingSetMember::OWNED_OBJ); memberFreer.Dismiss(); // Keep this member around so we can return it. *out = id; return PlanStage::ADVANCED; } return PlanStage::NEED_TIME; } else if (PlanStage::IS_EOF == status) { // The child is out of results, but we might not be done yet because we still might // have to do an insert. return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == status) { *out = id; // If a stage fails, it may create a status WSM to indicate why it failed, in which case // 'id' is valid. If ID is invalid, we create our own error message. if (WorkingSet::INVALID_ID == id) { const std::string errmsg = "update stage failed to read in results from child"; *out = WorkingSetCommon::allocateStatusMember( _ws, Status(ErrorCodes::InternalError, errmsg)); return PlanStage::FAILURE; } return status; } else if (PlanStage::NEED_YIELD == status) { *out = id; } return status; }
PlanStage::StageState FetchStage::doWork(WorkingSetID* out) { if (isEOF()) { return PlanStage::IS_EOF; } // Either retry the last WSM we worked on or get a new one from our child. WorkingSetID id; StageState status; if (_idRetrying == WorkingSet::INVALID_ID) { status = child()->work(&id); } else { status = ADVANCED; id = _idRetrying; _idRetrying = WorkingSet::INVALID_ID; } if (PlanStage::ADVANCED == status) { WorkingSetMember* member = _ws->get(id); // If there's an obj there, there is no fetching to perform. if (member->hasObj()) { ++_specificStats.alreadyHasObj; } else { // We need a valid loc to fetch from and this is the only state that has one. verify(WorkingSetMember::LOC_AND_IDX == member->getState()); verify(member->hasLoc()); try { if (!_cursor) _cursor = _collection->getCursor(getOpCtx()); if (auto fetcher = _cursor->fetcherForId(member->loc)) { // There's something to fetch. Hand the fetcher off to the WSM, and pass up // a fetch request. _idRetrying = id; member->setFetcher(fetcher.release()); *out = id; return NEED_YIELD; } // The doc is already in memory, so go ahead and grab it. Now we have a RecordId // as well as an unowned object if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, _cursor)) { _ws->free(id); return NEED_TIME; } } catch (const WriteConflictException& wce) { // Ensure that the BSONObj underlying the WorkingSetMember is owned because it may // be freed when we yield. member->makeObjOwnedIfNeeded(); _idRetrying = id; *out = WorkingSet::INVALID_ID; return NEED_YIELD; } } return returnIfMatches(member, id, out); } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { *out = id; // If a stage fails, it may create a status WSM to indicate why it // failed, in which case 'id' is valid. If ID is invalid, we // create our own error message. if (WorkingSet::INVALID_ID == id) { mongoutils::str::stream ss; ss << "fetch stage failed to read in results from child"; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember(_ws, status); } return status; } else if (PlanStage::NEED_YIELD == status) { *out = id; } return status; }
PlanStage::StageState MergeSortStage::doWork(WorkingSetID* out) { if (isEOF()) { return PlanStage::IS_EOF; } if (!_noResultToMerge.empty()) { // We have some child that we don't have a result from. Each child must have a result // in order to pick the minimum result among all our children. Work a child. PlanStage* child = _noResultToMerge.front(); WorkingSetID id = WorkingSet::INVALID_ID; StageState code = child->work(&id); if (PlanStage::ADVANCED == code) { WorkingSetMember* member = _ws->get(id); // If we're deduping... if (_dedup) { if (!member->hasRecordId()) { // Can't dedup data unless there's a RecordId. We go ahead and use its // result. _noResultToMerge.pop(); } else { ++_specificStats.dupsTested; // ...and there's a RecordId and and we've seen the RecordId before if (_seen.end() != _seen.find(member->recordId)) { // ...drop it. _ws->free(id); ++_specificStats.dupsDropped; return PlanStage::NEED_TIME; } else { // Otherwise, note that we've seen it. _seen.insert(member->recordId); // We're going to use the result from the child, so we remove it from // the queue of children without a result. _noResultToMerge.pop(); } } } else { // Not deduping. We use any result we get from the child. Remove the child // from the queue of things without a result. _noResultToMerge.pop(); } // Store the result in our list. StageWithValue value; value.id = id; value.stage = child; // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield. member->makeObjOwnedIfNeeded(); _mergingData.push_front(value); // Insert the result (indirectly) into our priority queue. _merging.push(_mergingData.begin()); return PlanStage::NEED_TIME; } else if (PlanStage::IS_EOF == code) { // There are no more results possible from this child. Don't bother with it // anymore. _noResultToMerge.pop(); return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == code || PlanStage::DEAD == code) { *out = id; // If a stage fails, it may create a status WSM to indicate why it // failed, in which case 'id' is valid. If ID is invalid, we // create our own error message. if (WorkingSet::INVALID_ID == id) { mongoutils::str::stream ss; ss << "merge sort stage failed to read in results from child"; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember(_ws, status); } return code; } else { if (PlanStage::NEED_YIELD == code) { *out = id; } return code; } } // If we're here, for each non-EOF child, we have a valid WSID. verify(!_merging.empty()); // Get the 'min' WSID. _merging is a priority queue so its top is the smallest. MergingRef top = _merging.top(); _merging.pop(); // Since we're returning the WSID that came from top->stage, we need to work(...) it again // to get a new result. _noResultToMerge.push(top->stage); // Save the ID that we're returning and remove the returned result from our data. WorkingSetID idToTest = top->id; _mergingData.erase(top); // Return the min. *out = idToTest; return PlanStage::ADVANCED; }
Status CachedPlanStage::pickBestPlan(PlanYieldPolicy* yieldPolicy) { // Adds the amount of time taken by pickBestPlan() to executionTimeMillis. There's lots of // execution work that happens here, so this is needed for the time accounting to // make sense. ScopedTimer timer(getClock(), &_commonStats.executionTimeMillis); // During plan selection, the list of indices we are using to plan must remain stable, so the // query will die during yield recovery if any index has been dropped. However, once plan // selection completes successfully, we no longer need all indices to stick around. The selected // plan should safely die on yield recovery if it is using the dropped index. // // Dismiss the requirement that no indices can be dropped when this method returns. ON_BLOCK_EXIT([this] { releaseAllIndicesRequirement(); }); // If we work this many times during the trial period, then we will replan the // query from scratch. size_t maxWorksBeforeReplan = static_cast<size_t>(internalQueryCacheEvictionRatio * _decisionWorks); // The trial period ends without replanning if the cached plan produces this many results. size_t numResults = MultiPlanStage::getTrialPeriodNumToReturn(*_canonicalQuery); for (size_t i = 0; i < maxWorksBeforeReplan; ++i) { // Might need to yield between calls to work due to the timer elapsing. Status yieldStatus = tryYield(yieldPolicy); if (!yieldStatus.isOK()) { return yieldStatus; } WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState state = child()->work(&id); if (PlanStage::ADVANCED == state) { // Save result for later. WorkingSetMember* member = _ws->get(id); // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield. member->makeObjOwnedIfNeeded(); _results.push(id); if (_results.size() >= numResults) { // Once a plan returns enough results, stop working. Update cache with stats // from this run and return. updatePlanCache(); return Status::OK(); } } else if (PlanStage::IS_EOF == state) { // Cached plan hit EOF quickly enough. No need to replan. Update cache with stats // from this run and return. updatePlanCache(); return Status::OK(); } else if (PlanStage::NEED_YIELD == state) { invariant(id == WorkingSet::INVALID_ID); if (!yieldPolicy->canAutoYield()) { throw WriteConflictException(); } if (yieldPolicy->canAutoYield()) { yieldPolicy->forceYield(); } Status yieldStatus = tryYield(yieldPolicy); if (!yieldStatus.isOK()) { return yieldStatus; } } else if (PlanStage::FAILURE == state) { // On failure, fall back to replanning the whole query. We neither evict the // existing cache entry nor cache the result of replanning. BSONObj statusObj; WorkingSetCommon::getStatusMemberObject(*_ws, id, &statusObj); LOG(1) << "Execution of cached plan failed, falling back to replan." << " query: " << redact(_canonicalQuery->toStringShort()) << " planSummary: " << Explain::getPlanSummary(child().get()) << " status: " << redact(statusObj); const bool shouldCache = false; return replan(yieldPolicy, shouldCache); } else { invariant(PlanStage::NEED_TIME == state); } } // If we're here, the trial period took more than 'maxWorksBeforeReplan' work cycles. This // plan is taking too long, so we replan from scratch. LOG(1) << "Execution of cached plan required " << maxWorksBeforeReplan << " works, but was originally cached with only " << _decisionWorks << " works. Evicting cache entry and replanning query: " << redact(_canonicalQuery->toStringShort()) << " plan summary before replan: " << Explain::getPlanSummary(child().get()); const bool shouldCache = true; return replan(yieldPolicy, shouldCache); }
PlanStage::StageState FetchStage::doWork(WorkingSetID* out) { if (isEOF()) { return PlanStage::IS_EOF; } // Either retry the last WSM we worked on or get a new one from our child. WorkingSetID id; StageState status; if (_idRetrying == WorkingSet::INVALID_ID) { status = child()->work(&id); } else { status = ADVANCED; id = _idRetrying; _idRetrying = WorkingSet::INVALID_ID; } if (PlanStage::ADVANCED == status) { WorkingSetMember* member = _ws->get(id); // If there's an obj there, there is no fetching to perform. if (member->hasObj()) { ++_specificStats.alreadyHasObj; } else { // We need a valid RecordId to fetch from and this is the only state that has one. verify(WorkingSetMember::RID_AND_IDX == member->getState()); verify(member->hasRecordId()); try { if (!_cursor) _cursor = _collection->getCursor(getOpCtx()); if (auto fetcher = _cursor->fetcherForId(member->recordId)) { // There's something to fetch. Hand the fetcher off to the WSM, and pass up // a fetch request. _idRetrying = id; member->setFetcher(fetcher.release()); *out = id; return NEED_YIELD; } // The doc is already in memory, so go ahead and grab it. Now we have a RecordId // as well as an unowned object if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, _cursor)) { _ws->free(id); return NEED_TIME; } } catch (const WriteConflictException&) { // Ensure that the BSONObj underlying the WorkingSetMember is owned because it may // be freed when we yield. member->makeObjOwnedIfNeeded(); _idRetrying = id; *out = WorkingSet::INVALID_ID; return NEED_YIELD; } } return returnIfMatches(member, id, out); } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { // The stage which produces a failure is responsible for allocating a working set member // with error details. invariant(WorkingSet::INVALID_ID != id); *out = id; return status; } else if (PlanStage::NEED_YIELD == status) { *out = id; } return status; }