void run() {
    // Various variables we'll need.
    dbtests::WriteContextForTests ctx(&_opCtx, nss.ns());
    Collection* coll = ctx.getCollection();
    ASSERT(coll);
    const int targetDocIndex = 0;
    const BSONObj query = BSON("foo" << BSON("$gte" << targetDocIndex));
    const auto ws = make_unique<WorkingSet>();
    const unique_ptr<CanonicalQuery> cq(canonicalize(query));

    // Get the RecordIds that would be returned by an in-order scan.
    vector<RecordId> recordIds;
    getRecordIds(coll, CollectionScanParams::FORWARD, &recordIds);

    // Configure a QueuedDataStage to pass the first object in the collection back in a
    // RID_AND_OBJ state.
    auto qds = make_unique<QueuedDataStage>(&_opCtx, ws.get());
    WorkingSetID id = ws->allocate();
    WorkingSetMember* member = ws->get(id);
    member->recordId = recordIds[targetDocIndex];
    const BSONObj oldDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex);
    member->obj = Snapshotted<BSONObj>(SnapshotId(), oldDoc);
    ws->transitionToRecordIdAndObj(id);
    qds->pushBack(id);

    // Configure the delete.
    auto deleteParams = std::make_unique<DeleteStageParams>();
    deleteParams->returnDeleted = true;
    deleteParams->canonicalQuery = cq.get();

    const auto deleteStage = make_unique<DeleteStage>(
        &_opCtx, std::move(deleteParams), ws.get(), coll, qds.release());

    const DeleteStats* stats = static_cast<const DeleteStats*>(deleteStage->getSpecificStats());

    // Should return advanced.
    id = WorkingSet::INVALID_ID;
    PlanStage::StageState state = deleteStage->work(&id);
    ASSERT_EQUALS(PlanStage::ADVANCED, state);

    // Make sure the returned value is what we expect it to be.

    // Should give us back a valid id.
    ASSERT_TRUE(WorkingSet::INVALID_ID != id);
    WorkingSetMember* resultMember = ws->get(id);
    // With an owned copy of the object, with no RecordId.
    ASSERT_TRUE(resultMember->hasOwnedObj());
    ASSERT_FALSE(resultMember->hasRecordId());
    ASSERT_EQUALS(resultMember->getState(), WorkingSetMember::OWNED_OBJ);
    ASSERT_TRUE(resultMember->obj.value().isOwned());

    // Should be the old value.
    ASSERT_BSONOBJ_EQ(resultMember->obj.value(), oldDoc);

    // Should have done the delete.
    ASSERT_EQUALS(stats->docsDeleted, 1U);

    // That should be it.
    id = WorkingSet::INVALID_ID;
    ASSERT_EQUALS(PlanStage::IS_EOF, deleteStage->work(&id));
}
PlanStage::StageState OplogStart::workBackwardsScan(WorkingSetID* out) {
    PlanStage::StageState state = child()->work(out);

    // EOF. Just start from the beginning, which is where we've hit.
    if (PlanStage::IS_EOF == state) {
        _done = true;
        return state;
    }

    if (PlanStage::ADVANCED != state) {
        return state;
    }

    WorkingSetMember* member = _workingSet->get(*out);
    verify(member->hasObj());
    verify(member->hasRecordId());

    if (!_filter->matchesBSON(member->obj.value())) {
        _done = true;
        // RecordId is returned in *out.
        return PlanStage::ADVANCED;
    } else {
        _workingSet->free(*out);
        return PlanStage::NEED_TIME;
    }
}
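// Illustrative sketch (not from the MongoDB sources): every snippet in this section follows the
// same work() protocol, in which a caller repeatedly works a stage and dispatches on the returned
// state. The toy 'MiniStage' below is an assumption made for this example; only the state names
// mirror the real PlanStage API.
#include <cstddef>
#include <iostream>
#include <vector>

enum class StageState { ADVANCED, NEED_TIME, NEED_YIELD, IS_EOF, FAILURE };

struct MiniStage {
    std::vector<int> results{1, 2, 3};
    std::size_t pos = 0;

    // Produce at most one result per call, mirroring PlanStage::work().
    StageState work(int* out) {
        if (pos >= results.size())
            return StageState::IS_EOF;
        *out = results[pos++];
        return StageState::ADVANCED;
    }
};

int main() {
    MiniStage stage;
    int result;
    // Drive the stage to completion, as PlanExecutor::getNextImpl() does for the real stage tree.
    for (;;) {
        StageState state = stage.work(&result);
        if (state == StageState::ADVANCED)
            std::cout << result << '\n';
        else if (state == StageState::IS_EOF)
            break;
        // NEED_TIME / NEED_YIELD / FAILURE handling omitted for brevity.
    }
    return 0;
}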
PlanStage::StageState AndHashStage::readFirstChild(WorkingSetID* out) {
    verify(_currentChild == 0);

    WorkingSetID id = WorkingSet::INVALID_ID;
    StageState childStatus = workChild(0, &id);

    if (PlanStage::ADVANCED == childStatus) {
        WorkingSetMember* member = _ws->get(id);

        // The child must give us a WorkingSetMember with a record id, since we intersect index
        // keys based on the record id. The planner ensures that the child stage can never produce
        // a WSM with no record id.
        invariant(member->hasRecordId());

        if (!_dataMap.insert(std::make_pair(member->recordId, id)).second) {
            // Didn't insert because we already had this RecordId inside the map. This should only
            // happen if we're seeing a newer copy of the same doc in a more recent snapshot.
            // Throw out the newer copy of the doc.
            _ws->free(id);
            return PlanStage::NEED_TIME;
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield.
        member->makeObjOwnedIfNeeded();

        // Update memory stats.
        _memUsage += member->getMemUsage();

        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == childStatus) {
        // Done reading child 0.
        _currentChild = 1;

        // If our first child was empty, don't scan any others, no possible results.
        if (_dataMap.empty()) {
            _hashingChildren = false;
            return PlanStage::IS_EOF;
        }

        _specificStats.mapAfterChild.push_back(_dataMap.size());

        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) {
        // The stage which produces a failure is responsible for allocating a working set member
        // with error details.
        invariant(WorkingSet::INVALID_ID != id);
        *out = id;
        return childStatus;
    } else {
        if (PlanStage::NEED_YIELD == childStatus) {
            *out = id;
        }

        return childStatus;
    }
}
void CachedPlanStage::doInvalidate(OperationContext* txn,
                                   const RecordId& dl,
                                   InvalidationType type) {
    for (auto it = _results.begin(); it != _results.end(); ++it) {
        WorkingSetMember* member = _ws->get(*it);
        if (member->hasRecordId() && member->recordId == dl) {
            WorkingSetCommon::fetchAndInvalidateRecordId(txn, member, _collection);
        }
    }
}
void FetchStage::doInvalidate(OperationContext* opCtx, const RecordId& dl, InvalidationType type) {
    // It's possible that the recordId getting invalidated is the one we're about to
    // fetch. In this case we do a "forced fetch" and put the WSM in owned object state.
    if (WorkingSet::INVALID_ID != _idRetrying) {
        WorkingSetMember* member = _ws->get(_idRetrying);
        if (member->hasRecordId() && (member->recordId == dl)) {
            // Fetch it now and kill the recordId.
            WorkingSetCommon::fetchAndInvalidateRecordId(opCtx, member, _collection);
        }
    }
}
void IDHackStage::doInvalidate(OperationContext* txn, const RecordId& dl, InvalidationType type) {
    // Since updates can't mutate the '_id' field, we can ignore mutation invalidations.
    if (INVALIDATION_MUTATION == type) {
        return;
    }

    // It's possible that the RecordId getting invalidated is the one we're about to
    // fetch. In this case we do a "forced fetch" and put the WSM in owned object state.
    if (WorkingSet::INVALID_ID != _idBeingPagedIn) {
        WorkingSetMember* member = _workingSet->get(_idBeingPagedIn);
        if (member->hasRecordId() && (member->recordId == dl)) {
            // Fetch it now and kill the RecordId.
            WorkingSetCommon::fetchAndInvalidateRecordId(txn, member, _collection);
        }
    }
}
// static
bool WorkingSetCommon::fetch(OperationContext* txn,
                             WorkingSet* workingSet,
                             WorkingSetID id,
                             unowned_ptr<SeekableRecordCursor> cursor) {
    WorkingSetMember* member = workingSet->get(id);

    // The RecordFetcher should already have been transferred out of the WSM and used.
    invariant(!member->hasFetcher());

    // We should have a RecordId but need to retrieve the obj. Get the obj now and reset all WSM
    // state appropriately.
    invariant(member->hasRecordId());
    member->obj.reset();
    auto record = cursor->seekExact(member->recordId);
    if (!record) {
        return false;
    }

    member->obj = {txn->recoveryUnit()->getSnapshotId(), record->data.releaseToBson()};

    if (member->isSuspicious) {
        // Make sure that all of the keyData is still valid for this copy of the document.
        // This ensures both that index-provided filters and sort orders still hold.
        // TODO provide a way for the query planner to opt out of this checking if it is
        // unneeded due to the structure of the plan.
        invariant(!member->keyData.empty());
        for (size_t i = 0; i < member->keyData.size(); i++) {
            BSONObjSet keys;
            // There's no need to compute the prefixes of the indexed fields that cause the index
            // to be multikey when ensuring the keyData is still valid.
            MultikeyPaths* multikeyPaths = nullptr;
            member->keyData[i].index->getKeys(member->obj.value(), &keys, multikeyPaths);
            if (!keys.count(member->keyData[i].keyData)) {
                // The document would no longer be at this position in the index.
                return false;
            }
        }

        member->isSuspicious = false;
    }

    member->keyData.clear();
    workingSet->transitionToRecordIdAndObj(id);
    return true;
}
void getRecordIds(Collection* collection,
                  CollectionScanParams::Direction direction,
                  vector<RecordId>* out) {
    WorkingSet ws;

    CollectionScanParams params;
    params.direction = direction;
    params.tailable = false;

    unique_ptr<CollectionScan> scan(new CollectionScan(&_opCtx, collection, params, &ws, NULL));
    while (!scan->isEOF()) {
        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState state = scan->work(&id);
        if (PlanStage::ADVANCED == state) {
            WorkingSetMember* member = ws.get(id);
            verify(member->hasRecordId());
            out->push_back(member->recordId);
        }
    }
}
void MergeSortStage::doInvalidate(OperationContext* txn,
                                  const RecordId& dl,
                                  InvalidationType type) {
    // Go through our data and see if we're holding on to the invalidated RecordId.
    for (list<StageWithValue>::iterator valueIt = _mergingData.begin();
         valueIt != _mergingData.end();
         valueIt++) {
        WorkingSetMember* member = _ws->get(valueIt->id);
        if (member->hasRecordId() && (dl == member->recordId)) {
            // Fetch the about-to-be mutated result.
            WorkingSetCommon::fetchAndInvalidateRecordId(txn, member, _collection);
            ++_specificStats.forcedFetches;
        }
    }

    // If we see the deleted RecordId again it is not the same record as it once was so we still
    // want to return it.
    if (_dedup && INVALIDATION_DELETION == type) {
        _seen.erase(dl);
    }
}
PlanStage::StageState MergeSortStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    if (!_noResultToMerge.empty()) {
        // We have some child that we don't have a result from. Each child must have a result
        // in order to pick the minimum result among all our children. Work a child.
        PlanStage* child = _noResultToMerge.front();
        WorkingSetID id = WorkingSet::INVALID_ID;
        StageState code = child->work(&id);

        if (PlanStage::ADVANCED == code) {
            WorkingSetMember* member = _ws->get(id);

            // If we're deduping...
            if (_dedup) {
                if (!member->hasRecordId()) {
                    // Can't dedup data unless there's a RecordId. We go ahead and use its
                    // result.
                    _noResultToMerge.pop();
                } else {
                    ++_specificStats.dupsTested;
                    // ...and there's a RecordId and we've seen the RecordId before...
                    if (_seen.end() != _seen.find(member->recordId)) {
                        // ...drop it.
                        _ws->free(id);
                        ++_specificStats.dupsDropped;
                        return PlanStage::NEED_TIME;
                    } else {
                        // Otherwise, note that we've seen it.
                        _seen.insert(member->recordId);

                        // We're going to use the result from the child, so we remove it from
                        // the queue of children without a result.
                        _noResultToMerge.pop();
                    }
                }
            } else {
                // Not deduping. We use any result we get from the child. Remove the child
                // from the queue of things without a result.
                _noResultToMerge.pop();
            }

            // Store the result in our list.
            StageWithValue value;
            value.id = id;
            value.stage = child;

            // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield.
            member->makeObjOwnedIfNeeded();
            _mergingData.push_front(value);

            // Insert the result (indirectly) into our priority queue.
            _merging.push(_mergingData.begin());

            return PlanStage::NEED_TIME;
        } else if (PlanStage::IS_EOF == code) {
            // There are no more results possible from this child. Don't bother with it
            // anymore.
            _noResultToMerge.pop();
            return PlanStage::NEED_TIME;
        } else if (PlanStage::FAILURE == code || PlanStage::DEAD == code) {
            *out = id;
            // If a stage fails, it may create a status WSM to indicate why it
            // failed, in which case 'id' is valid. If ID is invalid, we
            // create our own error message.
            if (WorkingSet::INVALID_ID == id) {
                mongoutils::str::stream ss;
                ss << "merge sort stage failed to read in results from child";
                Status status(ErrorCodes::InternalError, ss);
                *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            }
            return code;
        } else {
            if (PlanStage::NEED_YIELD == code) {
                *out = id;
            }

            return code;
        }
    }

    // If we're here, for each non-EOF child, we have a valid WSID.
    verify(!_merging.empty());

    // Get the 'min' WSID. _merging is a priority queue so its top is the smallest.
    MergingRef top = _merging.top();
    _merging.pop();

    // Since we're returning the WSID that came from top->stage, we need to work(...) it again
    // to get a new result.
    _noResultToMerge.push(top->stage);

    // Save the ID that we're returning and remove the returned result from our data.
    WorkingSetID idToTest = top->id;
    _mergingData.erase(top);

    // Return the min.
    *out = idToTest;
    return PlanStage::ADVANCED;
}
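// Illustrative sketch (not from the MongoDB sources): MergeSortStage keeps one candidate result
// per child and repeatedly pops the minimum from a priority queue, re-working the child it just
// consumed. The standalone k-way merge below shows the same shape over plain sorted vectors;
// all names here are invented for the example.
#include <functional>
#include <iostream>
#include <queue>
#include <tuple>
#include <vector>

int main() {
    std::vector<std::vector<int>> children = {{1, 4, 7}, {2, 5, 8}, {3, 6, 9}};

    // (value, child index, offset within child); std::greater turns the default max-heap into a
    // min-heap so top() is the smallest candidate.
    using Entry = std::tuple<int, std::size_t, std::size_t>;
    std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> merging;

    // Seed the queue with one result per child, as MergeSortStage does via _noResultToMerge
    // before any result can be returned.
    for (std::size_t i = 0; i < children.size(); ++i)
        if (!children[i].empty())
            merging.emplace(children[i][0], i, 0);

    while (!merging.empty()) {
        auto [value, child, offset] = merging.top();
        merging.pop();
        std::cout << value << ' ';
        // Re-fill from the child we just consumed, mirroring how the stage pushes top->stage
        // back onto _noResultToMerge.
        if (offset + 1 < children[child].size())
            merging.emplace(children[child][offset + 1], child, offset + 1);
    }
    std::cout << '\n';  // Prints: 1 2 3 4 5 6 7 8 9
    return 0;
}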
PlanExecutor::ExecState PlanExecutor::getNextImpl(Snapshotted<BSONObj>* objOut, RecordId* dlOut) {
    if (MONGO_FAIL_POINT(planExecutorAlwaysFails)) {
        Status status(ErrorCodes::OperationFailed,
                      str::stream() << "PlanExecutor hit planExecutorAlwaysFails fail point");
        *objOut =
            Snapshotted<BSONObj>(SnapshotId(), WorkingSetCommon::buildMemberStatusObject(status));

        return PlanExecutor::FAILURE;
    }

    invariant(_currentState == kUsable);
    if (isMarkedAsKilled()) {
        if (NULL != objOut) {
            Status status(ErrorCodes::OperationFailed,
                          str::stream() << "Operation aborted because: " << *_killReason);
            *objOut = Snapshotted<BSONObj>(SnapshotId(),
                                           WorkingSetCommon::buildMemberStatusObject(status));
        }
        return PlanExecutor::DEAD;
    }

    if (!_stash.empty()) {
        invariant(objOut && !dlOut);
        *objOut = {SnapshotId(), _stash.front()};
        _stash.pop();
        return PlanExecutor::ADVANCED;
    }

    // When a stage requests a yield for document fetch, it gives us back a RecordFetcher*
    // to use to pull the record into memory. We take ownership of the RecordFetcher here,
    // deleting it after we've had a chance to do the fetch. For timing-based yields, we
    // just pass a NULL fetcher.
    unique_ptr<RecordFetcher> fetcher;

    // Incremented on every writeConflict, reset to 0 on any successful call to _root->work.
    size_t writeConflictsInARow = 0;

    for (;;) {
        // These are the conditions which can cause us to yield:
        //   1) The yield policy's timer elapsed, or
        //   2) some stage requested a yield due to a document fetch, or
        //   3) we need to yield and retry due to a WriteConflictException.
        // In all cases, the actual yielding happens here.
        if (_yieldPolicy->shouldYield()) {
            if (!_yieldPolicy->yield(fetcher.get())) {
                // A return of false from a yield should only happen if we've been killed during
                // the yield.
                invariant(isMarkedAsKilled());

                if (NULL != objOut) {
                    Status status(ErrorCodes::OperationFailed,
                                  str::stream() << "Operation aborted because: " << *_killReason);
                    *objOut = Snapshotted<BSONObj>(
                        SnapshotId(), WorkingSetCommon::buildMemberStatusObject(status));
                }
                return PlanExecutor::DEAD;
            }
        }

        // We're done using the fetcher, so it should be freed. We don't want to
        // use the same RecordFetcher twice.
        fetcher.reset();

        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState code = _root->work(&id);

        if (code != PlanStage::NEED_YIELD)
            writeConflictsInARow = 0;

        if (PlanStage::ADVANCED == code) {
            WorkingSetMember* member = _workingSet->get(id);
            bool hasRequestedData = true;

            if (NULL != objOut) {
                if (WorkingSetMember::RID_AND_IDX == member->getState()) {
                    if (1 != member->keyData.size()) {
                        _workingSet->free(id);
                        hasRequestedData = false;
                    } else {
                        // TODO: currently snapshot ids are only associated with documents, and
                        // not with index keys.
                        *objOut = Snapshotted<BSONObj>(SnapshotId(), member->keyData[0].keyData);
                    }
                } else if (member->hasObj()) {
                    *objOut = member->obj;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (NULL != dlOut) {
                if (member->hasRecordId()) {
                    *dlOut = member->recordId;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (hasRequestedData) {
                _workingSet->free(id);
                return PlanExecutor::ADVANCED;
            }
            // This result didn't have the data the caller wanted, try again.
        } else if (PlanStage::NEED_YIELD == code) {
            if (id == WorkingSet::INVALID_ID) {
                if (!_yieldPolicy->canAutoYield())
                    throw WriteConflictException();
                CurOp::get(_opCtx)->debug().writeConflicts++;
                writeConflictsInARow++;
                WriteConflictException::logAndBackoff(
                    writeConflictsInARow, "plan execution", _nss.ns());
            } else {
                WorkingSetMember* member = _workingSet->get(id);
                invariant(member->hasFetcher());
                // Transfer ownership of the fetcher. Next time around the loop a yield will
                // happen.
                fetcher.reset(member->releaseFetcher());
            }

            // If we're allowed to, we will yield next time through the loop.
            if (_yieldPolicy->canAutoYield())
                _yieldPolicy->forceYield();
        } else if (PlanStage::NEED_TIME == code) {
            // Fall through to yield check at end of large conditional.
        } else if (PlanStage::IS_EOF == code) {
            if (shouldWaitForInserts()) {
                const bool locksReacquiredAfterYield = waitForInserts();
                if (locksReacquiredAfterYield) {
                    // There may be more results, try to get more data.
                    continue;
                }
                invariant(isMarkedAsKilled());
                if (objOut) {
                    Status status(ErrorCodes::OperationFailed,
                                  str::stream() << "Operation aborted because: " << *_killReason);
                    *objOut = Snapshotted<BSONObj>(
                        SnapshotId(), WorkingSetCommon::buildMemberStatusObject(status));
                }
                return PlanExecutor::DEAD;
            } else {
                return PlanExecutor::IS_EOF;
            }
        } else {
            invariant(PlanStage::DEAD == code || PlanStage::FAILURE == code);

            if (NULL != objOut) {
                BSONObj statusObj;
                WorkingSetCommon::getStatusMemberObject(*_workingSet, id, &statusObj);
                *objOut = Snapshotted<BSONObj>(SnapshotId(), statusObj);
            }

            return (PlanStage::DEAD == code) ? PlanExecutor::DEAD : PlanExecutor::FAILURE;
        }
    }
}
PlanStage::StageState FetchStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    }

    if (PlanStage::ADVANCED == status) {
        WorkingSetMember* member = _ws->get(id);

        // If there's an obj there, there is no fetching to perform.
        if (member->hasObj()) {
            ++_specificStats.alreadyHasObj;
        } else {
            // We need a valid RecordId to fetch from and this is the only state that has one.
            verify(WorkingSetMember::RID_AND_IDX == member->getState());
            verify(member->hasRecordId());

            try {
                if (!_cursor)
                    _cursor = _collection->getCursor(getOpCtx());

                if (auto fetcher = _cursor->fetcherForId(member->recordId)) {
                    // There's something to fetch. Hand the fetcher off to the WSM, and pass up
                    // a fetch request.
                    _idRetrying = id;
                    member->setFetcher(fetcher.release());
                    *out = id;
                    return NEED_YIELD;
                }

                // The doc is already in memory, so go ahead and grab it. Now we have a RecordId
                // as well as an unowned object.
                if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, _cursor)) {
                    _ws->free(id);
                    return NEED_TIME;
                }
            } catch (const WriteConflictException&) {
                // Ensure that the BSONObj underlying the WorkingSetMember is owned because it
                // may be freed when we yield.
                member->makeObjOwnedIfNeeded();
                _idRetrying = id;
                *out = WorkingSet::INVALID_ID;
                return NEED_YIELD;
            }
        }

        return returnIfMatches(member, id, out);
    } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) {
        // The stage which produces a failure is responsible for allocating a working set member
        // with error details.
        invariant(WorkingSet::INVALID_ID != id);
        *out = id;
        return status;
    } else if (PlanStage::NEED_YIELD == status) {
        *out = id;
    }

    return status;
}
PlanStage::StageState AndHashStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Fast-path for one of our children being EOF immediately. We work each child a few times.
    // If it hits EOF, the AND cannot output anything. If it produces a result, we stash that
    // result in _lookAheadResults.
    if (_lookAheadResults.empty()) {
        // INVALID_ID means that the child didn't produce a valid result.

        // We specifically are not using .resize(size, value) here because C++11 builds don't
        // seem to resolve WorkingSet::INVALID_ID during linking.
        _lookAheadResults.resize(_children.size());
        for (size_t i = 0; i < _children.size(); ++i) {
            _lookAheadResults[i] = WorkingSet::INVALID_ID;
        }

        // Work each child some number of times until it's either EOF or produces
        // a result. If it's EOF this whole stage will be EOF. If it produces a
        // result we cache it for later.
        for (size_t i = 0; i < _children.size(); ++i) {
            auto& child = _children[i];
            for (size_t j = 0; j < kLookAheadWorks; ++j) {
                // Cache the result in _lookAheadResults[i].
                StageState childStatus = child->work(&_lookAheadResults[i]);

                if (PlanStage::IS_EOF == childStatus) {
                    // A child went right to EOF. Bail out.
                    _hashingChildren = false;
                    _dataMap.clear();
                    return PlanStage::IS_EOF;
                } else if (PlanStage::ADVANCED == childStatus) {
                    // Ensure that the BSONObj underlying the WorkingSetMember is owned in case
                    // we yield.
                    _ws->get(_lookAheadResults[i])->makeObjOwnedIfNeeded();
                    break;  // Stop looking at this child.
                } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) {
                    // The stage which produces a failure is responsible for allocating a working
                    // set member with error details.
                    invariant(WorkingSet::INVALID_ID != _lookAheadResults[i]);
                    *out = _lookAheadResults[i];
                    _hashingChildren = false;
                    _dataMap.clear();
                    return childStatus;
                }
                // We ignore NEED_TIME. TODO: what do we want to do if we get NEED_YIELD here?
            }
        }

        // We did a bunch of work above, return NEED_TIME to be fair.
        return PlanStage::NEED_TIME;
    }

    // An AND is either reading the first child into the hash table, probing against the hash
    // table with subsequent children, or checking the last child's results to see if they're
    // in the hash table.

    // We read the first child into our hash table.
    if (_hashingChildren) {
        // Check memory usage of previously hashed results.
        if (_memUsage > _maxMemUsage) {
            mongoutils::str::stream ss;
            ss << "hashed AND stage buffered data usage of " << _memUsage
               << " bytes exceeds internal limit of " << kDefaultMaxMemUsageBytes << " bytes";
            Status status(ErrorCodes::Overflow, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            return PlanStage::FAILURE;
        }

        if (0 == _currentChild) {
            return readFirstChild(out);
        } else if (_currentChild < _children.size() - 1) {
            return hashOtherChildren(out);
        } else {
            _hashingChildren = false;
            // We don't hash our last child. Instead, we probe the table created from the
            // previous children, returning results in the order of the last child.
            // Fall through to below.
        }
    }

    // Returning results. We read from the last child and return the results that are in our
    // hash map.

    // We should be EOF if we're not hashing results and the dataMap is empty.
    verify(!_dataMap.empty());

    // We probe _dataMap with the last child.
    verify(_currentChild == _children.size() - 1);

    // Get the next result for the (_children.size() - 1)-th child.
    StageState childStatus = workChild(_children.size() - 1, out);
    if (PlanStage::ADVANCED != childStatus) {
        return childStatus;
    }

    // We know that we've ADVANCED. See if the WSM is in our table.
    WorkingSetMember* member = _ws->get(*out);

    // The child must give us a WorkingSetMember with a record id, since we intersect index keys
    // based on the record id. The planner ensures that the child stage can never produce a WSM
    // with no record id.
    invariant(member->hasRecordId());

    DataMap::iterator it = _dataMap.find(member->recordId);
    if (_dataMap.end() == it) {
        // Child's output wasn't in every previous child. Throw it out.
        _ws->free(*out);
        return PlanStage::NEED_TIME;
    } else {
        // Child's output was in every previous child. Merge any key data in
        // the child's output and free the child's just-outputted WSM.
        WorkingSetID hashID = it->second;
        _dataMap.erase(it);

        AndCommon::mergeFrom(_ws, hashID, *member);
        _ws->free(*out);

        *out = hashID;
        return PlanStage::ADVANCED;
    }
}
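// Illustrative sketch (not from the MongoDB sources): AndHashStage builds a hash map from the
// first child's record ids, intersects it against the middle children, and finally probes it with
// the last child's output. The standalone version below intersects integer "record ids"; all
// names are invented for the example.
#include <iostream>
#include <unordered_set>
#include <vector>

int main() {
    std::vector<std::vector<int>> childOutputs = {
        {1, 2, 3, 4, 5},  // first child: seeds the map
        {2, 3, 5, 7},     // middle child: prunes the map
        {3, 5, 6},        // last child: probes the map
    };

    // readFirstChild(): hash everything the first child returns.
    std::unordered_set<int> dataMap(childOutputs[0].begin(), childOutputs[0].end());

    // hashOtherChildren(): keep only ids also produced by the middle child.
    std::unordered_set<int> seenMap;
    for (int rid : childOutputs[1])
        if (dataMap.count(rid))
            seenMap.insert(rid);
    dataMap = std::move(seenMap);

    // Probe with the last child and emit the intersection in its order, as the final
    // non-hashed phase of AndHashStage::doWork() does.
    for (int rid : childOutputs[2])
        if (dataMap.count(rid))
            std::cout << rid << ' ';  // Prints: 3 5
    std::cout << '\n';
    return 0;
}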
void run() {
    OldClientWriteContext ctx(&_txn, ns());
    Database* db = ctx.db();
    Collection* coll = db->getCollection(ns());
    if (!coll) {
        WriteUnitOfWork wuow(&_txn);
        coll = db->createCollection(&_txn, ns());
        wuow.commit();
    }

    WorkingSet ws;
    // Sort by foo:1.
    MergeSortStageParams msparams;
    msparams.pattern = BSON("foo" << 1);
    auto ms = make_unique<MergeSortStage>(&_txn, msparams, &ws, coll);

    IndexScanParams params;
    params.bounds.isSimpleRange = true;
    params.bounds.startKey = objWithMinKey(1);
    params.bounds.endKey = objWithMaxKey(1);
    params.bounds.endKeyInclusive = true;
    params.direction = 1;

    // Index 'a'+i has foo equal to 'i'.
    int numIndices = 20;
    for (int i = 0; i < numIndices; ++i) {
        // 'a', 'b', ...
        string index(1, 'a' + i);
        insert(BSON(index << 1 << "foo" << i));

        BSONObj indexSpec = BSON(index << 1 << "foo" << 1);
        addIndex(indexSpec);
        params.descriptor = getIndex(indexSpec, coll);
        ms->addChild(new IndexScan(&_txn, params, &ws, NULL));
    }

    set<RecordId> recordIds;
    getRecordIds(&recordIds, coll);

    set<RecordId>::iterator it = recordIds.begin();

    // Get 10 results. Should be getting results in order of 'recordIds'.
    int count = 0;
    while (!ms->isEOF() && count < 10) {
        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState status = ms->work(&id);
        if (PlanStage::ADVANCED != status) {
            continue;
        }

        WorkingSetMember* member = ws.get(id);
        ASSERT_EQUALS(member->recordId, *it);
        BSONElement elt;
        string index(1, 'a' + count);
        ASSERT(member->getFieldDotted(index, &elt));
        ASSERT_EQUALS(1, elt.numberInt());
        ASSERT(member->getFieldDotted("foo", &elt));
        ASSERT_EQUALS(count, elt.numberInt());
        ++count;
        ++it;
    }

    // Invalidate recordIds[11]. Should force a fetch and return the deleted document.
    ms->saveState();
    ms->invalidate(&_txn, *it, INVALIDATION_DELETION);
    ms->restoreState();

    // Make sure recordIds[11] was fetched for us.
    {
        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState status;
        do {
            status = ms->work(&id);
        } while (PlanStage::ADVANCED != status);

        WorkingSetMember* member = ws.get(id);
        ASSERT(!member->hasRecordId());
        ASSERT(member->hasObj());
        string index(1, 'a' + count);
        BSONElement elt;
        ASSERT_TRUE(member->getFieldDotted(index, &elt));
        ASSERT_EQUALS(1, elt.numberInt());
        ASSERT(member->getFieldDotted("foo", &elt));
        ASSERT_EQUALS(count, elt.numberInt());

        ++it;
        ++count;
    }

    // And get the rest.
    while (!ms->isEOF()) {
        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState status = ms->work(&id);
        if (PlanStage::ADVANCED != status) {
            continue;
        }

        WorkingSetMember* member = ws.get(id);
        ASSERT_EQUALS(member->recordId, *it);
        BSONElement elt;
        string index(1, 'a' + count);
        ASSERT_TRUE(member->getFieldDotted(index, &elt));
        ASSERT_EQUALS(1, elt.numberInt());
        ASSERT(member->getFieldDotted("foo", &elt));
        ASSERT_EQUALS(count, elt.numberInt());
        ++count;
        ++it;
    }
}
PlanStage::StageState SortStage::doWork(WorkingSetID* out) {
    const size_t maxBytes = static_cast<size_t>(internalQueryExecMaxBlockingSortBytes.load());
    if (_memUsage > maxBytes) {
        str::stream ss;
        ss << "Sort operation used more than the maximum " << maxBytes
           << " bytes of RAM. Add an index, or specify a smaller limit.";
        Status status(ErrorCodes::OperationFailed, ss);
        *out = WorkingSetCommon::allocateStatusMember(_ws, status);
        return PlanStage::FAILURE;
    }

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Still reading in results to sort.
    if (!_sorted) {
        WorkingSetID id = WorkingSet::INVALID_ID;
        StageState code = child()->work(&id);

        if (PlanStage::ADVANCED == code) {
            WorkingSetMember* member = _ws->get(id);

            SortableDataItem item;
            item.wsid = id;

            // We extract the sort key from the WSM's computed data. This must have been
            // generated by a SortKeyGeneratorStage descendent in the execution tree.
            auto sortKeyComputedData =
                static_cast<const SortKeyComputedData*>(member->getComputed(WSM_SORT_KEY));
            item.sortKey = sortKeyComputedData->getSortKey();

            if (member->hasRecordId()) {
                // The RecordId breaks ties when sorting two WSMs with the same sort key.
                item.recordId = member->recordId;
            }

            addToBuffer(item);

            return PlanStage::NEED_TIME;
        } else if (PlanStage::IS_EOF == code) {
            // TODO: We don't need the lock for this. We could ask for a yield and do this work
            // unlocked. Also, this is performing a lot of work for one call to work(...).
            sortBuffer();
            _resultIterator = _data.begin();
            _sorted = true;
            return PlanStage::NEED_TIME;
        } else if (PlanStage::FAILURE == code) {
            // The stage which produces a failure is responsible for allocating a working set
            // member with error details.
            invariant(WorkingSet::INVALID_ID != id);
            *out = id;
            return code;
        } else if (PlanStage::NEED_YIELD == code) {
            *out = id;
        }

        return code;
    }

    // Returning results.
    verify(_resultIterator != _data.end());
    verify(_sorted);
    *out = _resultIterator->wsid;
    _resultIterator++;

    return PlanStage::ADVANCED;
}
PlanStage::StageState AndHashStage::hashOtherChildren(WorkingSetID* out) {
    verify(_currentChild > 0);

    WorkingSetID id = WorkingSet::INVALID_ID;
    StageState childStatus = workChild(_currentChild, &id);

    if (PlanStage::ADVANCED == childStatus) {
        WorkingSetMember* member = _ws->get(id);

        // The child must give us a WorkingSetMember with a record id, since we intersect index
        // keys based on the record id. The planner ensures that the child stage can never
        // produce a WSM with no record id.
        invariant(member->hasRecordId());

        if (_dataMap.end() == _dataMap.find(member->recordId)) {
            // Ignore. It's not in any previous child.
        } else {
            // We have a hit. Copy data into the WSM we already have.
            _seenMap.insert(member->recordId);
            WorkingSetID olderMemberID = _dataMap[member->recordId];
            WorkingSetMember* olderMember = _ws->get(olderMemberID);
            size_t memUsageBefore = olderMember->getMemUsage();

            AndCommon::mergeFrom(_ws, olderMemberID, *member);

            // Update memory stats.
            _memUsage += olderMember->getMemUsage() - memUsageBefore;
        }
        _ws->free(id);
        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == childStatus) {
        // Finished with a child.
        ++_currentChild;

        // Keep elements of _dataMap that are in _seenMap.
        DataMap::iterator it = _dataMap.begin();
        while (it != _dataMap.end()) {
            if (_seenMap.end() == _seenMap.find(it->first)) {
                DataMap::iterator toErase = it;
                ++it;

                // Update memory stats.
                WorkingSetMember* member = _ws->get(toErase->second);
                _memUsage -= member->getMemUsage();

                _ws->free(toErase->second);
                _dataMap.erase(toErase);
            } else {
                ++it;
            }
        }

        _specificStats.mapAfterChild.push_back(_dataMap.size());

        _seenMap.clear();

        // _dataMap is now the intersection of the first _currentChild nodes.

        // If we have nothing to AND with after finishing any child, stop.
        if (_dataMap.empty()) {
            _hashingChildren = false;
            return PlanStage::IS_EOF;
        }

        // We've finished scanning all children. Return results with the next call to work().
        if (_currentChild == _children.size()) {
            _hashingChildren = false;
        }

        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) {
        // The stage which produces a failure is responsible for allocating a working set member
        // with error details.
        invariant(WorkingSet::INVALID_ID != id);
        *out = id;
        return childStatus;
    } else {
        if (PlanStage::NEED_YIELD == childStatus) {
            *out = id;
        }

        return childStatus;
    }
}
PlanStage::StageState UpdateStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    if (doneUpdating()) {
        // Even if we're done updating, we may have some inserting left to do.
        if (needInsert()) {
            // TODO we may want to handle WriteConflictException here. Currently we bounce it
            // out to a higher level since if this WCEs it is likely that we raced with another
            // upsert that may have matched our query, and therefore this may need to perform an
            // update rather than an insert. Bouncing to the higher level allows restarting the
            // query in this case.
            doInsert();

            invariant(isEOF());
            if (_params.request->shouldReturnNewDocs()) {
                // Want to return the document we just inserted, create it as a WorkingSetMember
                // so that we can return it.
                BSONObj newObj = _specificStats.objInserted;
                *out = _ws->allocate();
                WorkingSetMember* member = _ws->get(*out);
                member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(),
                                                   newObj.getOwned());
                member->transitionToOwnedObj();
                return PlanStage::ADVANCED;
            }
        }

        // At this point either we're done updating and there was no insert to do,
        // or we're done updating and we're done inserting. Either way, we're EOF.
        invariant(isEOF());
        return PlanStage::IS_EOF;
    }

    // If we're here, then we still have to ask for results from the child and apply
    // updates to them. We should only get here if the collection exists.
    invariant(_collection);

    // It is possible that after an update was applied, a WriteConflictException
    // occurred and prevented us from returning ADVANCED with the requested version
    // of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.request->shouldReturnAnyDocs());

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    }

    if (PlanStage::ADVANCED == status) {
        // Need to get these things from the result returned by the child.
        RecordId recordId;

        WorkingSetMember* member = _ws->get(id);

        // We want to free this member when we return, unless we need to retry updating or
        // returning it.
        ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

        if (!member->hasRecordId()) {
            // We expect to be here because of an invalidation causing a force-fetch.
            ++_specificStats.nInvalidateSkips;
            return PlanStage::NEED_TIME;
        }
        recordId = member->recordId;

        // Updates can't have projections. This means that covering analysis will always add
        // a fetch. We should always get fetched data, and never just key data.
        invariant(member->hasObj());

        // We fill this with the new RecordIds of moved docs so we don't double-update.
        if (_updatedRecordIds && _updatedRecordIds->count(recordId) > 0) {
            // Found a RecordId that refers to a document we had already updated. Note that
            // we can never remove from _updatedRecordIds because updates by other clients
            // could cause us to encounter a document again later.
            return PlanStage::NEED_TIME;
        }

        bool docStillMatches;
        try {
            docStillMatches = write_stage_common::ensureStillMatches(
                _collection, getOpCtx(), _ws, id, _params.canonicalQuery);
        } catch (const WriteConflictException&) {
            // There was a problem trying to detect if the document still exists, so retry.
            memberFreer.Dismiss();
            return prepareToRetryWSM(id, out);
        }

        if (!docStillMatches) {
            // Either the document has been deleted, or it has been updated such that it no
            // longer matches the predicate.
            if (shouldRestartUpdateIfNoLongerMatches(_params)) {
                throw WriteConflictException();
            }
            return PlanStage::NEED_TIME;
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.
        member->makeObjOwnedIfNeeded();

        // Save state before making changes.
        WorkingSetCommon::prepareForSnapshotChange(_ws);
        try {
            child()->saveState();
        } catch (const WriteConflictException&) {
            std::terminate();
        }

        // If we care about the pre-updated version of the doc, save it out here.
        BSONObj oldObj;
        if (_params.request->shouldReturnOldDocs()) {
            oldObj = member->obj.value().getOwned();
        }

        BSONObj newObj;
        try {
            // Do the update, get us the new version of the doc.
            newObj = transformAndUpdate(member->obj, recordId);
        } catch (const WriteConflictException&) {
            memberFreer.Dismiss();  // Keep this member around so we can retry updating it.
            return prepareToRetryWSM(id, out);
        }

        // Set member's obj to be the doc we want to return.
        if (_params.request->shouldReturnAnyDocs()) {
            if (_params.request->shouldReturnNewDocs()) {
                member->obj = Snapshotted<BSONObj>(getOpCtx()->recoveryUnit()->getSnapshotId(),
                                                   newObj.getOwned());
            } else {
                invariant(_params.request->shouldReturnOldDocs());
                member->obj.setValue(oldObj);
            }
            member->recordId = RecordId();
            member->transitionToOwnedObj();
        }

        // This should be after transformAndUpdate to make sure we actually updated this doc.
        ++_specificStats.nMatched;

        // Restore state after modification. As restoreState may restore (recreate) cursors,
        // make sure to restore the state outside of the WriteUnitOfWork.
        try {
            child()->restoreState();
        } catch (const WriteConflictException&) {
            // Note we don't need to retry updating anything in this case since the update
            // already was committed. However, we still need to return the updated document
            // (if it was requested).
            if (_params.request->shouldReturnAnyDocs()) {
                // member->obj should refer to the document we want to return.
                invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

                _idReturning = id;
                // Keep this member around so that we can return it on the next work() call.
                memberFreer.Dismiss();
            }
            *out = WorkingSet::INVALID_ID;
            return NEED_YIELD;
        }

        if (_params.request->shouldReturnAnyDocs()) {
            // member->obj should refer to the document we want to return.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            memberFreer.Dismiss();  // Keep this member around so we can return it.
            *out = id;
            return PlanStage::ADVANCED;
        }

        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == status) {
        // The child is out of results, but we might not be done yet because we still might
        // have to do an insert.
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which case
        // 'id' is valid. If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "update stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(
                _ws, Status(ErrorCodes::InternalError, errmsg));
            return PlanStage::FAILURE;
        }
        return status;
    } else if (PlanStage::NEED_YIELD == status) {
        *out = id;
    }

    return status;
}
PlanStage::StageState DeleteStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);
        switch (status) {
            case PlanStage::ADVANCED:
                break;

            case PlanStage::FAILURE:
            case PlanStage::DEAD:
                // The stage which produces a failure is responsible for allocating a working
                // set member with error details.
                invariant(WorkingSet::INVALID_ID != id);
                *out = id;
                return status;

            case PlanStage::NEED_TIME:
                return status;

            case PlanStage::NEED_YIELD:
                *out = id;
                return status;

            case PlanStage::IS_EOF:
                return status;

            default:
                MONGO_UNREACHABLE;
        }
    }

    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry deleting or returning
    // it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    invariant(member->hasRecordId());
    RecordId recordId = member->recordId;
    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.
    invariant(member->hasObj());

    // Ensure the document still exists and matches the predicate.
    bool docStillMatches;
    try {
        docStillMatches = write_stage_common::ensureStillMatches(
            _collection, getOpCtx(), _ws, id, _params.canonicalQuery);
    } catch (const WriteConflictException&) {
        // There was a problem trying to detect if the document still exists, so retry.
        memberFreer.Dismiss();
        return prepareToRetryWSM(id, out);
    }

    if (!docStillMatches) {
        // Either the document has already been deleted, or it has been updated such that it no
        // longer matches the predicate.
        if (shouldRestartDeleteIfNoLongerMatches(_params)) {
            throw WriteConflictException();
        }
        return PlanStage::NEED_TIME;
    }

    // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState() is
    // allowed to free the memory.
    if (_params.returnDeleted) {
        // Save a copy of the document that is about to get deleted, but keep it in the
        // RID_AND_OBJ state in case we need to retry deleting it.
        BSONObj deletedDoc = member->obj.value();
        member->obj.setValue(deletedDoc.getOwned());
    }

    // TODO: Do we want to buffer docs and delete them in a group rather than saving/restoring
    // state repeatedly?
    WorkingSetCommon::prepareForSnapshotChange(_ws);
    try {
        child()->saveState();
    } catch (const WriteConflictException&) {
        std::terminate();
    }

    // Do the write, unless this is an explain.
    if (!_params.isExplain) {
        try {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(),
                                        _params.stmtId,
                                        recordId,
                                        _params.opDebug,
                                        _params.fromMigrate,
                                        false,
                                        _params.returnDeleted ? Collection::StoreDeletedDoc::On
                                                              : Collection::StoreDeletedDoc::Off);
            wunit.commit();
        } catch (const WriteConflictException&) {
            memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
            return prepareToRetryWSM(id, out);
        }
    }
    ++_specificStats.docsDeleted;

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'recordId' from the WorkingSetMember before returning it.
        member->recordId = RecordId();
        member->transitionToOwnedObj();
    }

    // As restoreState may restore (recreate) cursors, and cursors are tied to the transaction in
    // which they are created (a WriteUnitOfWork is a transaction), make sure to restore the state
    // outside of the WriteUnitOfWork.
    try {
        child()->restoreState();
    } catch (const WriteConflictException&) {
        // Note we don't need to retry anything in this case since the delete already was
        // committed. However, we still need to return the deleted document (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
            memberFreer.Dismiss();
        }
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        return PlanStage::ADVANCED;
    }

    return PlanStage::NEED_TIME;
}
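// Illustrative sketch (not from the MongoDB sources): the delete and update stages above follow
// the same write-conflict protocol -- attempt the write, and on WriteConflictException keep the
// working set member alive and retry on a later work() call. The toy exception and retry loop
// below are assumptions made for this example; only the overall shape mirrors the real stages.
#include <iostream>
#include <stdexcept>

struct WriteConflictException : std::runtime_error {
    WriteConflictException() : std::runtime_error("write conflict") {}
};

// A write that fails once before succeeding, standing in for a deleteDocument() call racing
// with another writer.
void tryWrite(int attempt) {
    if (attempt == 0)
        throw WriteConflictException();
}

int main() {
    int attempt = 0;
    for (;;) {
        try {
            tryWrite(attempt);
            std::cout << "write committed on attempt " << attempt << '\n';
            break;
        } catch (const WriteConflictException&) {
            // Mirrors memberFreer.Dismiss() + prepareToRetryWSM(): keep the state needed for the
            // retry instead of freeing it, then yield and try again.
            ++attempt;
        }
    }
    return 0;
}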
PlanStage::StageState SortStage::doWork(WorkingSetID* out) {
    const size_t maxBytes = static_cast<size_t>(internalQueryExecMaxBlockingSortBytes);
    if (_memUsage > maxBytes) {
        mongoutils::str::stream ss;
        ss << "Sort operation used more than the maximum " << maxBytes
           << " bytes of RAM. Add an index, or specify a smaller limit.";
        Status status(ErrorCodes::OperationFailed, ss);
        *out = WorkingSetCommon::allocateStatusMember(_ws, status);
        return PlanStage::FAILURE;
    }

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Still reading in results to sort.
    if (!_sorted) {
        WorkingSetID id = WorkingSet::INVALID_ID;
        StageState code = child()->work(&id);

        if (PlanStage::ADVANCED == code) {
            // Add it into the map for quick invalidation if it has a valid RecordId.
            // A RecordId may be invalidated at any time (during a yield). We need to get into
            // the WorkingSet as quickly as possible to handle it.
            WorkingSetMember* member = _ws->get(id);

            // Planner must put a fetch before we get here.
            verify(member->hasObj());

            // We might be sorting something that was invalidated at some point.
            if (member->hasRecordId()) {
                _wsidByRecordId[member->recordId] = id;
            }

            SortableDataItem item;
            item.wsid = id;

            // We extract the sort key from the WSM's computed data. This must have been
            // generated by a SortKeyGeneratorStage descendent in the execution tree.
            auto sortKeyComputedData =
                static_cast<const SortKeyComputedData*>(member->getComputed(WSM_SORT_KEY));
            item.sortKey = sortKeyComputedData->getSortKey();

            if (member->hasRecordId()) {
                // The RecordId breaks ties when sorting two WSMs with the same sort key.
                item.recordId = member->recordId;
            }

            addToBuffer(item);

            return PlanStage::NEED_TIME;
        } else if (PlanStage::IS_EOF == code) {
            // TODO: We don't need the lock for this. We could ask for a yield and do this work
            // unlocked. Also, this is performing a lot of work for one call to work(...).
            sortBuffer();
            _resultIterator = _data.begin();
            _sorted = true;
            return PlanStage::NEED_TIME;
        } else if (PlanStage::FAILURE == code || PlanStage::DEAD == code) {
            *out = id;
            // If a stage fails, it may create a status WSM to indicate why it
            // failed, in which case 'id' is valid. If ID is invalid, we
            // create our own error message.
            if (WorkingSet::INVALID_ID == id) {
                mongoutils::str::stream ss;
                ss << "sort stage failed to read in results to sort from child";
                Status status(ErrorCodes::InternalError, ss);
                *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            }
            return code;
        } else if (PlanStage::NEED_YIELD == code) {
            *out = id;
        }

        return code;
    }

    // Returning results.
    verify(_resultIterator != _data.end());
    verify(_sorted);
    *out = _resultIterator->wsid;
    _resultIterator++;

    // If we're returning something, take it out of our DL -> WSID map so that future
    // calls to invalidate don't cause us to take action for a DL we're done with.
    WorkingSetMember* member = _ws->get(*out);
    if (member->hasRecordId()) {
        _wsidByRecordId.erase(member->recordId);
    }

    return PlanStage::ADVANCED;
}
void run() {
    // Populate the collection.
    for (int i = 0; i < 50; ++i) {
        insert(BSON("_id" << i << "foo" << i));
    }
    ASSERT_EQUALS(50U, count(BSONObj()));

    // Various variables we'll need.
    dbtests::WriteContextForTests ctx(&_opCtx, nss.ns());
    OpDebug* opDebug = &CurOp::get(_opCtx)->debug();
    Collection* coll = ctx.getCollection();
    ASSERT(coll);
    UpdateRequest request(nss);
    const CollatorInterface* collator = nullptr;
    UpdateDriver driver(new ExpressionContext(&_opCtx, collator));
    const int targetDocIndex = 10;
    const BSONObj query = BSON("foo" << BSON("$gte" << targetDocIndex));
    const auto ws = make_unique<WorkingSet>();
    const unique_ptr<CanonicalQuery> cq(canonicalize(query));

    // Get the RecordIds that would be returned by an in-order scan.
    vector<RecordId> recordIds;
    getRecordIds(coll, CollectionScanParams::FORWARD, &recordIds);

    // Populate the request.
    request.setQuery(query);
    request.setUpdates(fromjson("{$set: {x: 0}}"));
    request.setSort(BSONObj());
    request.setMulti(false);
    request.setReturnDocs(UpdateRequest::RETURN_NEW);

    const std::map<StringData, std::unique_ptr<ExpressionWithPlaceholder>> arrayFilters;

    ASSERT_DOES_NOT_THROW(driver.parse(request.getUpdates(), arrayFilters, request.isMulti()));

    // Configure a QueuedDataStage to pass the first object in the collection back in a
    // RID_AND_OBJ state.
    auto qds = make_unique<QueuedDataStage>(&_opCtx, ws.get());
    WorkingSetID id = ws->allocate();
    WorkingSetMember* member = ws->get(id);
    member->recordId = recordIds[targetDocIndex];
    const BSONObj oldDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex);
    member->obj = Snapshotted<BSONObj>(SnapshotId(), oldDoc);
    ws->transitionToRecordIdAndObj(id);
    qds->pushBack(id);

    // Configure the update.
    UpdateStageParams updateParams(&request, &driver, opDebug);
    updateParams.canonicalQuery = cq.get();

    auto updateStage =
        make_unique<UpdateStage>(&_opCtx, updateParams, ws.get(), coll, qds.release());

    // Should return advanced.
    id = WorkingSet::INVALID_ID;
    PlanStage::StageState state = updateStage->work(&id);
    ASSERT_EQUALS(PlanStage::ADVANCED, state);

    // Make sure the returned value is what we expect it to be.

    // Should give us back a valid id.
    ASSERT_TRUE(WorkingSet::INVALID_ID != id);
    WorkingSetMember* resultMember = ws->get(id);
    // With an owned copy of the object, with no RecordId.
    ASSERT_TRUE(resultMember->hasOwnedObj());
    ASSERT_FALSE(resultMember->hasRecordId());
    ASSERT_EQUALS(resultMember->getState(), WorkingSetMember::OWNED_OBJ);
    ASSERT_TRUE(resultMember->obj.value().isOwned());

    // Should be the new value.
    BSONObj newDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex << "x" << 0);
    ASSERT_BSONOBJ_EQ(resultMember->obj.value(), newDoc);

    // Should have done the update.
    vector<BSONObj> objs;
    getCollContents(coll, &objs);
    ASSERT_BSONOBJ_EQ(objs[targetDocIndex], newDoc);

    // That should be it.
    id = WorkingSet::INVALID_ID;
    ASSERT_EQUALS(PlanStage::IS_EOF, updateStage->work(&id));
}
/**
 * addToBuffer() and sortBuffer() work differently based on the
 * configured limit. addToBuffer() is also responsible for
 * performing some accounting on the overall memory usage to
 * make sure we're not using too much memory.
 *
 * limit == 0:
 *     addToBuffer() - Adds item to vector.
 *     sortBuffer() - Sorts vector.
 * limit == 1:
 *     addToBuffer() - Replaces first item in vector with max of
 *         current and new item.
 *         Updates memory usage if item was replaced.
 *     sortBuffer() - Does nothing.
 * limit > 1:
 *     addToBuffer() - Does not update vector. Adds item to set.
 *         If size of set exceeds limit, remove item from set
 *         with lowest key. Updates memory usage accordingly.
 *     sortBuffer() - Copies items from set to vectors.
 */
void SortStage::addToBuffer(const SortableDataItem& item) {
    // Holds ID of working set member to be freed at end of this function.
    WorkingSetID wsidToFree = WorkingSet::INVALID_ID;

    WorkingSetMember* member = _ws->get(item.wsid);
    if (_limit == 0) {
        // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield.
        member->makeObjOwnedIfNeeded();
        _data.push_back(item);
        _memUsage += member->getMemUsage();
    } else if (_limit == 1) {
        if (_data.empty()) {
            member->makeObjOwnedIfNeeded();
            _data.push_back(item);
            _memUsage = member->getMemUsage();
            return;
        }
        wsidToFree = item.wsid;
        const WorkingSetComparator& cmp = *_sortKeyComparator;
        // Compare new item with existing item in vector.
        if (cmp(item, _data[0])) {
            wsidToFree = _data[0].wsid;
            member->makeObjOwnedIfNeeded();
            _data[0] = item;
            _memUsage = member->getMemUsage();
        }
    } else {
        // Update data item set instead of vector.

        // Limit not reached - insert and return.
        vector<SortableDataItem>::size_type limit(_limit);
        if (_dataSet->size() < limit) {
            member->makeObjOwnedIfNeeded();
            _dataSet->insert(item);
            _memUsage += member->getMemUsage();
            return;
        }

        // Limit will be exceeded - compare with item with lowest key.
        // If new item does not have a lower key value than last item, do nothing.
        wsidToFree = item.wsid;
        SortableDataItemSet::const_iterator lastItemIt = --(_dataSet->end());
        const SortableDataItem& lastItem = *lastItemIt;
        const WorkingSetComparator& cmp = *_sortKeyComparator;
        if (cmp(item, lastItem)) {
            _memUsage -= _ws->get(lastItem.wsid)->getMemUsage();
            _memUsage += member->getMemUsage();
            wsidToFree = lastItem.wsid;
            // According to std::set iterator validity rules,
            // it does not matter which of erase()/insert() happens first.
            // Here, we choose to erase first to release potential resources
            // used by the last item and to keep the scope of the iterator to a minimum.
            _dataSet->erase(lastItemIt);
            member->makeObjOwnedIfNeeded();
            _dataSet->insert(item);
        }
    }

    // If the working set ID is valid, remove from
    // RecordId invalidation map and free from working set.
    if (wsidToFree != WorkingSet::INVALID_ID) {
        WorkingSetMember* member = _ws->get(wsidToFree);
        if (member->hasRecordId()) {
            _wsidByRecordId.erase(member->recordId);
        }
        _ws->free(wsidToFree);
    }
}
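// Illustrative sketch (not from the MongoDB sources): for limit > 1, addToBuffer() keeps at most
// 'limit' items in an ordered set and evicts the item with the highest key when a lower-keyed one
// arrives. The standalone version below keeps the k smallest integers; the container choice and
// erase-then-insert order mirror the _dataSet handling above, but everything else is invented.
#include <iostream>
#include <iterator>
#include <set>

int main() {
    const std::size_t limit = 3;
    std::multiset<int> dataSet;

    for (int item : {9, 2, 7, 4, 8, 1}) {
        // Limit not reached - insert and move on.
        if (dataSet.size() < limit) {
            dataSet.insert(item);
            continue;
        }
        // Limit would be exceeded - compare with the current largest element, as the stage
        // compares against *--_dataSet->end(); insert only if the new item sorts lower.
        auto lastItemIt = std::prev(dataSet.end());
        if (item < *lastItemIt) {
            dataSet.erase(lastItemIt);  // Erase first, then insert.
            dataSet.insert(item);
        }
    }

    for (int v : dataSet)
        std::cout << v << ' ';  // Prints: 1 2 4
    std::cout << '\n';
    return 0;
}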
PlanStage::StageState DeleteStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    if (_idRetrying != WorkingSet::INVALID_ID) {
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    } else {
        auto status = child()->work(&id);
        switch (status) {
            case PlanStage::ADVANCED:
                break;

            case PlanStage::FAILURE:
            case PlanStage::DEAD:
                *out = id;
                // If a stage fails, it may create a status WSM to indicate why it failed, in
                // which case 'id' is valid. If ID is invalid, we create our own error message.
                if (WorkingSet::INVALID_ID == id) {
                    const std::string errmsg = "delete stage failed to read in results from child";
                    *out = WorkingSetCommon::allocateStatusMember(
                        _ws, Status(ErrorCodes::InternalError, errmsg));
                }
                return status;

            case PlanStage::NEED_TIME:
                return status;

            case PlanStage::NEED_YIELD:
                *out = id;
                return status;

            case PlanStage::IS_EOF:
                return status;

            default:
                MONGO_UNREACHABLE;
        }
    }

    // We advanced, or are retrying, and id is set to the WSM to work on.
    WorkingSetMember* member = _ws->get(id);

    // We want to free this member when we return, unless we need to retry it.
    ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

    if (!member->hasRecordId()) {
        // We expect to be here because of an invalidation causing a force-fetch.
        ++_specificStats.nInvalidateSkips;
        return PlanStage::NEED_TIME;
    }
    RecordId recordId = member->recordId;
    // Deletes can't have projections. This means that covering analysis will always add
    // a fetch. We should always get fetched data, and never just key data.
    invariant(member->hasObj());

    try {
        // If the snapshot changed, then we have to make sure we have the latest copy of the
        // doc and that it still matches.
        std::unique_ptr<SeekableRecordCursor> cursor;
        if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
            cursor = _collection->getCursor(getOpCtx());
            if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                // Doc is already deleted. Nothing more to do.
                return PlanStage::NEED_TIME;
            }

            // Make sure the re-fetched doc still matches the predicate.
            if (_params.canonicalQuery &&
                !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                // Doesn't match.
                return PlanStage::NEED_TIME;
            }
        }

        // Ensure that the BSONObj underlying the WorkingSetMember is owned because saveState()
        // is allowed to free the memory.
        if (_params.returnDeleted) {
            // Save a copy of the document that is about to get deleted, but keep it in the
            // RID_AND_OBJ state in case we need to retry deleting it.
            BSONObj deletedDoc = member->obj.value();
            member->obj.setValue(deletedDoc.getOwned());
        }

        // TODO: Do we want to buffer docs and delete them in a group rather than
        // saving/restoring state repeatedly?
        try {
            WorkingSetCommon::prepareForSnapshotChange(_ws);
            child()->saveState();
        } catch (const WriteConflictException&) {
            std::terminate();
        }

        // Do the write, unless this is an explain.
        if (!_params.isExplain) {
            WriteUnitOfWork wunit(getOpCtx());
            _collection->deleteDocument(getOpCtx(), recordId, _params.fromMigrate);
            wunit.commit();
        }

        ++_specificStats.docsDeleted;
    } catch (const WriteConflictException&) {
        // When we're doing a findAndModify with a sort, the sort will have a limit of 1, so it
        // will not produce any more results even if there is another matching document. Re-throw
        // the WCE here so that these operations get another chance to find a matching document.
        // The findAndModify command should automatically retry if it gets a WCE.
        // TODO: this is not necessary if there was no sort specified.
        if (_params.returnDeleted) {
            throw;
        }
        _idRetrying = id;
        memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // After deleting the document, the RecordId associated with this member is invalid.
        // Remove the 'recordId' from the WorkingSetMember before returning it.
        member->recordId = RecordId();
        member->transitionToOwnedObj();
    }

    // As restoreState may restore (recreate) cursors, and cursors are tied to the transaction in
    // which they are created (a WriteUnitOfWork is a transaction), make sure to restore the state
    // outside of the WriteUnitOfWork.
    try {
        child()->restoreState();
    } catch (const WriteConflictException&) {
        // Note we don't need to retry anything in this case since the delete already was
        // committed. However, we still need to return the deleted document (if it was requested).
        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            _idReturning = id;
            // Keep this member around so that we can return it on the next work() call.
            memberFreer.Dismiss();
        }
        *out = WorkingSet::INVALID_ID;
        return NEED_YIELD;
    }

    if (_params.returnDeleted) {
        // member->obj should refer to the deleted document.
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        memberFreer.Dismiss();  // Keep this member around so we can return it.
        *out = id;
        return PlanStage::ADVANCED;
    }

    return PlanStage::NEED_TIME;
}
PlanStage::StageState FetchStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    }

    if (PlanStage::ADVANCED == status) {
        WorkingSetMember* member = _ws->get(id);

        // If there's an obj there, there is no fetching to perform.
        if (member->hasObj()) {
            ++_specificStats.alreadyHasObj;
        } else {
            // We need a valid RecordId to fetch from and this is the only state that has one.
            verify(WorkingSetMember::RID_AND_IDX == member->getState());
            verify(member->hasRecordId());

            try {
                if (!_cursor)
                    _cursor = _collection->getCursor(getOpCtx());

                if (auto fetcher = _cursor->fetcherForId(member->recordId)) {
                    // There's something to fetch. Hand the fetcher off to the WSM, and pass up
                    // a fetch request.
                    _idRetrying = id;
                    member->setFetcher(fetcher.release());
                    *out = id;
                    return NEED_YIELD;
                }

                // The doc is already in memory, so go ahead and grab it. Now we have a RecordId
                // as well as an unowned object.
                if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, _cursor)) {
                    _ws->free(id);
                    return NEED_TIME;
                }
            } catch (const WriteConflictException&) {
                // Ensure that the BSONObj underlying the WorkingSetMember is owned because it
                // may be freed when we yield.
                member->makeObjOwnedIfNeeded();
                _idRetrying = id;
                *out = WorkingSet::INVALID_ID;
                return NEED_YIELD;
            }
        }

        return returnIfMatches(member, id, out);
    } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it
        // failed, in which case 'id' is valid. If ID is invalid, we
        // create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            mongoutils::str::stream ss;
            ss << "fetch stage failed to read in results from child";
            Status status(ErrorCodes::InternalError, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
        }
        return status;
    } else if (PlanStage::NEED_YIELD == status) {
        *out = id;
    }

    return status;
}