void run() {
    Client::WriteContext ctx(ns());
    fillData();

    // The data we're going to later invalidate.
    set<DiskLoc> locs;
    getLocs(&locs);

    // Build the mock stage which feeds the data.
    WorkingSet ws;
    auto_ptr<MockStage> ms(new MockStage(&ws));
    insertVarietyOfObjects(ms.get());

    SortStageParams params;
    params.pattern = BSON("foo" << 1);
    auto_ptr<SortStage> ss(new SortStage(params, &ws, ms.get()));

    const int firstRead = 10;

    // Have sort read in data from the mock stage.
    for (int i = 0; i < firstRead; ++i) {
        WorkingSetID id;
        PlanStage::StageState status = ss->work(&id);
        ASSERT_NOT_EQUALS(PlanStage::ADVANCED, status);
    }

    // We should have read in the first 'firstRead' locs.  Invalidate the first.
    ss->prepareToYield();
    set<DiskLoc>::iterator it = locs.begin();
    ss->invalidate(*it++);
    ss->recoverFromYield();

    // Read the rest of the data from the mock stage.
    while (!ms->isEOF()) {
        WorkingSetID id;
        ss->work(&id);
    }

    // Release to prevent double-deletion.
    ms.release();

    // Let's just invalidate everything now.
    ss->prepareToYield();
    while (it != locs.end()) {
        ss->invalidate(*it++);
    }
    ss->recoverFromYield();

    // The sort should still work.
    int count = 0;
    while (!ss->isEOF()) {
        WorkingSetID id;
        PlanStage::StageState status = ss->work(&id);
        if (PlanStage::ADVANCED != status) {
            continue;
        }
        WorkingSetMember* member = ws.get(id);
        ASSERT(member->hasObj());
        ASSERT(!member->hasLoc());
        ++count;
    }

    // We've invalidated everything, but only 2/3 of our data had a DiskLoc to be
    // invalidated.  We get the rest as-is.
    ASSERT_EQUALS(count, numObj());
}
PlanStage::StageState S2NearStage::addResultToQueue(WorkingSetID* out) { PlanStage::StageState state = _child->work(out); // All done reading from _child. if (PlanStage::IS_EOF == state) { _child.reset(); // Adjust the annulus size depending on how many results we got. if (_results.empty()) { _radiusIncrement *= 2; } else if (_results.size() < 300) { _radiusIncrement *= 2; } else if (_results.size() > 600) { _radiusIncrement /= 2; } // Make a new ixscan next time. return PlanStage::NEED_TIME; } // Nothing to do unless we advance. if (PlanStage::ADVANCED != state) { return state; } // TODO Speed improvements: // // 0. Modify fetch to preserve key data and test for intersection w/annulus. // // 1. keep track of what we've seen in this scan and possibly ignore it. // // 2. keep track of results we've returned before and ignore them. WorkingSetMember* member = _ws->get(*out); // Must have an object in order to get geometry out of it. verify(member->hasObj()); // Get all the fields with that name from the document. BSONElementSet geom; member->obj.getFieldsDotted(_params.nearQuery.field, geom, false); if (geom.empty()) {return PlanStage::NEED_TIME; } // Some value that any distance we can calculate will be less than. double minDistance = numeric_limits<double>::max(); BSONObj minDistanceObj; for (BSONElementSet::iterator git = geom.begin(); git != geom.end(); ++git) { if (!git->isABSONObj()) { return PlanStage::FAILURE; } BSONObj obj = git->Obj(); double distToObj; if (S2SearchUtil::distanceBetween(_params.nearQuery.centroid.point, obj, &distToObj)) { if (distToObj < minDistance) { minDistance = distToObj; minDistanceObj = obj; } } else { warning() << "unknown geometry: " << obj.toString(); } } // If the distance to the doc satisfies our distance criteria, add it to our buffered // results. if (minDistance >= _innerRadius && (_outerRadiusInclusive ? minDistance <= _outerRadius : minDistance < _outerRadius)) { _results.push(Result(*out, minDistance)); if (_params.addDistMeta) { member->addComputed(new GeoDistanceComputedData(minDistance)); } if (_params.addPointMeta) { member->addComputed(new GeoNearPointComputedData(minDistanceObj)); } if (member->hasLoc()) { _invalidationMap[member->loc] = *out; } } return PlanStage::NEED_TIME; }
void run() { // Populate the collection. for (int i = 0; i < 50; ++i) { insert(BSON("_id" << i << "foo" << i)); } ASSERT_EQUALS(50U, count(BSONObj())); // Various variables we'll need. OldClientWriteContext ctx(&_txn, nss.ns()); OpDebug* opDebug = &CurOp::get(_txn)->debug(); Collection* coll = ctx.getCollection(); UpdateLifecycleImpl updateLifecycle(false, nss); UpdateRequest request(nss); UpdateDriver driver((UpdateDriver::Options())); const int targetDocIndex = 10; const BSONObj query = BSON("foo" << BSON("$gte" << targetDocIndex)); const unique_ptr<WorkingSet> ws(stdx::make_unique<WorkingSet>()); const unique_ptr<CanonicalQuery> cq(canonicalize(query)); // Get the RecordIds that would be returned by an in-order scan. vector<RecordId> locs; getLocs(coll, CollectionScanParams::FORWARD, &locs); // Populate the request. request.setQuery(query); request.setUpdates(fromjson("{$set: {x: 0}}")); request.setSort(BSONObj()); request.setMulti(false); request.setReturnDocs(UpdateRequest::RETURN_NEW); request.setLifecycle(&updateLifecycle); ASSERT_OK(driver.parse(request.getUpdates(), request.isMulti())); // Configure a QueuedDataStage to pass the first object in the collection back in a // LOC_AND_OBJ state. std::unique_ptr<QueuedDataStage> qds(stdx::make_unique<QueuedDataStage>(ws.get())); WorkingSetID id = ws->allocate(); WorkingSetMember* member = ws->get(id); member->loc = locs[targetDocIndex]; const BSONObj oldDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex); member->obj = Snapshotted<BSONObj>(SnapshotId(), oldDoc); ws->transitionToLocAndObj(id); qds->pushBack(id); // Configure the update. UpdateStageParams updateParams(&request, &driver, opDebug); updateParams.canonicalQuery = cq.get(); unique_ptr<UpdateStage> updateStage( stdx::make_unique<UpdateStage>(&_txn, updateParams, ws.get(), coll, qds.release())); // Should return advanced. id = WorkingSet::INVALID_ID; PlanStage::StageState state = updateStage->work(&id); ASSERT_EQUALS(PlanStage::ADVANCED, state); // Make sure the returned value is what we expect it to be. // Should give us back a valid id. ASSERT_TRUE(WorkingSet::INVALID_ID != id); WorkingSetMember* resultMember = ws->get(id); // With an owned copy of the object, with no RecordId. ASSERT_TRUE(resultMember->hasOwnedObj()); ASSERT_FALSE(resultMember->hasLoc()); ASSERT_EQUALS(resultMember->getState(), WorkingSetMember::OWNED_OBJ); ASSERT_TRUE(resultMember->obj.value().isOwned()); // Should be the new value. BSONObj newDoc = BSON("_id" << targetDocIndex << "foo" << targetDocIndex << "x" << 0); ASSERT_EQUALS(resultMember->obj.value(), newDoc); // Should have done the update. vector<BSONObj> objs; getCollContents(coll, &objs); ASSERT_EQUALS(objs[targetDocIndex], newDoc); // That should be it. id = WorkingSet::INVALID_ID; ASSERT_EQUALS(PlanStage::IS_EOF, updateStage->work(&id)); }
PlanStage::StageState S2NearStage::addResultToQueue(WorkingSetID* out) { PlanStage::StageState state = _child->work(out); // All done reading from _child. if (PlanStage::IS_EOF == state) { _child.reset(); _keyGeoFilter.reset(); // Adjust the annulus size depending on how many results we got. if (_results.empty()) { _radiusIncrement *= 2; } else if (_results.size() < 300) { _radiusIncrement *= 2; } else if (_results.size() > 600) { _radiusIncrement /= 2; } // Make a new ixscan next time. return PlanStage::NEED_TIME; } // Nothing to do unless we advance. if (PlanStage::ADVANCED != state) { return state; } WorkingSetMember* member = _ws->get(*out); // Must have an object in order to get geometry out of it. verify(member->hasObj()); // The scans we use don't dedup so we must dedup them ourselves. We only put locs into here // if we know for sure whether or not we'll return them in this annulus. if (member->hasLoc()) { if (_seenInScan.end() != _seenInScan.find(member->loc)) { return PlanStage::NEED_TIME; } } // Get all the fields with that name from the document. BSONElementSet geom; member->obj.getFieldsDotted(_params.nearQuery.field, geom, false); if (geom.empty()) { return PlanStage::NEED_TIME; } // Some value that any distance we can calculate will be less than. double minDistance = numeric_limits<double>::max(); BSONObj minDistanceObj; for (BSONElementSet::iterator git = geom.begin(); git != geom.end(); ++git) { if (!git->isABSONObj()) { mongoutils::str::stream ss; ss << "s2near stage read invalid geometry element " << *git << " from child"; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember( _ws, status); return PlanStage::FAILURE; } BSONObj obj = git->Obj(); double distToObj; if (S2SearchUtil::distanceBetween(_params.nearQuery.centroid.point, obj, &distToObj)) { if (distToObj < minDistance) { minDistance = distToObj; minDistanceObj = obj; } } else { warning() << "unknown geometry: " << obj.toString(); } } // If we're here we'll either include the doc in this annulus or reject it. It's safe to // ignore it if it pops up again in this annulus. if (member->hasLoc()) { _seenInScan.insert(member->loc); } // If the distance to the doc satisfies our distance criteria, add it to our buffered // results. if (minDistance >= _innerRadius && (_outerRadiusInclusive ? minDistance <= _outerRadius : minDistance < _outerRadius)) { _results.push(Result(*out, minDistance)); if (_params.addDistMeta) { member->addComputed(new GeoDistanceComputedData(minDistance)); } if (_params.addPointMeta) { member->addComputed(new GeoNearPointComputedData(minDistanceObj)); } if (member->hasLoc()) { _invalidationMap[member->loc] = *out; } } return PlanStage::NEED_TIME; }
PlanStage::StageState AndSortedStage::moveTowardTargetLoc(WorkingSetID* out) { verify(numeric_limits<size_t>::max() != _targetNode); verify(WorkingSet::INVALID_ID != _targetId); // We have nodes that haven't hit _targetLoc yet. size_t workingChildNumber = _workingTowardRep.front(); PlanStage* next = _children[workingChildNumber]; WorkingSetID id = WorkingSet::INVALID_ID; StageState state = next->work(&id); if (PlanStage::ADVANCED == state) { WorkingSetMember* member = _ws->get(id); // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything // with this WSM. if (!member->hasLoc()) { _ws->flagForReview(id); return PlanStage::NEED_TIME; } verify(member->hasLoc()); if (member->loc == _targetLoc) { // The front element has hit _targetLoc. Don't move it forward anymore/work on // another element. _workingTowardRep.pop(); AndCommon::mergeFrom(_ws, _targetId, *member); _ws->free(id); if (0 == _workingTowardRep.size()) { WorkingSetID toReturn = _targetId; _targetNode = numeric_limits<size_t>::max(); _targetId = WorkingSet::INVALID_ID; _targetLoc = RecordId(); *out = toReturn; ++_commonStats.advanced; return PlanStage::ADVANCED; } // More children need to be advanced to _targetLoc. ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (member->loc < _targetLoc) { // The front element of _workingTowardRep hasn't hit the thing we're AND-ing with // yet. Try again later. _ws->free(id); ++_commonStats.needTime; return PlanStage::NEED_TIME; } else { // member->loc > _targetLoc. // _targetLoc wasn't successfully AND-ed with the other sub-plans. We toss it and // try AND-ing with the next value. _specificStats.failedAnd[_targetNode]++; _ws->free(_targetId); _targetNode = workingChildNumber; _targetLoc = member->loc; _targetId = id; _workingTowardRep = std::queue<size_t>(); for (size_t i = 0; i < _children.size(); ++i) { if (workingChildNumber != i) { _workingTowardRep.push(i); } } // Need time to chase after the new _targetLoc. ++_commonStats.needTime; return PlanStage::NEED_TIME; } } else if (PlanStage::IS_EOF == state) { _isEOF = true; _ws->free(_targetId); return state; } else if (PlanStage::FAILURE == state || PlanStage::DEAD == state) { *out = id; // If a stage fails, it may create a status WSM to indicate why it // failed, in which case 'id' is valid. If ID is invalid, we // create our own error message. if (WorkingSet::INVALID_ID == id) { mongoutils::str::stream ss; ss << "sorted AND stage failed to read in results from child " << workingChildNumber; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember(_ws, status); } _isEOF = true; _ws->free(_targetId); return state; } else { if (PlanStage::NEED_TIME == state) { ++_commonStats.needTime; } else if (PlanStage::NEED_YIELD == state) { ++_commonStats.needYield; *out = id; } return state; } }
/**
 * addToBuffer() and sortBuffer() work differently based on the
 * configured limit. addToBuffer() is also responsible for
 * performing some accounting on the overall memory usage to
 * make sure we're not using too much memory.
 *
 * limit == 0:
 *     addToBuffer() - Adds item to vector.
 *     sortBuffer() - Sorts vector.
 * limit == 1:
 *     addToBuffer() - Replaces first item in vector with max of
 *         current and new item.
 *         Updates memory usage if item was replaced.
 *     sortBuffer() - Does nothing.
 * limit > 1:
 *     addToBuffer() - Does not update vector. Adds item to set.
 *         If size of set exceeds limit, remove item from set
 *         with lowest key. Updates memory usage accordingly.
 *     sortBuffer() - Copies items from set to vectors.
 */
void SortStage::addToBuffer(const SortableDataItem& item) {
    // Holds ID of working set member to be freed at end of this function.
    WorkingSetID wsidToFree = WorkingSet::INVALID_ID;

    if (_limit == 0) {
        _data.push_back(item);
        _memUsage += _ws->get(item.wsid)->getMemUsage();
    } else if (_limit == 1) {
        if (_data.empty()) {
            _data.push_back(item);
            _memUsage = _ws->get(item.wsid)->getMemUsage();
            return;
        }
        wsidToFree = item.wsid;
        const WorkingSetComparator& cmp = *_sortKeyComparator;
        // Compare new item with existing item in vector.
        if (cmp(item, _data[0])) {
            wsidToFree = _data[0].wsid;
            _data[0] = item;
            _memUsage = _ws->get(item.wsid)->getMemUsage();
        }
    } else {
        // Update data item set instead of vector
        // Limit not reached - insert and return
        vector<SortableDataItem>::size_type limit(_limit);
        if (_dataSet->size() < limit) {
            _dataSet->insert(item);
            _memUsage += _ws->get(item.wsid)->getMemUsage();
            return;
        }

        // Limit will be exceeded - compare with item with lowest key
        // If new item does not have a lower key value than last item,
        // do nothing.
        wsidToFree = item.wsid;
        SortableDataItemSet::const_iterator lastItemIt = --(_dataSet->end());
        const SortableDataItem& lastItem = *lastItemIt;
        const WorkingSetComparator& cmp = *_sortKeyComparator;
        if (cmp(item, lastItem)) {
            _memUsage -= _ws->get(lastItem.wsid)->getMemUsage();
            _memUsage += _ws->get(item.wsid)->getMemUsage();
            wsidToFree = lastItem.wsid;
            // According to std::set iterator validity rules,
            // it does not matter which of erase()/insert() happens first.
            // Here, we choose to erase first to release potential resources
            // used by the last item and to keep the scope of the iterator
            // to a minimum.
            _dataSet->erase(lastItemIt);
            _dataSet->insert(item);
        }
    }

    // If the working set ID is valid, remove from
    // DiskLoc invalidation map and free from working set.
    if (wsidToFree != WorkingSet::INVALID_ID) {
        WorkingSetMember* member = _ws->get(wsidToFree);
        if (member->hasLoc()) {
            _wsidByDiskLoc.erase(member->loc);
        }
        _ws->free(wsidToFree);
    }
}
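// Illustrative sketch (not part of the stage code above): the limit > 1 case described in
// the addToBuffer()/sortBuffer() comment is a bounded top-k buffer -- keep at most 'limit'
// items in an ordered set and evict the worst element when a better one arrives.  The names
// below (TopKBuffer) are made up for the example.
#include <cstddef>
#include <set>
#include <vector>

class TopKBuffer {
public:
    explicit TopKBuffer(std::size_t limit) : _limit(limit) {}

    void add(int key) {
        if (_set.size() < _limit) {
            _set.insert(key);  // limit not reached - insert and return
            return;
        }
        // Limit would be exceeded - compare against the current worst (largest) key.
        std::set<int>::iterator last = --_set.end();
        if (key < *last) {
            _set.erase(last);  // evict the worst element
            _set.insert(key);  // keep the better one
        }
    }

    // Equivalent of sortBuffer() for limit > 1: the set is already ordered.
    std::vector<int> sorted() const {
        return std::vector<int>(_set.begin(), _set.end());
    }

private:
    std::size_t _limit;
    std::set<int> _set;
};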
PlanStage::StageState OrStage::work(WorkingSetID* out) { ++_commonStats.works; // Adds the amount of time taken by work() to executionTimeMillis. ScopedTimer timer(&_commonStats.executionTimeMillis); if (isEOF()) { return PlanStage::IS_EOF; } if (0 == _specificStats.matchTested.size()) { _specificStats.matchTested = vector<size_t>(_children.size(), 0); } WorkingSetID id = WorkingSet::INVALID_ID; StageState childStatus = _children[_currentChild]->work(&id); if (PlanStage::ADVANCED == childStatus) { WorkingSetMember* member = _ws->get(id); // If we're deduping (and there's something to dedup by) if (_dedup && member->hasLoc()) { ++_specificStats.dupsTested; // ...and we've seen the DiskLoc before if (_seen.end() != _seen.find(member->loc)) { // ...drop it. ++_specificStats.dupsDropped; _ws->free(id); ++_commonStats.needTime; return PlanStage::NEED_TIME; } else { // Otherwise, note that we've seen it. _seen.insert(member->loc); } } if (Filter::passes(member, _filter)) { if (NULL != _filter) { ++_specificStats.matchTested[_currentChild]; } // Match! return it. *out = id; ++_commonStats.advanced; return PlanStage::ADVANCED; } else { // Does not match, try again. _ws->free(id); ++_commonStats.needTime; return PlanStage::NEED_TIME; } } else if (PlanStage::IS_EOF == childStatus) { // Done with _currentChild, move to the next one. ++_currentChild; // Maybe we're out of children. if (isEOF()) { return PlanStage::IS_EOF; } else { ++_commonStats.needTime; return PlanStage::NEED_TIME; } } else if (PlanStage::FAILURE == childStatus) { *out = id; // If a stage fails, it may create a status WSM to indicate why it // failed, in which case 'id' is valid. If ID is invalid, we // create our own error message. if (WorkingSet::INVALID_ID == id) { mongoutils::str::stream ss; ss << "OR stage failed to read in results from child " << _currentChild; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember( _ws, status); } return childStatus; } else if (PlanStage::NEED_TIME == childStatus) { ++_commonStats.needTime; } else if (PlanStage::NEED_FETCH == childStatus) { ++_commonStats.needFetch; *out = id; } // NEED_TIME, ERROR, NEED_FETCH, pass them up. return childStatus; }
PlanStage::StageState AndHashStage::work(WorkingSetID* out) { ++_commonStats.works; if (isEOF()) { return PlanStage::IS_EOF; } // An AND is either reading the first child into the hash table, probing against the hash // table with subsequent children, or checking the last child's results to see if they're // in the hash table. // We read the first child into our hash table. if (_hashingChildren) { if (0 == _currentChild) { return readFirstChild(out); } else if (_currentChild < _children.size() - 1) { return hashOtherChildren(out); } else { _hashingChildren = false; // We don't hash our last child. Instead, we probe the table created from the // previous children, returning results in the order of the last child. // Fall through to below. } } // Returning results. We read from the last child and return the results that are in our // hash map. // We should be EOF if we're not hashing results and the dataMap is empty. verify(!_dataMap.empty()); // We probe _dataMap with the last child. verify(_currentChild == _children.size() - 1); // Work the last child. StageState childStatus = _children[_children.size() - 1]->work(out); if (PlanStage::ADVANCED != childStatus) { return childStatus; } // We know that we've ADVANCED. See if the WSM is in our table. WorkingSetMember* member = _ws->get(*out); // Maybe the child had an invalidation. We intersect DiskLoc(s) so we can't do anything // with this WSM. if (!member->hasLoc()) { _ws->flagForReview(*out); return PlanStage::NEED_TIME; } DataMap::iterator it = _dataMap.find(member->loc); if (_dataMap.end() == it) { // Child's output wasn't in every previous child. Throw it out. _ws->free(*out); ++_commonStats.needTime; return PlanStage::NEED_TIME; } else { // Child's output was in every previous child. Merge any key data in // the child's output and free the child's just-outputted WSM. WorkingSetID hashID = it->second; _dataMap.erase(it); WorkingSetMember* olderMember = _ws->get(hashID); AndCommon::mergeFrom(olderMember, *member); _ws->free(*out); // We should check for matching at the end so the matcher can use information in the // indices of all our children. if (Filter::passes(olderMember, _filter)) { *out = hashID; ++_commonStats.advanced; return PlanStage::ADVANCED; } else { _ws->free(hashID); ++_commonStats.needTime; return PlanStage::NEED_TIME; } } }
PlanStage::StageState AndHashStage::readFirstChild(WorkingSetID* out) {
    verify(_currentChild == 0);

    WorkingSetID id = WorkingSet::INVALID_ID;
    StageState childStatus = workChild(0, &id);

    if (PlanStage::ADVANCED == childStatus) {
        WorkingSetMember* member = _ws->get(id);

        // Maybe the child had an invalidation.  We intersect RecordId(s) so we can't do
        // anything with this WSM.
        if (!member->hasLoc()) {
            _ws->flagForReview(id);
            return PlanStage::NEED_TIME;
        }

        verify(member->hasLoc());
        verify(_dataMap.end() == _dataMap.find(member->loc));

        _dataMap[member->loc] = id;

        // Update memory stats.
        _memUsage += member->getMemUsage();

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == childStatus) {
        // Done reading child 0.
        _currentChild = 1;

        // If our first child was empty, don't scan any others, no possible results.
        if (_dataMap.empty()) {
            _hashingChildren = false;
            return PlanStage::IS_EOF;
        }

        ++_commonStats.needTime;
        _specificStats.mapAfterChild.push_back(_dataMap.size());
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == childStatus) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which
        // case 'id' is valid.  If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            mongoutils::str::stream ss;
            ss << "hashed AND stage failed to read in results from first child";
            Status status(ErrorCodes::InternalError, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
        }
        return childStatus;
    } else {
        if (PlanStage::NEED_TIME == childStatus) {
            ++_commonStats.needTime;
        } else if (PlanStage::NEED_FETCH == childStatus) {
            ++_commonStats.needFetch;
            *out = id;
        }
        return childStatus;
    }
}
PlanStage::StageState MergeSortStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    if (!_noResultToMerge.empty()) {
        // We have some child that we don't have a result from.  Each child must have a result
        // in order to pick the minimum result among all our children.  Work a child.
        PlanStage* child = _noResultToMerge.front();
        WorkingSetID id = WorkingSet::INVALID_ID;
        StageState code = child->work(&id);

        if (PlanStage::ADVANCED == code) {
            // If we're deduping...
            if (_dedup) {
                WorkingSetMember* member = _ws->get(id);

                if (!member->hasLoc()) {
                    // Can't dedup data unless there's a DiskLoc.  We go ahead and use its
                    // result.
                    _noResultToMerge.pop();
                } else {
                    ++_specificStats.dupsTested;

                    // ...and there's a DiskLoc and we've seen the DiskLoc before
                    if (_seen.end() != _seen.find(member->loc)) {
                        // ...drop it.
                        _ws->free(id);
                        ++_commonStats.needTime;
                        ++_specificStats.dupsDropped;
                        return PlanStage::NEED_TIME;
                    } else {
                        // Otherwise, note that we've seen it.
                        _seen.insert(member->loc);

                        // We're going to use the result from the child, so we remove it from
                        // the queue of children without a result.
                        _noResultToMerge.pop();
                    }
                }
            } else {
                // Not deduping.  We use any result we get from the child.  Remove the child
                // from the queue of things without a result.
                _noResultToMerge.pop();
            }

            // Store the result in our list.
            StageWithValue value;
            value.id = id;
            value.stage = child;
            _mergingData.push_front(value);

            // Insert the result (indirectly) into our priority queue.
            _merging.push(_mergingData.begin());

            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        } else if (PlanStage::IS_EOF == code) {
            // There are no more results possible from this child.  Don't bother with it
            // anymore.
            _noResultToMerge.pop();
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        } else if (PlanStage::FAILURE == code) {
            *out = id;
            // If a stage fails, it may create a status WSM to indicate why it failed, in
            // which case 'id' is valid.  If ID is invalid, we create our own error message.
            if (WorkingSet::INVALID_ID == id) {
                mongoutils::str::stream ss;
                ss << "merge sort stage failed to read in results from child";
                Status status(ErrorCodes::InternalError, ss);
                *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            }
            return code;
        } else {
            if (PlanStage::NEED_TIME == code) {
                ++_commonStats.needTime;
            }
            return code;
        }
    }

    // If we're here, for each non-EOF child, we have a valid WSID.
    verify(!_merging.empty());

    // Get the 'min' WSID.  _merging is a priority queue so its top is the smallest.
    MergingRef top = _merging.top();
    _merging.pop();

    // Since we're returning the WSID that came from top->stage, we need to work(...) it again
    // to get a new result.
    _noResultToMerge.push(top->stage);

    // Save the ID that we're returning and remove the returned result from our data.
    WorkingSetID idToTest = top->id;
    _mergingData.erase(top);

    // Return the min.
    *out = idToTest;
    ++_commonStats.advanced;

    // But don't return it if it's flagged.
    if (_ws->isFlagged(*out)) {
        _ws->free(*out);
        return PlanStage::NEED_TIME;
    }

    return PlanStage::ADVANCED;
}
PlanStage::StageState AndHashStage::hashOtherChildren(WorkingSetID* out) { verify(_currentChild > 0); WorkingSetID id; StageState childStatus = _children[_currentChild]->work(&id); if (PlanStage::ADVANCED == childStatus) { WorkingSetMember* member = _ws->get(id); // Maybe the child had an invalidation. We intersect DiskLoc(s) so we can't do anything // with this WSM. if (!member->hasLoc()) { _ws->flagForReview(id); return PlanStage::NEED_TIME; } verify(member->hasLoc()); if (_dataMap.end() == _dataMap.find(member->loc)) { // Ignore. It's not in any previous child. } else { // We have a hit. Copy data into the WSM we already have. _seenMap.insert(member->loc); WorkingSetMember* olderMember = _ws->get(_dataMap[member->loc]); AndCommon::mergeFrom(olderMember, *member); } _ws->free(id); ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::IS_EOF == childStatus) { // Finished with a child. ++_currentChild; // Keep elements of _dataMap that are in _seenMap. DataMap::iterator it = _dataMap.begin(); while (it != _dataMap.end()) { if (_seenMap.end() == _seenMap.find(it->first)) { DataMap::iterator toErase = it; ++it; _ws->free(toErase->second); _dataMap.erase(toErase); } else { ++it; } } _specificStats.mapAfterChild.push_back(_dataMap.size()); _seenMap.clear(); // _dataMap is now the intersection of the first _currentChild nodes. // If we have nothing to AND with after finishing any child, stop. if (_dataMap.empty()) { _hashingChildren = false; return PlanStage::IS_EOF; } // We've finished scanning all children. Return results with the next call to work(). if (_currentChild == _children.size()) { _hashingChildren = false; } ++_commonStats.needTime; return PlanStage::NEED_TIME; } else { if (PlanStage::NEED_FETCH == childStatus) { *out = id; ++_commonStats.needFetch; } else if (PlanStage::NEED_TIME == childStatus) { ++_commonStats.needTime; } return childStatus; } }
Runner::RunnerState PlanExecutor::getNext(BSONObj* objOut, DiskLoc* dlOut) {
    if (_killed) {
        return Runner::RUNNER_DEAD;
    }

    for (;;) {
        // Yield, if we can yield ourselves.
        if (NULL != _yieldPolicy.get() && _yieldPolicy->shouldYield()) {
            saveState();
            _yieldPolicy->yield();
            if (_killed) {
                return Runner::RUNNER_DEAD;
            }
            restoreState();
        }

        WorkingSetID id = WorkingSet::INVALID_ID;
        PlanStage::StageState code = _root->work(&id);

        if (PlanStage::ADVANCED == code) {
            // Fast count.
            if (WorkingSet::INVALID_ID == id) {
                invariant(NULL == objOut);
                invariant(NULL == dlOut);
                return Runner::RUNNER_ADVANCED;
            }

            WorkingSetMember* member = _workingSet->get(id);
            bool hasRequestedData = true;

            if (NULL != objOut) {
                if (WorkingSetMember::LOC_AND_IDX == member->state) {
                    if (1 != member->keyData.size()) {
                        _workingSet->free(id);
                        hasRequestedData = false;
                    } else {
                        *objOut = member->keyData[0].keyData;
                    }
                } else if (member->hasObj()) {
                    *objOut = member->obj;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (NULL != dlOut) {
                if (member->hasLoc()) {
                    *dlOut = member->loc;
                } else {
                    _workingSet->free(id);
                    hasRequestedData = false;
                }
            }

            if (hasRequestedData) {
                _workingSet->free(id);
                return Runner::RUNNER_ADVANCED;
            }
            // This result didn't have the data the caller wanted, try again.
        } else if (PlanStage::NEED_TIME == code) {
            // Fall through to yield check at end of large conditional.
        } else if (PlanStage::NEED_FETCH == code) {
            // id has a loc and refers to an obj we need to fetch.
            WorkingSetMember* member = _workingSet->get(id);

            // This must be true for somebody to request a fetch and can only change when an
            // invalidation happens, which is when we give up a lock.  Don't give up the
            // lock between receiving the NEED_FETCH and actually fetching.
            verify(member->hasLoc());

            // XXX: remove NEED_FETCH
        } else if (PlanStage::IS_EOF == code) {
            return Runner::RUNNER_EOF;
        } else if (PlanStage::DEAD == code) {
            return Runner::RUNNER_DEAD;
        } else {
            verify(PlanStage::FAILURE == code);
            if (NULL != objOut) {
                WorkingSetCommon::getStatusMemberObject(*_workingSet, id, objOut);
            }
            return Runner::RUNNER_ERROR;
        }
    }
}
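// Illustrative caller loop for the getNext() contract above.  drainRunner() and processDoc()
// are hypothetical names, not part of the real API; the point is that a caller keeps calling
// getNext() until something other than RUNNER_ADVANCED comes back.
void processDoc(const BSONObj& obj, const DiskLoc& loc);  // hypothetical consumer

Runner::RunnerState drainRunner(PlanExecutor* exec) {
    BSONObj obj;
    DiskLoc loc;
    Runner::RunnerState state;
    while (Runner::RUNNER_ADVANCED == (state = exec->getNext(&obj, &loc))) {
        processDoc(obj, loc);  // consume one result at a time
    }
    // RUNNER_EOF means normal completion; RUNNER_DEAD means the plan was killed (e.g. the
    // collection was dropped during a yield); RUNNER_ERROR means a stage failed, and 'obj'
    // then holds the error object because objOut was provided.
    return state;
}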
// Set "toReturn" when NEED_FETCH.
PlanStage::StageState NearStage::bufferNext(WorkingSetID* toReturn, Status* error) {
    //
    // Try to retrieve the next covered member
    //

    if (!_nextInterval) {
        StatusWith<CoveredInterval*> intervalStatus =
            nextInterval(_txn, _workingSet, _collection);
        if (!intervalStatus.isOK()) {
            _searchState = SearchState_Finished;
            *error = intervalStatus.getStatus();
            return PlanStage::FAILURE;
        }

        if (NULL == intervalStatus.getValue()) {
            _searchState = SearchState_Finished;
            return PlanStage::IS_EOF;
        }

        // CoveredInterval and its child stage are owned by _childrenIntervals
        _childrenIntervals.push_back(intervalStatus.getValue());
        _nextInterval = _childrenIntervals.back();
        _nextIntervalStats.reset(new IntervalStats());
        _nextIntervalStats->minDistanceAllowed = _nextInterval->minDistance;
        _nextIntervalStats->maxDistanceAllowed = _nextInterval->maxDistance;
        _nextIntervalStats->inclusiveMaxDistanceAllowed = _nextInterval->inclusiveMax;
    }

    WorkingSetID nextMemberID;
    PlanStage::StageState intervalState = _nextInterval->covering->work(&nextMemberID);

    if (PlanStage::IS_EOF == intervalState) {
        _nextInterval = NULL;
        _nextIntervalSeen.clear();
        _searchState = SearchState_Advancing;
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == intervalState) {
        *error = WorkingSetCommon::getMemberStatus(*_workingSet->get(nextMemberID));
        return intervalState;
    } else if (PlanStage::NEED_FETCH == intervalState) {
        *toReturn = nextMemberID;
        return intervalState;
    } else if (PlanStage::ADVANCED != intervalState) {
        return intervalState;
    }

    //
    // Try to buffer the next covered member
    //

    WorkingSetMember* nextMember = _workingSet->get(nextMemberID);

    // The child stage may not dedup so we must dedup them ourselves.
    if (_nextInterval->dedupCovering && nextMember->hasLoc()) {
        if (_nextIntervalSeen.end() != _nextIntervalSeen.find(nextMember->loc))
            return PlanStage::NEED_TIME;
    }

    ++_nextIntervalStats->numResultsFound;

    StatusWith<double> distanceStatus = computeDistance(nextMember);

    // Store the member's RecordId, if available, for quick invalidation
    if (nextMember->hasLoc()) {
        _nextIntervalSeen.insert(make_pair(nextMember->loc, nextMemberID));
    }

    if (!distanceStatus.isOK()) {
        _searchState = SearchState_Finished;
        *error = distanceStatus.getStatus();
        return PlanStage::FAILURE;
    }

    // If the member's distance is in the current distance interval, add it to our buffered
    // results.
    double memberDistance = distanceStatus.getValue();
    bool inInterval = memberDistance >= _nextInterval->minDistance &&
        (_nextInterval->inclusiveMax ? memberDistance <= _nextInterval->maxDistance
                                     : memberDistance < _nextInterval->maxDistance);

    // Update found distance stats
    if (_nextIntervalStats->minDistanceFound < 0 ||
        memberDistance < _nextIntervalStats->minDistanceFound) {
        _nextIntervalStats->minDistanceFound = memberDistance;
    }

    if (_nextIntervalStats->maxDistanceFound < 0 ||
        memberDistance > _nextIntervalStats->maxDistanceFound) {
        _nextIntervalStats->maxDistanceFound = memberDistance;
    }

    if (inInterval) {
        _resultBuffer.push(SearchResult(nextMemberID, memberDistance));
        ++_nextIntervalStats->numResultsBuffered;

        // Update buffered distance stats
        if (_nextIntervalStats->minDistanceBuffered < 0 ||
            memberDistance < _nextIntervalStats->minDistanceBuffered) {
            _nextIntervalStats->minDistanceBuffered = memberDistance;
        }

        if (_nextIntervalStats->maxDistanceBuffered < 0 ||
            memberDistance > _nextIntervalStats->maxDistanceBuffered) {
            _nextIntervalStats->maxDistanceBuffered = memberDistance;
        }
    } else {
        // We won't pass this WSM up, so deallocate it
        _workingSet->free(nextMemberID);
    }

    return PlanStage::NEED_TIME;
}
PlanStage::StageState DeleteStage::work(WorkingSetID* out) { ++_commonStats.works; // Adds the amount of time taken by work() to executionTimeMillis. ScopedTimer timer(&_commonStats.executionTimeMillis); if (isEOF()) { return PlanStage::IS_EOF; } invariant(_collection); // If isEOF() returns false, we must have a collection. WorkingSetID id = WorkingSet::INVALID_ID; StageState status = _child->work(&id); if (PlanStage::ADVANCED == status) { WorkingSetMember* member = _ws->get(id); if (!member->hasLoc()) { _ws->free(id); const std::string errmsg = "delete stage failed to read member w/ loc from child"; *out = WorkingSetCommon::allocateStatusMember(_ws, Status(ErrorCodes::InternalError, errmsg)); return PlanStage::FAILURE; } DiskLoc rloc = member->loc; _ws->free(id); BSONObj deletedDoc; WriteUnitOfWork wunit(_txn); // TODO: Do we want to buffer docs and delete them in a group rather than // saving/restoring state repeatedly? saveState(); const bool deleteCappedOK = false; const bool deleteNoWarn = false; _collection->deleteDocument(_txn, rloc, deleteCappedOK, deleteNoWarn, _params.shouldCallLogOp ? &deletedDoc : NULL); restoreState(_txn); ++_specificStats.docsDeleted; if (_params.shouldCallLogOp) { if (deletedDoc.isEmpty()) { log() << "Deleted object without id in collection " << _collection->ns() << ", not logging."; } else { bool replJustOne = true; repl::logOp(_txn, "d", _collection->ns().ns().c_str(), deletedDoc, 0, &replJustOne, _params.fromMigrate); } } wunit.commit(); _txn->recoveryUnit()->commitIfNeeded(); ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == status) { *out = id; // If a stage fails, it may create a status WSM to indicate why it failed, in which case // 'id' is valid. If ID is invalid, we create our own error message. if (WorkingSet::INVALID_ID == id) { const std::string errmsg = "delete stage failed to read in results from child"; *out = WorkingSetCommon::allocateStatusMember(_ws, Status(ErrorCodes::InternalError, errmsg)); return PlanStage::FAILURE; } return status; } else { if (PlanStage::NEED_TIME == status) { ++_commonStats.needTime; } return status; } }
PlanExecutor::ExecState PlanExecutor::getNext(BSONObj* objOut, DiskLoc* dlOut) { if (_killed) { return PlanExecutor::DEAD; } for (;;) { WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState code = _root->work(&id); if (PlanStage::ADVANCED == code) { // Fast count. if (WorkingSet::INVALID_ID == id) { invariant(NULL == objOut); invariant(NULL == dlOut); return PlanExecutor::ADVANCED; } WorkingSetMember* member = _workingSet->get(id); bool hasRequestedData = true; if (NULL != objOut) { if (WorkingSetMember::LOC_AND_IDX == member->state) { if (1 != member->keyData.size()) { _workingSet->free(id); hasRequestedData = false; } else { *objOut = member->keyData[0].keyData; } } else if (member->hasObj()) { *objOut = member->obj; } else { _workingSet->free(id); hasRequestedData = false; } } if (NULL != dlOut) { if (member->hasLoc()) { *dlOut = member->loc; } else { _workingSet->free(id); hasRequestedData = false; } } if (hasRequestedData) { _workingSet->free(id); return PlanExecutor::ADVANCED; } // This result didn't have the data the caller wanted, try again. } else if (PlanStage::NEED_TIME == code) { // Fall through to yield check at end of large conditional. } else if (PlanStage::IS_EOF == code) { return PlanExecutor::IS_EOF; } else if (PlanStage::DEAD == code) { return PlanExecutor::DEAD; } else { verify(PlanStage::FAILURE == code); if (NULL != objOut) { WorkingSetCommon::getStatusMemberObject(*_workingSet, id, objOut); } return PlanExecutor::EXEC_ERROR; } } }
PlanStage::StageState AndHashStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Fast-path for one of our children being EOF immediately.  We work each child a few
    // times.  If it hits EOF, the AND cannot output anything.  If it produces a result, we
    // stash that result in _lookAheadResults.
    if (_lookAheadResults.empty()) {
        // INVALID_ID means that the child didn't produce a valid result.

        // We specifically are not using .resize(size, value) here because C++11 builds don't
        // seem to resolve WorkingSet::INVALID_ID during linking.
        _lookAheadResults.resize(_children.size());
        for (size_t i = 0; i < _children.size(); ++i) {
            _lookAheadResults[i] = WorkingSet::INVALID_ID;
        }

        // Work each child some number of times until it's either EOF or produces
        // a result.  If it's EOF this whole stage will be EOF.  If it produces a
        // result we cache it for later.
        for (size_t i = 0; i < _children.size(); ++i) {
            PlanStage* child = _children[i];
            for (size_t j = 0; j < kLookAheadWorks; ++j) {
                StageState childStatus = child->work(&_lookAheadResults[i]);

                if (PlanStage::IS_EOF == childStatus || PlanStage::DEAD == childStatus) {
                    // A child went right to EOF.  Bail out.
                    _hashingChildren = false;
                    _dataMap.clear();
                    return PlanStage::IS_EOF;
                } else if (PlanStage::ADVANCED == childStatus) {
                    // We have a result cached in _lookAheadResults[i].  Stop looking at this
                    // child.
                    break;
                } else if (PlanStage::FAILURE == childStatus) {
                    // Propagate error to parent.
                    *out = _lookAheadResults[i];
                    // If a stage fails, it may create a status WSM to indicate why it
                    // failed, in which case 'id' is valid.  If ID is invalid, we
                    // create our own error message.
                    if (WorkingSet::INVALID_ID == *out) {
                        mongoutils::str::stream ss;
                        ss << "hashed AND stage failed to read in look ahead results "
                           << "from child " << i;
                        Status status(ErrorCodes::InternalError, ss);
                        *out = WorkingSetCommon::allocateStatusMember(_ws, status);
                    }

                    _hashingChildren = false;
                    _dataMap.clear();
                    return PlanStage::FAILURE;
                }
                // We ignore NEED_TIME.  TODO: what do we want to do if we get NEED_YIELD here?
            }
        }

        // We did a bunch of work above, return NEED_TIME to be fair.
        return PlanStage::NEED_TIME;
    }

    // An AND is either reading the first child into the hash table, probing against the hash
    // table with subsequent children, or checking the last child's results to see if they're
    // in the hash table.

    // We read the first child into our hash table.
    if (_hashingChildren) {
        // Check memory usage of previously hashed results.
        if (_memUsage > _maxMemUsage) {
            mongoutils::str::stream ss;
            ss << "hashed AND stage buffered data usage of " << _memUsage
               << " bytes exceeds internal limit of " << kDefaultMaxMemUsageBytes << " bytes";
            Status status(ErrorCodes::Overflow, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            return PlanStage::FAILURE;
        }

        if (0 == _currentChild) {
            return readFirstChild(out);
        } else if (_currentChild < _children.size() - 1) {
            return hashOtherChildren(out);
        } else {
            _hashingChildren = false;
            // We don't hash our last child.  Instead, we probe the table created from the
            // previous children, returning results in the order of the last child.
            // Fall through to below.
        }
    }

    // Returning results.  We read from the last child and return the results that are in our
    // hash map.

    // We should be EOF if we're not hashing results and the dataMap is empty.
    verify(!_dataMap.empty());

    // We probe _dataMap with the last child.
    verify(_currentChild == _children.size() - 1);

    // Get the next result for the (_children.size() - 1)-th child.
    StageState childStatus = workChild(_children.size() - 1, out);
    if (PlanStage::ADVANCED != childStatus) {
        return childStatus;
    }

    // We know that we've ADVANCED.  See if the WSM is in our table.
    WorkingSetMember* member = _ws->get(*out);

    // Maybe the child had an invalidation.  We intersect RecordId(s) so we can't do anything
    // with this WSM.
    if (!member->hasLoc()) {
        _ws->flagForReview(*out);
        return PlanStage::NEED_TIME;
    }

    DataMap::iterator it = _dataMap.find(member->loc);
    if (_dataMap.end() == it) {
        // Child's output wasn't in every previous child.  Throw it out.
        _ws->free(*out);
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else {
        // Child's output was in every previous child.  Merge any key data in
        // the child's output and free the child's just-outputted WSM.
        WorkingSetID hashID = it->second;
        _dataMap.erase(it);

        WorkingSetMember* olderMember = _ws->get(hashID);
        AndCommon::mergeFrom(olderMember, *member);
        _ws->free(*out);

        // We should check for matching at the end so the matcher can use information in the
        // indices of all our children.
        if (Filter::passes(olderMember, _filter)) {
            *out = hashID;
            ++_commonStats.advanced;
            return PlanStage::ADVANCED;
        } else {
            _ws->free(hashID);
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
    }
}
PlanStage::StageState FetchStage::doWork(WorkingSetID* out) { if (isEOF()) { return PlanStage::IS_EOF; } // Either retry the last WSM we worked on or get a new one from our child. WorkingSetID id; StageState status; if (_idRetrying == WorkingSet::INVALID_ID) { status = child()->work(&id); } else { status = ADVANCED; id = _idRetrying; _idRetrying = WorkingSet::INVALID_ID; } if (PlanStage::ADVANCED == status) { WorkingSetMember* member = _ws->get(id); // If there's an obj there, there is no fetching to perform. if (member->hasObj()) { ++_specificStats.alreadyHasObj; } else { // We need a valid loc to fetch from and this is the only state that has one. verify(WorkingSetMember::LOC_AND_IDX == member->getState()); verify(member->hasLoc()); try { if (!_cursor) _cursor = _collection->getCursor(getOpCtx()); if (auto fetcher = _cursor->fetcherForId(member->loc)) { // There's something to fetch. Hand the fetcher off to the WSM, and pass up // a fetch request. _idRetrying = id; member->setFetcher(fetcher.release()); *out = id; return NEED_YIELD; } // The doc is already in memory, so go ahead and grab it. Now we have a RecordId // as well as an unowned object if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, _cursor)) { _ws->free(id); return NEED_TIME; } } catch (const WriteConflictException& wce) { // Ensure that the BSONObj underlying the WorkingSetMember is owned because it may // be freed when we yield. member->makeObjOwnedIfNeeded(); _idRetrying = id; *out = WorkingSet::INVALID_ID; return NEED_YIELD; } } return returnIfMatches(member, id, out); } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) { *out = id; // If a stage fails, it may create a status WSM to indicate why it // failed, in which case 'id' is valid. If ID is invalid, we // create our own error message. if (WorkingSet::INVALID_ID == id) { mongoutils::str::stream ss; ss << "fetch stage failed to read in results from child"; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember(_ws, status); } return status; } else if (PlanStage::NEED_YIELD == status) { *out = id; } return status; }
PlanStage::StageState AndHashStage::readFirstChild(WorkingSetID* out) {
    verify(_currentChild == 0);

    WorkingSetID id = WorkingSet::INVALID_ID;
    StageState childStatus = workChild(0, &id);

    if (PlanStage::ADVANCED == childStatus) {
        WorkingSetMember* member = _ws->get(id);

        // Maybe the child had an invalidation.  We intersect RecordId(s) so we can't do
        // anything with this WSM.
        if (!member->hasLoc()) {
            _ws->flagForReview(id);
            return PlanStage::NEED_TIME;
        }

        if (!_dataMap.insert(std::make_pair(member->loc, id)).second) {
            // Didn't insert because we already had this loc inside the map.  This should only
            // happen if we're seeing a newer copy of the same doc in a more recent snapshot.
            // Throw out the newer copy of the doc.
            _ws->free(id);
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }

        // Update memory stats.
        _memUsage += member->getMemUsage();

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else if (PlanStage::IS_EOF == childStatus) {
        // Done reading child 0.
        _currentChild = 1;

        // If our first child was empty, don't scan any others, no possible results.
        if (_dataMap.empty()) {
            _hashingChildren = false;
            return PlanStage::IS_EOF;
        }

        ++_commonStats.needTime;
        _specificStats.mapAfterChild.push_back(_dataMap.size());
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == childStatus) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which
        // case 'id' is valid.  If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            mongoutils::str::stream ss;
            ss << "hashed AND stage failed to read in results from first child";
            Status status(ErrorCodes::InternalError, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
        }
        return childStatus;
    } else {
        if (PlanStage::NEED_TIME == childStatus) {
            ++_commonStats.needTime;
        } else if (PlanStage::NEED_YIELD == childStatus) {
            ++_commonStats.needYield;
            *out = id;
        }
        return childStatus;
    }
}
PlanStage::StageState DeleteStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }
    invariant(_collection);  // If isEOF() returns false, we must have a collection.

    // It is possible that after a delete was executed, a WriteConflictException occurred
    // and prevented us from returning ADVANCED with the old version of the document.
    if (_idReturning != WorkingSet::INVALID_ID) {
        // We should only get here if we were trying to return something before.
        invariant(_params.returnDeleted);

        WorkingSetMember* member = _ws->get(_idReturning);
        invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

        *out = _idReturning;
        _idReturning = WorkingSet::INVALID_ID;
        ++_commonStats.advanced;
        return PlanStage::ADVANCED;
    }

    // Either retry the last WSM we worked on or get a new one from our child.
    WorkingSetID id;
    StageState status;
    if (_idRetrying == WorkingSet::INVALID_ID) {
        status = child()->work(&id);
    } else {
        status = ADVANCED;
        id = _idRetrying;
        _idRetrying = WorkingSet::INVALID_ID;
    }

    if (PlanStage::ADVANCED == status) {
        WorkingSetMember* member = _ws->get(id);

        // We want to free this member when we return, unless we need to retry it.
        ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id);

        if (!member->hasLoc()) {
            // We expect to be here because of an invalidation causing a force-fetch, and
            // doc-locking storage engines do not issue invalidations.
            ++_specificStats.nInvalidateSkips;
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
        RecordId rloc = member->loc;

        // Deletes can't have projections.  This means that covering analysis will always add
        // a fetch.  We should always get fetched data, and never just key data.
        invariant(member->hasObj());

        try {
            // If the snapshot changed, then we have to make sure we have the latest copy of
            // the doc and that it still matches.
            std::unique_ptr<SeekableRecordCursor> cursor;
            if (getOpCtx()->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) {
                cursor = _collection->getCursor(getOpCtx());
                if (!WorkingSetCommon::fetch(getOpCtx(), _ws, id, cursor)) {
                    // Doc is already deleted.  Nothing more to do.
                    ++_commonStats.needTime;
                    return PlanStage::NEED_TIME;
                }

                // Make sure the re-fetched doc still matches the predicate.
                if (_params.canonicalQuery &&
                    !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) {
                    // Doesn't match.
                    ++_commonStats.needTime;
                    return PlanStage::NEED_TIME;
                }
            }

            // Ensure that the BSONObj underlying the WorkingSetMember is owned because
            // saveState() is allowed to free the memory.
            if (_params.returnDeleted) {
                member->makeObjOwnedIfNeeded();
            }

            // TODO: Do we want to buffer docs and delete them in a group rather than
            // saving/restoring state repeatedly?
            try {
                if (supportsDocLocking()) {
                    // Doc-locking engines require this before saveState() since they don't
                    // use invalidations.
                    WorkingSetCommon::prepareForSnapshotChange(_ws);
                }
                child()->saveState();
            } catch (const WriteConflictException& wce) {
                std::terminate();
            }

            if (_params.returnDeleted) {
                // Save a copy of the document that is about to get deleted, but keep it in
                // the LOC_AND_OBJ state in case we need to retry deleting it.
                BSONObj deletedDoc = member->obj.value();
                member->obj.setValue(deletedDoc.getOwned());
            }

            // Do the write, unless this is an explain.
            if (!_params.isExplain) {
                WriteUnitOfWork wunit(getOpCtx());
                _collection->deleteDocument(getOpCtx(), rloc);
                wunit.commit();
            }

            ++_specificStats.docsDeleted;
        } catch (const WriteConflictException& wce) {
            // Ensure that the BSONObj underlying the WorkingSetMember is owned because it
            // may be freed when we yield.
            member->makeObjOwnedIfNeeded();
            _idRetrying = id;
            memberFreer.Dismiss();  // Keep this member around so we can retry deleting it.
            *out = WorkingSet::INVALID_ID;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        if (_params.returnDeleted) {
            // After deleting the document, the RecordId associated with this member is
            // invalid.  Remove the 'loc' from the WorkingSetMember before returning it.
            member->loc = RecordId();
            member->transitionToOwnedObj();
        }

        // restoreState() may recreate cursors.  Since cursors are tied to the transaction in
        // which they are created, and a WriteUnitOfWork is a transaction, make sure to
        // restore the state outside of the WriteUnitOfWork.
        try {
            child()->restoreState();
        } catch (const WriteConflictException& wce) {
            // Note we don't need to retry anything in this case since the delete already
            // was committed.  However, we still need to return the deleted document
            // (if it was requested).
            if (_params.returnDeleted) {
                // member->obj should refer to the deleted document.
                invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

                _idReturning = id;
                // Keep this member around so that we can return it on the next work() call.
                memberFreer.Dismiss();
            }
            *out = WorkingSet::INVALID_ID;
            _commonStats.needYield++;
            return NEED_YIELD;
        }

        if (_params.returnDeleted) {
            // member->obj should refer to the deleted document.
            invariant(member->getState() == WorkingSetMember::OWNED_OBJ);

            memberFreer.Dismiss();  // Keep this member around so we can return it.
            *out = id;
            ++_commonStats.advanced;
            return PlanStage::ADVANCED;
        }

        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else if (PlanStage::FAILURE == status || PlanStage::DEAD == status) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it failed, in which
        // case 'id' is valid.  If ID is invalid, we create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            const std::string errmsg = "delete stage failed to read in results from child";
            *out = WorkingSetCommon::allocateStatusMember(
                _ws, Status(ErrorCodes::InternalError, errmsg));
        }
        return status;
    } else if (PlanStage::NEED_TIME == status) {
        ++_commonStats.needTime;
    } else if (PlanStage::NEED_YIELD == status) {
        *out = id;
        ++_commonStats.needYield;
    }

    return status;
}
PlanStage::StageState AndHashStage::hashOtherChildren(WorkingSetID* out) { verify(_currentChild > 0); WorkingSetID id = WorkingSet::INVALID_ID; StageState childStatus = workChild(_currentChild, &id); if (PlanStage::ADVANCED == childStatus) { WorkingSetMember* member = _ws->get(id); // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything // with this WSM. if (!member->hasLoc()) { _ws->flagForReview(id); return PlanStage::NEED_TIME; } verify(member->hasLoc()); if (_dataMap.end() == _dataMap.find(member->loc)) { // Ignore. It's not in any previous child. } else { // We have a hit. Copy data into the WSM we already have. _seenMap.insert(member->loc); WorkingSetMember* olderMember = _ws->get(_dataMap[member->loc]); size_t memUsageBefore = olderMember->getMemUsage(); AndCommon::mergeFrom(olderMember, *member); // Update memory stats. _memUsage += olderMember->getMemUsage() - memUsageBefore; } _ws->free(id); ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::IS_EOF == childStatus) { // Finished with a child. ++_currentChild; // Keep elements of _dataMap that are in _seenMap. DataMap::iterator it = _dataMap.begin(); while (it != _dataMap.end()) { if (_seenMap.end() == _seenMap.find(it->first)) { DataMap::iterator toErase = it; ++it; // Update memory stats. WorkingSetMember* member = _ws->get(toErase->second); _memUsage -= member->getMemUsage(); _ws->free(toErase->second); _dataMap.erase(toErase); } else { ++it; } } _specificStats.mapAfterChild.push_back(_dataMap.size()); _seenMap.clear(); // _dataMap is now the intersection of the first _currentChild nodes. // If we have nothing to AND with after finishing any child, stop. if (_dataMap.empty()) { _hashingChildren = false; return PlanStage::IS_EOF; } // We've finished scanning all children. Return results with the next call to work(). if (_currentChild == _children.size()) { _hashingChildren = false; } ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == childStatus) { *out = id; // If a stage fails, it may create a status WSM to indicate why it // failed, in which case 'id' is valid. If ID is invalid, we // create our own error message. if (WorkingSet::INVALID_ID == id) { mongoutils::str::stream ss; ss << "hashed AND stage failed to read in results from other child " << _currentChild; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember( _ws, status); } return childStatus; } else { if (PlanStage::NEED_TIME == childStatus) { ++_commonStats.needTime; } else if (PlanStage::NEED_YIELD == childStatus) { ++_commonStats.needYield; *out = id; } return childStatus; } }
PlanStage::StageState OrStage::work(WorkingSetID* out) { ++_commonStats.works; if (isEOF()) { return PlanStage::IS_EOF; } if (0 == _specificStats.matchTested.size()) { _specificStats.matchTested = vector<uint64_t>(_children.size(), 0); } WorkingSetID id; StageState childStatus = _children[_currentChild]->work(&id); if (PlanStage::ADVANCED == childStatus) { WorkingSetMember* member = _ws->get(id); verify(member->hasLoc()); // If we're deduping... if (_dedup) { ++_specificStats.dupsTested; // ...and we've seen the DiskLoc before if (_seen.end() != _seen.find(member->loc)) { // ...drop it. ++_specificStats.dupsDropped; _ws->free(id); ++_commonStats.needTime; return PlanStage::NEED_TIME; } else { // Otherwise, note that we've seen it. _seen.insert(member->loc); } } if (NULL == _matcher || _matcher->matches(member)) { if (NULL != _matcher) { ++_specificStats.matchTested[_currentChild]; } // Match! return it. *out = id; ++_commonStats.advanced; return PlanStage::ADVANCED; } else { // Does not match, try again. _ws->free(id); ++_commonStats.needTime; return PlanStage::NEED_TIME; } } else if (PlanStage::IS_EOF == childStatus) { // Done with _currentChild, move to the next one. ++_currentChild; // Maybe we're out of children. if (isEOF()) { return PlanStage::IS_EOF; } else { ++_commonStats.needTime; return PlanStage::NEED_TIME; } } else { if (PlanStage::NEED_FETCH == childStatus) { ++_commonStats.needFetch; } else if (PlanStage::NEED_TIME == childStatus) { ++_commonStats.needTime; } // NEED_TIME, ERROR, NEED_YIELD, pass them up. return childStatus; } }
PlanStage::StageState DeleteStage::work(WorkingSetID* out) { ++_commonStats.works; // Adds the amount of time taken by work() to executionTimeMillis. ScopedTimer timer(&_commonStats.executionTimeMillis); if (isEOF()) { return PlanStage::IS_EOF; } invariant(_collection); // If isEOF() returns false, we must have a collection. // Either retry the last WSM we worked on or get a new one from our child. WorkingSetID id; StageState status; if (_idRetrying == WorkingSet::INVALID_ID) { status = _child->work(&id); } else { status = ADVANCED; id = _idRetrying; _idRetrying = WorkingSet::INVALID_ID; } if (PlanStage::ADVANCED == status) { WorkingSetMember* member = _ws->get(id); // We want to free this member when we return, unless we need to retry it. ScopeGuard memberFreer = MakeGuard(&WorkingSet::free, _ws, id); if (!member->hasLoc()) { // We expect to be here because of an invalidation causing a force-fetch, and // doc-locking storage engines do not issue invalidations. dassert(!supportsDocLocking()); ++_specificStats.nInvalidateSkips; ++_commonStats.needTime; return PlanStage::NEED_TIME; } RecordId rloc = member->loc; try { // If the snapshot changed, then we have to make sure we have the latest copy of the // doc and that it still matches. if (_txn->recoveryUnit()->getSnapshotId() != member->obj.snapshotId()) { if (!WorkingSetCommon::fetch(_txn, member, _collection)) { // Doc is already deleted. Nothing more to do. ++_commonStats.needTime; return PlanStage::NEED_TIME; } // Make sure the re-fetched doc still matches the predicate. if (_params.canonicalQuery && !_params.canonicalQuery->root()->matchesBSON(member->obj.value(), NULL)) { // Doesn't match. ++_commonStats.needTime; return PlanStage::NEED_TIME; } } // TODO: Do we want to buffer docs and delete them in a group rather than // saving/restoring state repeatedly? try { _child->saveState(); if (supportsDocLocking()) { // Doc-locking engines require this after saveState() since they don't use // invalidations. WorkingSetCommon::prepareForSnapshotChange(_ws); } } catch ( const WriteConflictException& wce ) { std::terminate(); } // Do the write, unless this is an explain. if (!_params.isExplain) { WriteUnitOfWork wunit(_txn); const bool deleteCappedOK = false; const bool deleteNoWarn = false; BSONObj deletedDoc; _collection->deleteDocument(_txn, rloc, deleteCappedOK, deleteNoWarn, _params.shouldCallLogOp ? &deletedDoc : NULL); if (_params.shouldCallLogOp) { if (deletedDoc.isEmpty()) { log() << "Deleted object without id in collection " << _collection->ns() << ", not logging."; } else { getGlobalServiceContext()->getOpObserver()->onDelete( _txn, _collection->ns().ns(), deletedDoc, _params.fromMigrate); } } wunit.commit(); } ++_specificStats.docsDeleted; } catch ( const WriteConflictException& wce ) { _idRetrying = id; memberFreer.Dismiss(); // Keep this member around so we can retry deleting it. *out = WorkingSet::INVALID_ID; _commonStats.needYield++; return NEED_YIELD; } // As restoreState may restore (recreate) cursors, cursors are tied to the // transaction in which they are created, and a WriteUnitOfWork is a // transaction, make sure to restore the state outside of the WritUnitOfWork. try { _child->restoreState(_txn); } catch ( const WriteConflictException& wce ) { // Note we don't need to retry anything in this case since the delete already // was committed. 
*out = WorkingSet::INVALID_ID; _commonStats.needYield++; return NEED_YIELD; } ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == status) { *out = id; // If a stage fails, it may create a status WSM to indicate why it failed, in which case // 'id' is valid. If ID is invalid, we create our own error message. if (WorkingSet::INVALID_ID == id) { const std::string errmsg = "delete stage failed to read in results from child"; *out = WorkingSetCommon::allocateStatusMember(_ws, Status(ErrorCodes::InternalError, errmsg)); return PlanStage::FAILURE; } return status; } else if (PlanStage::NEED_TIME == status) { ++_commonStats.needTime; } else if (PlanStage::NEED_YIELD == status) { *out = id; ++_commonStats.needYield; } return status; }
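DeleteStage::work above leans on a scope guard (MakeGuard / Dismiss) so the WorkingSetMember is freed on every early return unless a write conflict forces a retry. Below is a minimal standalone sketch of that idiom; the ScopeExit class, FakeWorkingSet, and tryDelete are hypothetical stand-ins for illustration, not MongoDB's ScopeGuard API.

#include <functional>
#include <iostream>
#include <utility>

// Minimal RAII guard: runs a callback at scope exit unless dismissed.
class ScopeExit {
public:
    explicit ScopeExit(std::function<void()> fn) : _fn(std::move(fn)), _active(true) {}
    ~ScopeExit() { if (_active) _fn(); }
    void dismiss() { _active = false; }
private:
    std::function<void()> _fn;
    bool _active;
};

// Hypothetical stand-ins for the working set and a retryable write.
struct FakeWorkingSet {
    void free(int id) { std::cout << "freed member " << id << "\n"; }
};

// Pretend write: returns false when a write conflict occurs.
bool tryDelete(bool conflict) { return !conflict; }

int work(FakeWorkingSet& ws, int id, bool conflict) {
    ScopeExit memberFreer([&] { ws.free(id); });  // free on every exit path by default
    if (!tryDelete(conflict)) {
        memberFreer.dismiss();  // keep the member around so we can retry deleting it
        return id;              // caller stashes this id, like _idRetrying above
    }
    return -1;                  // the guard frees the member as we return
}

int main() {
    FakeWorkingSet ws;
    work(ws, 7, /*conflict=*/false);              // prints "freed member 7"
    int retry = work(ws, 8, /*conflict=*/true);   // member 8 survives for a retry
    std::cout << "retrying id " << retry << "\n";
}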
PlanStage::StageState SortStage::work(WorkingSetID* out) { ++_commonStats.works; // Adds the amount of time taken by work() to executionTimeMillis. ScopedTimer timer(&_commonStats.executionTimeMillis); if (NULL == _sortKeyGen) { // This is heavy and should be done as part of work(). _sortKeyGen.reset(new SortStageKeyGenerator(_collection, _pattern, _query)); _sortKeyComparator.reset(new WorkingSetComparator(_sortKeyGen->getSortComparator())); // If limit > 1, we need to initialize _dataSet here to maintain an ordered // set of data items while fetching from the child stage. if (_limit > 1) { const WorkingSetComparator& cmp = *_sortKeyComparator; _dataSet.reset(new SortableDataItemSet(cmp)); } return PlanStage::NEED_TIME; } const size_t maxBytes = static_cast<size_t>(internalQueryExecMaxBlockingSortBytes); if (_memUsage > maxBytes) { mongoutils::str::stream ss; ss << "Sort operation used more than the maximum " << maxBytes << " bytes of RAM. Add an index, or specify a smaller limit."; Status status(ErrorCodes::OperationFailed, ss); *out = WorkingSetCommon::allocateStatusMember(_ws, status); return PlanStage::FAILURE; } if (isEOF()) { return PlanStage::IS_EOF; } // Still reading in results to sort. if (!_sorted) { WorkingSetID id = WorkingSet::INVALID_ID; StageState code = _child->work(&id); if (PlanStage::ADVANCED == code) { // Add it into the map for quick invalidation if it has a valid RecordId. // A RecordId may be invalidated at any time (during a yield). We need to get into // the WorkingSet as quickly as possible to handle it. WorkingSetMember* member = _ws->get(id); // Planner must put a fetch before we get here. verify(member->hasObj()); // We might be sorting something that was invalidated at some point. if (member->hasLoc()) { _wsidByDiskLoc[member->loc] = id; } // The data remains in the WorkingSet and we wrap the WSID with the sort key. SortableDataItem item; Status sortKeyStatus = _sortKeyGen->getSortKey(*member, &item.sortKey); if (!sortKeyStatus.isOK()) { *out = WorkingSetCommon::allocateStatusMember(_ws, sortKeyStatus); return PlanStage::FAILURE; } item.wsid = id; if (member->hasLoc()) { // The RecordId breaks ties when sorting two WSMs with the same sort key. item.loc = member->loc; } addToBuffer(item); ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::IS_EOF == code) { // TODO: We don't need the lock for this. We could ask for a yield and do this work // unlocked. Also, this is performing a lot of work for one call to work(...) sortBuffer(); _resultIterator = _data.begin(); _sorted = true; ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == code || PlanStage::DEAD == code) { *out = id; // If a stage fails, it may create a status WSM to indicate why it // failed, in which case 'id' is valid. If ID is invalid, we // create our own error message. if (WorkingSet::INVALID_ID == id) { mongoutils::str::stream ss; ss << "sort stage failed to read in results to sort from child"; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember(_ws, status); } return code; } else if (PlanStage::NEED_TIME == code) { ++_commonStats.needTime; } else if (PlanStage::NEED_YIELD == code) { ++_commonStats.needYield; *out = id; } return code; } // Returning results. 
verify(_resultIterator != _data.end()); verify(_sorted); *out = _resultIterator->wsid; _resultIterator++; // If we're returning something, take it out of our DL -> WSID map so that future // calls to invalidate don't cause us to take action for a DL we're done with. WorkingSetMember* member = _ws->get(*out); if (member->hasLoc()) { _wsidByDiskLoc.erase(member->loc); } ++_commonStats.advanced; return PlanStage::ADVANCED; }
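The sort stage keys each buffered item by its sort key and falls back to the RecordId to break ties, and when a limit is given it keeps only the best items in an ordered set (_dataSet). A minimal sketch of both ideas, using an illustrative Item struct and comparator rather than the real SortableDataItem types:

#include <cstdint>
#include <iostream>
#include <iterator>
#include <set>
#include <vector>

// Hypothetical item: a sort key plus a record id used only to break ties.
struct Item {
    int key;
    int64_t recordId;
};

struct ByKeyThenLoc {
    bool operator()(const Item& a, const Item& b) const {
        if (a.key != b.key) return a.key < b.key;
        return a.recordId < b.recordId;  // deterministic order for equal keys
    }
};

// Keep only the smallest 'limit' items while streaming input, the same idea
// as maintaining _dataSet when limit > 1.
void addWithLimit(std::multiset<Item, ByKeyThenLoc>& topK, const Item& item, size_t limit) {
    topK.insert(item);
    if (topK.size() > limit) {
        topK.erase(std::prev(topK.end()));  // drop the current worst item
    }
}

int main() {
    std::multiset<Item, ByKeyThenLoc> topK;
    std::vector<Item> input = {{5, 11}, {1, 12}, {3, 13}, {1, 9}, {4, 14}};
    for (const Item& item : input) addWithLimit(topK, item, 3);
    for (const Item& item : topK) std::cout << item.key << " (rid " << item.recordId << ")\n";
    // Prints the three smallest keys: 1 (rid 9), 1 (rid 12), 3 (rid 13).
}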
PlanStage::StageState TwoDNear::work(WorkingSetID* out) { ++_commonStats.works; // Adds the amount of time taken by work() to executionTimeMillis. ScopedTimer timer(&_commonStats.executionTimeMillis); if (!_initted) { _initted = true; if ( !_params.collection ) return PlanStage::IS_EOF; IndexCatalog* indexCatalog = _params.collection->getIndexCatalog(); IndexDescriptor* desc = indexCatalog->findIndexByKeyPattern(_params.indexKeyPattern); if ( desc == NULL ) return PlanStage::IS_EOF; TwoDAccessMethod* am = static_cast<TwoDAccessMethod*>( indexCatalog->getIndex( desc ) ); auto_ptr<twod_exec::GeoSearch> search; search.reset(new twod_exec::GeoSearch(_params.collection, am, _params.nearQuery.centroid.oldPoint, _params.numWanted, _params.filter, _params.nearQuery.maxDistance, _params.nearQuery.isNearSphere ? twod_exec::GEO_SPHERE : twod_exec::GEO_PLANE)); // This is where all the work is done. :( search->exec(); _specificStats.objectsLoaded = search->_objectsLoaded; _specificStats.nscanned = search->_lookedAt; for (twod_exec::GeoHopper::Holder::iterator it = search->_points.begin(); it != search->_points.end(); it++) { WorkingSetID id = _workingSet->allocate(); WorkingSetMember* member = _workingSet->get(id); member->loc = it->_loc; member->obj = _params.collection->docFor(member->loc); member->state = WorkingSetMember::LOC_AND_UNOWNED_OBJ; if (_params.addDistMeta) { member->addComputed(new GeoDistanceComputedData(it->_distance)); } if (_params.addPointMeta) { member->addComputed(new GeoNearPointComputedData(it->_pt)); } _results.push(Result(id, it->_distance)); _invalidationMap.insert(pair<DiskLoc, WorkingSetID>(it->_loc, id)); } } if (isEOF()) { return PlanStage::IS_EOF; } Result result = _results.top(); _results.pop(); *out = result.id; // Remove from invalidation map. WorkingSetMember* member = _workingSet->get(*out); // The WSM may have been mutated or deleted so it may not have a loc. if (member->hasLoc()) { typedef multimap<DiskLoc, WorkingSetID>::iterator MMIT; pair<MMIT, MMIT> range = _invalidationMap.equal_range(member->loc); for (MMIT it = range.first; it != range.second; ++it) { if (it->second == *out) { _invalidationMap.erase(it); break; } } } ++_commonStats.advanced; return PlanStage::ADVANCED; }
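Both near stages buffer candidate results together with their distance and then return them nearest-first. A small standalone sketch of that buffering with a std::priority_queue whose top() is the closest result; the Result struct and the distances here are illustrative only:

#include <iostream>
#include <queue>
#include <vector>

// Hypothetical buffered result: a working-set id plus its distance from the
// query point, mirroring the role of the Result struct used above.
struct Result {
    int id;
    double distance;
};

// Order the priority_queue so that top() is the *nearest* result.
struct FurtherThan {
    bool operator()(const Result& a, const Result& b) const {
        return a.distance > b.distance;
    }
};

int main() {
    std::priority_queue<Result, std::vector<Result>, FurtherThan> results;
    results.push({1, 4.2});
    results.push({2, 0.7});
    results.push({3, 2.5});

    // Drain nearest-first, the way work() pops the top of _results.
    while (!results.empty()) {
        Result r = results.top();
        results.pop();
        std::cout << "id " << r.id << " at distance " << r.distance << "\n";
    }
    // Prints ids 2, 3, 1 (distances 0.7, 2.5, 4.2).
}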
PlanStage::StageState AndSortedStage::getTargetLoc(WorkingSetID* out) { verify(numeric_limits<size_t>::max() == _targetNode); verify(WorkingSet::INVALID_ID == _targetId); verify(RecordId() == _targetLoc); // Pick one, and get a loc to work toward. WorkingSetID id = WorkingSet::INVALID_ID; StageState state = _children[0]->work(&id); if (PlanStage::ADVANCED == state) { WorkingSetMember* member = _ws->get(id); // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything // with this WSM. if (!member->hasLoc()) { _ws->flagForReview(id); return PlanStage::NEED_TIME; } verify(member->hasLoc()); // We have a value from one child to AND with. _targetNode = 0; _targetId = id; _targetLoc = member->loc; // We have to AND with all other children. for (size_t i = 1; i < _children.size(); ++i) { _workingTowardRep.push(i); } ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::IS_EOF == state) { _isEOF = true; return state; } else if (PlanStage::FAILURE == state) { *out = id; // If a stage fails, it may create a status WSM to indicate why it // failed, in which case 'id' is valid. If ID is invalid, we // create our own error message. if (WorkingSet::INVALID_ID == id) { mongoutils::str::stream ss; ss << "sorted AND stage failed to read in results from first child"; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember(_ws, status); } _isEOF = true; return state; } else { if (PlanStage::NEED_TIME == state) { ++_commonStats.needTime; } else if (PlanStage::NEED_YIELD == state) { ++_commonStats.needYield; *out = id; } // NEED_TIME, NEED_YIELD. return state; } }
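AndSortedStage intersects children that return RecordIds in sorted order: it picks a target loc from one child and advances the others until they reach or pass it. A batch version of the same merge-intersection, shown over two in-memory id lists (the ids are made up for illustration):

#include <cstdint>
#include <iostream>
#include <vector>

// Intersect two streams of record ids that each arrive in sorted order.
std::vector<int64_t> intersectSorted(const std::vector<int64_t>& a,
                                     const std::vector<int64_t>& b) {
    std::vector<int64_t> out;
    size_t i = 0, j = 0;
    while (i < a.size() && j < b.size()) {
        if (a[i] == b[j]) {            // every child produced this id: it matches
            out.push_back(a[i]);
            ++i; ++j;
        } else if (a[i] < b[j]) {      // 'a' is behind the target, advance it
            ++i;
        } else {                       // 'b' is behind the target, advance it
            ++j;
        }
    }
    return out;
}

int main() {
    std::vector<int64_t> fromIndexA = {2, 5, 9, 12, 20};
    std::vector<int64_t> fromIndexB = {1, 5, 9, 13, 20};
    for (int64_t rid : intersectSorted(fromIndexA, fromIndexB)) {
        std::cout << rid << "\n";  // prints 5, 9, 20
    }
}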
PlanStage::StageState UpdateStage::work(WorkingSetID* out) { ++_commonStats.works; // Adds the amount of time taken by work() to executionTimeMillis. ScopedTimer timer(&_commonStats.executionTimeMillis); if (isEOF()) { return PlanStage::IS_EOF; } if (doneUpdating()) { // Even if we're done updating, we may have some inserting left to do. if (needInsert()) { doInsert(); } // At this point either we're done updating and there was no insert to do, // or we're done updating and we're done inserting. Either way, we're EOF. invariant(isEOF()); return PlanStage::IS_EOF; } // If we're here, then we still have to ask for results from the child and apply // updates to them. We should only get here if the collection exists. invariant(_collection); WorkingSetID id = WorkingSet::INVALID_ID; StageState status = _child->work(&id); if (PlanStage::ADVANCED == status) { // Need to get these things from the result returned by the child. DiskLoc loc; BSONObj oldObj; WorkingSetMember* member = _ws->get(id); if (!member->hasLoc()) { _ws->free(id); const std::string errmsg = "update stage failed to read member w/ loc from child"; *out = WorkingSetCommon::allocateStatusMember(_ws, Status(ErrorCodes::InternalError, errmsg)); return PlanStage::FAILURE; } loc = member->loc; // Updates can't have projections. This means that covering analysis will always add // a fetch. We should always get fetched data, and never just key data. invariant(member->hasObj()); oldObj = member->obj; // If we're here, then we have retrieved both a DiskLoc and the corresponding // unowned object from the child stage. Since we have the object and the diskloc, // we can free the WSM. _ws->free(id); // We fill this with the new locs of moved docs so we don't double-update. if (_updatedLocs && _updatedLocs->count(loc) > 0) { // Found a loc that we already updated. ++_commonStats.needTime; return PlanStage::NEED_TIME; } ++_specificStats.nMatched; // Do the update and return. transformAndUpdate(oldObj, loc); ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::IS_EOF == status) { // The child is out of results, but we might not be done yet because we still might // have to do an insert. ++_commonStats.needTime; return PlanStage::NEED_TIME; } else if (PlanStage::FAILURE == status) { *out = id; // If a stage fails, it may create a status WSM to indicate why it failed, in which case // 'id' is valid. If ID is invalid, we create our own error message. if (WorkingSet::INVALID_ID == id) { const std::string errmsg = "update stage failed to read in results from child"; *out = WorkingSetCommon::allocateStatusMember(_ws, Status(ErrorCodes::InternalError, errmsg)); return PlanStage::FAILURE; } return status; } else { if (PlanStage::NEED_TIME == status) { ++_commonStats.needTime; } return status; } }
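The _updatedLocs check above guards against re-updating a document that the update itself moved, since the scan can encounter the document again at its new location. A compact sketch of that guard; unlike the real stage, which records only the new locations of moved documents, this version remembers every location it touches for brevity, and the record ids are made up:

#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

int main() {
    std::unordered_set<int64_t> updatedLocs;
    // Pretend record ids handed back by the child stage; 42 shows up twice
    // because the document moved after the first update.
    std::vector<int64_t> seenByScan = {7, 42, 13, 42};

    int nMatched = 0;
    for (int64_t loc : seenByScan) {
        if (updatedLocs.count(loc) > 0) {
            continue;  // already updated this document, skip it
        }
        updatedLocs.insert(loc);
        ++nMatched;    // transformAndUpdate(...) would run here
    }
    std::cout << "updated " << nMatched << " documents\n";  // prints 3
}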
Runner::RunnerState MultiPlanRunner::getNext(BSONObj* objOut, DiskLoc* dlOut) { if (_killed) { return Runner::RUNNER_DEAD; } if (_failure) { return Runner::RUNNER_ERROR; } // If we haven't picked the best plan yet... if (NULL == _bestPlan) { if (!pickBestPlan(NULL, objOut)) { verify(_failure || _killed); if (_killed) { return Runner::RUNNER_DEAD; } if (_failure) { return Runner::RUNNER_ERROR; } } } // Look for an already produced result that provides the data the caller wants. while (!_alreadyProduced.empty()) { WorkingSetID id = _alreadyProduced.front(); _alreadyProduced.pop_front(); WorkingSetMember* member = _bestPlan->getWorkingSet()->get(id); // Note that this copies code from PlanExecutor. if (NULL != objOut) { if (WorkingSetMember::LOC_AND_IDX == member->state) { if (1 != member->keyData.size()) { _bestPlan->getWorkingSet()->free(id); // If the caller needs the key data and the WSM doesn't have it, drop the // result and carry on. continue; } *objOut = member->keyData[0].keyData; } else if (member->hasObj()) { *objOut = member->obj; } else { // If the caller needs an object and the WSM doesn't have it, drop and // try the next result. _bestPlan->getWorkingSet()->free(id); continue; } } if (NULL != dlOut) { if (member->hasLoc()) { *dlOut = member->loc; } else { // If the caller needs a DiskLoc and the WSM doesn't have it, drop and carry on. _bestPlan->getWorkingSet()->free(id); continue; } } // If we're here, the caller has all the data needed and we've set the out // parameters. Remove the result from the WorkingSet. _bestPlan->getWorkingSet()->free(id); return Runner::RUNNER_ADVANCED; } RunnerState state = _bestPlan->getNext(objOut, dlOut); if (Runner::RUNNER_ERROR == state && (NULL != _backupSolution)) { QLOG() << "Best plan errored out switching to backup\n"; // Uncache the bad solution if we fall back // on the backup solution. // // XXX: Instead of uncaching we should find a way for the // cached plan runner to fall back on a different solution // if the best solution fails. Alternatively we could try to // defer cache insertion to be after the first produced result. Database* db = cc().database(); verify(NULL != db); Collection* collection = db->getCollection(_query->ns()); verify(NULL != collection); PlanCache* cache = collection->infoCache()->getPlanCache(); cache->remove(*_query); _bestPlan.reset(_backupPlan); _backupPlan = NULL; _bestSolution.reset(_backupSolution); _backupSolution = NULL; _alreadyProduced = _backupAlreadyProduced; return getNext(objOut, dlOut); } if (NULL != _backupSolution && Runner::RUNNER_ADVANCED == state) { QLOG() << "Best plan had a blocking sort, became unblocked, deleting backup plan\n"; delete _backupSolution; delete _backupPlan; _backupSolution = NULL; _backupPlan = NULL; // TODO: free from WS? _backupAlreadyProduced.clear(); } return state; }
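Once MultiPlanRunner has picked a winner, getNext() first drains the results the winning plan buffered during plan ranking and only then asks that plan for new ones. A minimal sketch of that drain-then-stream shape (C++17; the class and its behavior are hypothetical stand-ins):

#include <deque>
#include <iostream>
#include <optional>

class BufferedSource {
public:
    explicit BufferedSource(std::deque<int> buffered) : _buffered(std::move(buffered)) {}

    std::optional<int> getNext() {
        if (!_buffered.empty()) {          // results produced while ranking plans
            int out = _buffered.front();
            _buffered.pop_front();
            return out;
        }
        if (_next <= 3) return _next++;    // pretend the winning plan keeps producing
        return std::nullopt;               // EOF
    }

private:
    std::deque<int> _buffered;
    int _next = 1;
};

int main() {
    BufferedSource src({100, 101});        // two results buffered during ranking
    while (auto r = src.getNext()) {
        std::cout << *r << "\n";           // prints 100, 101, 1, 2, 3
    }
}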
PlanStage::StageState OrStage::doWork(WorkingSetID* out) { if (isEOF()) { return PlanStage::IS_EOF; } WorkingSetID id = WorkingSet::INVALID_ID; StageState childStatus = _children[_currentChild]->work(&id); if (PlanStage::ADVANCED == childStatus) { WorkingSetMember* member = _ws->get(id); // If we're deduping (and there's something to dedup by) if (_dedup && member->hasLoc()) { ++_specificStats.dupsTested; // ...and we've seen the RecordId before if (_seen.end() != _seen.find(member->loc)) { // ...drop it. ++_specificStats.dupsDropped; _ws->free(id); return PlanStage::NEED_TIME; } else { // Otherwise, note that we've seen it. _seen.insert(member->loc); } } if (Filter::passes(member, _filter)) { // Match! return it. *out = id; return PlanStage::ADVANCED; } else { // Does not match, try again. _ws->free(id); return PlanStage::NEED_TIME; } } else if (PlanStage::IS_EOF == childStatus) { // Done with _currentChild, move to the next one. ++_currentChild; // Maybe we're out of children. if (isEOF()) { return PlanStage::IS_EOF; } else { return PlanStage::NEED_TIME; } } else if (PlanStage::FAILURE == childStatus || PlanStage::DEAD == childStatus) { *out = id; // If a stage fails, it may create a status WSM to indicate why it // failed, in which case 'id' is valid. If ID is invalid, we // create our own error message. if (WorkingSet::INVALID_ID == id) { mongoutils::str::stream ss; ss << "OR stage failed to read in results from child " << _currentChild; Status status(ErrorCodes::InternalError, ss); *out = WorkingSetCommon::allocateStatusMember(_ws, status); } return childStatus; } else if (PlanStage::NEED_YIELD == childStatus) { *out = id; } // NEED_TIME, ERROR, NEED_YIELD, pass them up. return childStatus; }
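When deduping, the OR stage unions its children and drops any RecordId it has already returned. A small standalone sketch of that union-with-dedup over two pretend child outputs (the record ids are made up):

#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

int main() {
    std::vector<std::vector<int64_t>> children = {
        {3, 7, 11},       // ids produced by the first child
        {7, 2, 11, 15},   // ids produced by the second child; 7 and 11 repeat
    };

    std::unordered_set<int64_t> seen;
    int dupsDropped = 0;
    for (const auto& child : children) {
        for (int64_t loc : child) {
            if (!seen.insert(loc).second) {  // already seen: drop the duplicate
                ++dupsDropped;
                continue;
            }
            std::cout << "return " << loc << "\n";  // 3, 7, 11, 2, 15
        }
    }
    std::cout << "dropped " << dupsDropped << " duplicates\n";  // prints 2
}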
void run() { Client::WriteContext ctx(&_txn, ns()); Database* db = ctx.ctx().db(); Collection* coll = db->getCollection(&_txn, ns()); if (!coll) { coll = db->createCollection(&_txn, ns()); } fillData(); // The data we're going to later invalidate. set<DiskLoc> locs; getLocs(&locs, coll); // Build the mock scan stage which feeds the data. WorkingSet ws; auto_ptr<MockStage> ms(new MockStage(&ws)); insertVarietyOfObjects(ms.get(), coll); SortStageParams params; params.collection = coll; params.pattern = BSON("foo" << 1); params.limit = limit(); auto_ptr<SortStage> ss(new SortStage(&_txn, params, &ws, ms.get())); const int firstRead = 10; // Have sort read in data from the mock stage. for (int i = 0; i < firstRead; ++i) { WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState status = ss->work(&id); ASSERT_NOT_EQUALS(PlanStage::ADVANCED, status); } // We should have read in the first 'firstRead' locs. Invalidate the first. ss->saveState(); set<DiskLoc>::iterator it = locs.begin(); ss->invalidate(*it++, INVALIDATION_DELETION); ss->restoreState(&_txn); // Read the rest of the data from the mock stage. while (!ms->isEOF()) { WorkingSetID id = WorkingSet::INVALID_ID; ss->work(&id); } // Release to prevent double-deletion. ms.release(); // Let's just invalidate everything now. ss->saveState(); while (it != locs.end()) { ss->invalidate(*it++, INVALIDATION_DELETION); } ss->restoreState(&_txn); // Invalidation of data in the sort stage fetches it but passes it through. int count = 0; while (!ss->isEOF()) { WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState status = ss->work(&id); if (PlanStage::ADVANCED != status) { continue; } WorkingSetMember* member = ws.get(id); ASSERT(member->hasObj()); ASSERT(!member->hasLoc()); ++count; } ctx.commit(); // Returns all docs. ASSERT_EQUALS(limit() ? limit() : numObj(), count); }
bool MultiPlanRunner::workAllPlans(BSONObj* objOut) { bool doneWorking = false; for (size_t i = 0; i < _candidates.size(); ++i) { CandidatePlan& candidate = _candidates[i]; if (candidate.failed) { continue; } // Yield, if we can yield ourselves. if (NULL != _yieldPolicy.get() && _yieldPolicy->shouldYield()) { saveState(); _yieldPolicy->yield(); if (_failure || _killed) { return false; } restoreState(); } WorkingSetID id = WorkingSet::INVALID_ID; PlanStage::StageState state = candidate.root->work(&id); if (PlanStage::ADVANCED == state) { // Save result for later. candidate.results.push_back(id); // Once a plan returns enough results, stop working. if (candidate.results.size() >= size_t(internalQueryPlanEvaluationMaxResults)) { doneWorking = true; } } else if (PlanStage::NEED_TIME == state) { // Fall through to yield check at end of large conditional. } else if (PlanStage::NEED_FETCH == state) { // id has a loc and refers to an obj we need to fetch. WorkingSetMember* member = candidate.ws->get(id); // This must be true for somebody to request a fetch and can only change when an // invalidation happens, which is when we give up a lock. Don't give up the // lock between receiving the NEED_FETCH and actually fetching(?). verify(member->hasLoc()); // Actually bring record into memory. Record* record = member->loc.rec(); // If we're allowed to, go to disk outside of the lock. if (NULL != _yieldPolicy.get()) { saveState(); _yieldPolicy->yield(record); if (_failure || _killed) { return false; } restoreState(); } else { // We're set to manually yield. We go to disk in the lock. record->touch(); } // Record should be in memory now. Log if it's not. if (!Record::likelyInPhysicalMemory(record->dataNoThrowing())) { OCCASIONALLY { warning() << "Record wasn't in memory immediately after fetch: " << member->loc.toString() << endl; } } // Note that we're not freeing id. Fetch semantics say that we shouldn't. }