/**
 * Performs one unit of work for the merge sort stage.
 *
 * While any child is queued in '_noResultToMerge', the child at the front of that queue is
 * worked so that a result gets buffered for it (or so that it is dropped from the queue once it
 * reports EOF). When the queue is empty, the top of the '_merging' priority queue is handed back
 * through 'out'.
 *
 * @param out receives the WorkingSetID of the merged result on ADVANCED, the child's ID on
 *            NEED_YIELD, or the ID of a status member on FAILURE/DEAD.
 * @return IS_EOF if isEOF() holds; ADVANCED when an unflagged result is returned; NEED_TIME
 *         when work was done but nothing is returned; otherwise the child's state.
 */
PlanStage::StageState MergeSortStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    if (!_noResultToMerge.empty()) {
        // We have some child that we don't have a result from. Each child must have a result
        // in order to pick the minimum result among all our children. Work a child.
        PlanStage* child = _noResultToMerge.front();
        WorkingSetID id = WorkingSet::INVALID_ID;
        StageState code = child->work(&id);

        if (PlanStage::ADVANCED == code) {
            WorkingSetMember* member = _ws->get(id);

            // If we're deduping...
            if (_dedup) {
                if (!member->hasLoc()) {
                    // Can't dedup data unless there's a RecordId. We go ahead and use its
                    // result.
                    _noResultToMerge.pop();
                } else {
                    ++_specificStats.dupsTested;
                    // ...and there's a RecordId and we've seen the RecordId before
                    if (_seen.end() != _seen.find(member->loc)) {
                        // ...drop it.
                        _ws->free(id);
                        ++_commonStats.needTime;
                        ++_specificStats.dupsDropped;
                        return PlanStage::NEED_TIME;
                    } else {
                        // Otherwise, note that we've seen it.
                        _seen.insert(member->loc);
                        // We're going to use the result from the child, so we remove it from
                        // the queue of children without a result.
                        _noResultToMerge.pop();
                    }
                }
            } else {
                // Not deduping. We use any result we get from the child. Remove the child
                // from the queue of things without a result.
                _noResultToMerge.pop();
            }

            // Store the result in our list.
            StageWithValue value;
            value.id = id;
            value.stage = child;
            // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield.
            member->makeObjOwned();
            _mergingData.push_front(value);

            // Insert the result (indirectly) into our priority queue.
            _merging.push(_mergingData.begin());

            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        } else if (PlanStage::IS_EOF == code) {
            // There are no more results possible from this child. Don't bother with it
            // anymore.
            _noResultToMerge.pop();
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        } else if (PlanStage::FAILURE == code || PlanStage::DEAD == code) {
            *out = id;
            // If a stage fails, it may create a status WSM to indicate why it
            // failed, in which case 'id' is valid. If ID is invalid, we
            // create our own error message.
            if (WorkingSet::INVALID_ID == id) {
                mongoutils::str::stream ss;
                ss << "merge sort stage failed to read in results from child";
                Status status(ErrorCodes::InternalError, ss);
                *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            }
            return code;
        } else {
            if (PlanStage::NEED_TIME == code) {
                ++_commonStats.needTime;
            } else if (PlanStage::NEED_YIELD == code) {
                *out = id;
                ++_commonStats.needYield;
            }
            return code;
        }
    }

    // If we're here, for each non-EOF child, we have a valid WSID.
    verify(!_merging.empty());

    // Get the 'min' WSID. _merging is a priority queue so its top is the smallest.
    MergingRef top = _merging.top();
    _merging.pop();

    // Since we're returning the WSID that came from top->stage, we need to work(...) it again
    // to get a new result.
    _noResultToMerge.push(top->stage);

    // Save the ID that we're returning and remove the returned result from our data.
    WorkingSetID idToTest = top->id;
    _mergingData.erase(top);

    // Return the min.
    *out = idToTest;

    // But don't return it if it's flagged. The stats must reflect the state we actually
    // report, so a flagged result is counted as NEED_TIME rather than as ADVANCED.
    if (_ws->isFlagged(*out)) {
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    ++_commonStats.advanced;
    return PlanStage::ADVANCED;
}
/**
 * Works the child at the front of '_workingTowardRep' once, trying to bring it to '_targetLoc'.
 *
 * Requires that a target has already been chosen, i.e. '_targetNode' and '_targetId' are valid.
 *
 * - If the child produces '_targetLoc', its data is merged into the target member. Once no
 *   children remain in '_workingTowardRep', the target is returned if it passes '_filter'.
 * - If the child produces a smaller loc, the result is discarded and the child is worked again
 *   on a later call.
 * - If the child produces a larger loc, that result becomes the new target and every other
 *   child is queued to chase it.
 *
 * @param out receives the matched result on ADVANCED, a status member on FAILURE, or the
 *            child's ID on NEED_FETCH.
 * @return ADVANCED, NEED_TIME, IS_EOF, FAILURE, or whatever other state the child reported.
 */
PlanStage::StageState AndSortedStage::moveTowardTargetLoc(WorkingSetID* out) {
    verify(numeric_limits<size_t>::max() != _targetNode);
    verify(WorkingSet::INVALID_ID != _targetId);

    // We have nodes that haven't hit _targetLoc yet.
    size_t workingChildNumber = _workingTowardRep.front();
    PlanStage* next = _children[workingChildNumber];
    WorkingSetID id = WorkingSet::INVALID_ID;
    StageState state = next->work(&id);

    if (PlanStage::ADVANCED == state) {
        WorkingSetMember* member = _ws->get(id);

        // Maybe the child had an invalidation. We intersect DiskLoc(s) so we can't do anything
        // with this WSM.
        if (!member->hasLoc()) {
            _ws->flagForReview(id);
            // Count this like every other NEED_TIME returned from this function.
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }

        if (member->loc == _targetLoc) {
            // The front element has hit _targetLoc. Don't move it forward anymore/work on
            // another element.
            _workingTowardRep.pop();
            AndCommon::mergeFrom(_ws->get(_targetId), *member);
            _ws->free(id);

            if (_workingTowardRep.empty()) {
                WorkingSetID toReturn = _targetId;
                WorkingSetMember* toMatchTest = _ws->get(toReturn);

                // Reset the target so that a fresh one is chosen next time.
                _targetNode = numeric_limits<size_t>::max();
                _targetId = WorkingSet::INVALID_ID;
                _targetLoc = DiskLoc();

                // Everyone hit it, hooray. Return it, if it matches.
                if (Filter::passes(toMatchTest, _filter)) {
                    if (NULL != _filter) {
                        ++_specificStats.matchTested;
                    }

                    *out = toReturn;
                    ++_commonStats.advanced;
                    return PlanStage::ADVANCED;
                } else {
                    _ws->free(toReturn);
                    ++_commonStats.needTime;
                    return PlanStage::NEED_TIME;
                }
            }

            // More children need to be advanced to _targetLoc.
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        } else if (member->loc < _targetLoc) {
            // The front element of _workingTowardRep hasn't hit the thing we're AND-ing with
            // yet. Try again later.
            _ws->free(id);
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        } else {
            // member->loc > _targetLoc.
            // _targetLoc wasn't successfully AND-ed with the other sub-plans. We toss it and
            // try AND-ing with the next value.
            _specificStats.failedAnd[_targetNode]++;

            _ws->free(_targetId);
            _targetNode = workingChildNumber;
            _targetLoc = member->loc;
            _targetId = id;

            // Every child other than the one that produced the new target must chase it.
            _workingTowardRep = std::queue<size_t>();
            for (size_t i = 0; i < _children.size(); ++i) {
                if (workingChildNumber != i) {
                    _workingTowardRep.push(i);
                }
            }

            // Need time to chase after the new _targetLoc.
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
    } else if (PlanStage::IS_EOF == state) {
        _isEOF = true;
        _ws->free(_targetId);
        return state;
    } else if (PlanStage::FAILURE == state) {
        *out = id;
        // If a stage fails, it may create a status WSM to indicate why it
        // failed, in which case 'id' is valid. If ID is invalid, we
        // create our own error message.
        if (WorkingSet::INVALID_ID == id) {
            mongoutils::str::stream ss;
            ss << "sorted AND stage failed to read in results from child " << workingChildNumber;
            Status status(ErrorCodes::InternalError, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
        }
        _isEOF = true;
        _ws->free(_targetId);
        return state;
    } else {
        if (PlanStage::NEED_TIME == state) {
            ++_commonStats.needTime;
        } else if (PlanStage::NEED_FETCH == state) {
            ++_commonStats.needFetch;
            *out = id;
        }

        return state;
    }
}
/**
 * Performs one unit of work for the hashed AND stage.
 *
 * The stage moves through these phases across calls:
 *  1. Look-ahead: on the first call each child is worked up to 'kLookAheadWorks' times. A child
 *     that reports EOF (or DEAD) ends the whole stage; a produced result is cached in
 *     '_lookAheadResults'.
 *  2. Hashing: the first child is read via readFirstChild() and the middle children via
 *     hashOtherChildren(), subject to the '_maxMemUsage' limit.
 *  3. Probing: results of the last child are looked up in '_dataMap'. Hits are merged into the
 *     stored member and returned if they pass '_filter'.
 *
 * @param out receives the matched result on ADVANCED or a status member on FAILURE.
 * @return IS_EOF, ADVANCED, NEED_TIME, FAILURE, or a state passed through from a helper/child.
 */
PlanStage::StageState AndHashStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    // Adds the amount of time taken by work() to executionTimeMillis.
    ScopedTimer timer(&_commonStats.executionTimeMillis);

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Fast-path for one of our children being EOF immediately. We work each child a few times.
    // If it hits EOF, the AND cannot output anything. If it produces a result, we stash that
    // result in _lookAheadResults.
    if (_lookAheadResults.empty()) {
        // INVALID_ID means that the child didn't produce a valid result.

        // We specifically are not using .resize(size, value) here because C++11 builds don't
        // seem to resolve WorkingSet::INVALID_ID during linking.
        _lookAheadResults.resize(_children.size());
        for (size_t i = 0; i < _children.size(); ++i) {
            _lookAheadResults[i] = WorkingSet::INVALID_ID;
        }

        // Work each child some number of times until it's either EOF or produces
        // a result. If it's EOF this whole stage will be EOF. If it produces a
        // result we cache it for later.
        for (size_t i = 0; i < _children.size(); ++i) {
            PlanStage* child = _children[i];
            for (size_t j = 0; j < kLookAheadWorks; ++j) {
                StageState childStatus = child->work(&_lookAheadResults[i]);

                if (PlanStage::IS_EOF == childStatus || PlanStage::DEAD == childStatus) {
                    // A child went right to EOF. Bail out.
                    _hashingChildren = false;
                    _dataMap.clear();
                    return PlanStage::IS_EOF;
                } else if (PlanStage::ADVANCED == childStatus) {
                    // We have a result cached in _lookAheadResults[i]. Stop looking at this
                    // child.
                    break;
                } else if (PlanStage::FAILURE == childStatus) {
                    // Propagate error to parent.
                    *out = _lookAheadResults[i];
                    // If a stage fails, it may create a status WSM to indicate why it
                    // failed, in which case 'id' is valid. If ID is invalid, we
                    // create our own error message.
                    if (WorkingSet::INVALID_ID == *out) {
                        mongoutils::str::stream ss;
                        ss << "hashed AND stage failed to read in look ahead results "
                           << "from child " << i;
                        Status status(ErrorCodes::InternalError, ss);
                        *out = WorkingSetCommon::allocateStatusMember(_ws, status);
                    }

                    _hashingChildren = false;
                    _dataMap.clear();
                    return PlanStage::FAILURE;
                }
                // We ignore NEED_TIME. TODO: what do we want to do if we get NEED_YIELD here?
            }
        }

        // We did a bunch of work above, return NEED_TIME to be fair.
        return PlanStage::NEED_TIME;
    }

    // An AND is either reading the first child into the hash table, probing against the hash
    // table with subsequent children, or checking the last child's results to see if they're
    // in the hash table.

    // We read the first child into our hash table.
    if (_hashingChildren) {
        // Check memory usage of previously hashed results.
        if (_memUsage > _maxMemUsage) {
            // Report the limit that was actually enforced; it is '_maxMemUsage' that the
            // check above compares against, not necessarily the default.
            mongoutils::str::stream ss;
            ss << "hashed AND stage buffered data usage of " << _memUsage
               << " bytes exceeds internal limit of " << _maxMemUsage << " bytes";
            Status status(ErrorCodes::Overflow, ss);
            *out = WorkingSetCommon::allocateStatusMember(_ws, status);
            return PlanStage::FAILURE;
        }

        if (0 == _currentChild) {
            return readFirstChild(out);
        } else if (_currentChild < _children.size() - 1) {
            return hashOtherChildren(out);
        } else {
            _hashingChildren = false;
            // We don't hash our last child. Instead, we probe the table created from the
            // previous children, returning results in the order of the last child.
            // Fall through to below.
        }
    }

    // Returning results. We read from the last child and return the results that are in our
    // hash map.

    // We should be EOF if we're not hashing results and the dataMap is empty.
    verify(!_dataMap.empty());

    // We probe _dataMap with the last child.
    verify(_currentChild == _children.size() - 1);

    // Get the next result for the (_children.size() - 1)-th child.
    StageState childStatus = workChild(_children.size() - 1, out);
    if (PlanStage::ADVANCED != childStatus) {
        return childStatus;
    }

    // We know that we've ADVANCED. See if the WSM is in our table.
    WorkingSetMember* member = _ws->get(*out);

    // Maybe the child had an invalidation. We intersect RecordId(s) so we can't do anything
    // with this WSM.
    if (!member->hasLoc()) {
        _ws->flagForReview(*out);
        return PlanStage::NEED_TIME;
    }

    DataMap::iterator it = _dataMap.find(member->loc);
    if (_dataMap.end() == it) {
        // Child's output wasn't in every previous child. Throw it out.
        _ws->free(*out);
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else {
        // Child's output was in every previous child. Merge any key data in
        // the child's output and free the child's just-outputted WSM.
        WorkingSetID hashID = it->second;
        _dataMap.erase(it);

        WorkingSetMember* olderMember = _ws->get(hashID);
        AndCommon::mergeFrom(olderMember, *member);
        _ws->free(*out);

        // We should check for matching at the end so the matcher can use information in the
        // indices of all our children.
        if (Filter::passes(olderMember, _filter)) {
            *out = hashID;
            ++_commonStats.advanced;
            return PlanStage::ADVANCED;
        } else {
            _ws->free(hashID);
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
    }
}
/**
 * Performs one unit of work for the merge sort stage.
 *
 * If every child already has a buffered result, the smallest buffered result is returned and
 * the child it came from is queued to be worked again. Otherwise the child at the front of
 * '_noResultToMerge' is worked once and its result, if any, is buffered.
 *
 * @param out receives the WorkingSetID of the merged result on ADVANCED, or the child's ID on
 *            FAILURE and NEED_YIELD.
 * @return IS_EOF, ADVANCED, NEED_TIME, or the state reported by the worked child.
 */
PlanStage::StageState MergeSortStage::doWork(WorkingSetID* out) {
    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    if (_noResultToMerge.empty()) {
        // For each non-EOF child, we have a valid WSID.
        verify(!_merging.empty());

        // _merging is a priority queue, so its top refers to the smallest buffered result.
        MergingRef smallest = _merging.top();
        _merging.pop();

        // The child that supplied this result has to be worked again to get a replacement.
        _noResultToMerge.push(smallest->stage);

        // Remember the ID before dropping the entry from our buffered data.
        WorkingSetID winner = smallest->id;
        _mergingData.erase(smallest);

        *out = winner;
        return PlanStage::ADVANCED;
    }

    // Some child has no buffered result yet. Each child must have one before the minimum can
    // be chosen, so work the child at the front of the queue.
    PlanStage* pending = _noResultToMerge.front();
    WorkingSetID childId = WorkingSet::INVALID_ID;
    StageState childState = pending->work(&childId);

    if (PlanStage::IS_EOF == childState) {
        // Nothing more can come from this child; stop waiting on it.
        _noResultToMerge.pop();
        return PlanStage::NEED_TIME;
    }

    if (PlanStage::FAILURE == childState) {
        // The stage which produces a failure is responsible for allocating a working set member
        // with error details.
        invariant(WorkingSet::INVALID_ID != childId);
        *out = childId;
        return childState;
    }

    if (PlanStage::NEED_YIELD == childState) {
        *out = childId;
        return childState;
    }

    if (PlanStage::ADVANCED != childState) {
        return childState;
    }

    WorkingSetMember* member = _ws->get(childId);

    // Dedup is only possible when the member carries a RecordId; without one the result is
    // used as-is.
    if (_dedup && member->hasRecordId()) {
        ++_specificStats.dupsTested;
        if (_seen.find(member->recordId) != _seen.end()) {
            // Already returned this RecordId: drop the duplicate and keep the child queued.
            _ws->free(childId);
            ++_specificStats.dupsDropped;
            return PlanStage::NEED_TIME;
        }
        // First sighting; remember it.
        _seen.insert(member->recordId);
    }

    // The child's result will be used, so the child no longer lacks a result.
    _noResultToMerge.pop();

    // Buffer the result.
    StageWithValue entry;
    entry.id = childId;
    entry.stage = pending;

    // Ensure that the BSONObj underlying the WorkingSetMember is owned in case we yield.
    member->makeObjOwnedIfNeeded();

    _mergingData.push_front(entry);

    // The priority queue orders references into the buffered data.
    _merging.push(_mergingData.begin());

    return PlanStage::NEED_TIME;
}
/**
 * Performs one unit of work for the merge sort stage.
 *
 * While any child is queued in '_noResultToMerge', the child at the front of that queue is
 * worked so that a result gets buffered for it (or so that it is dropped from the queue once it
 * reports EOF). When the queue is empty, the top of the '_merging' priority queue is handed back
 * through 'out'.
 *
 * @param out receives the WorkingSetID of the merged result on ADVANCED, or the child's ID on
 *            FAILURE and NEED_FETCH.
 * @return IS_EOF if isEOF() holds; ADVANCED when an unflagged result is returned; NEED_TIME
 *         when work was done but nothing is returned; otherwise the child's state.
 */
PlanStage::StageState MergeSortStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    if (!_noResultToMerge.empty()) {
        // We have some child that we don't have a result from. Each child must have a result
        // in order to pick the minimum result among all our children. Work a child.
        PlanStage* child = _noResultToMerge.front();
        WorkingSetID id = WorkingSet::INVALID_ID;
        StageState code = child->work(&id);

        if (PlanStage::ADVANCED == code) {
            // If we're deduping...
            if (_dedup) {
                WorkingSetMember* member = _ws->get(id);

                if (!member->hasLoc()) {
                    // Can't dedup data unless there's a DiskLoc. We go ahead and use its
                    // result.
                    _noResultToMerge.pop();
                } else {
                    ++_specificStats.dupsTested;
                    // ...and there's a DiskLoc and we've seen the DiskLoc before
                    if (_seen.end() != _seen.find(member->loc)) {
                        // ...drop it.
                        _ws->free(id);
                        ++_commonStats.needTime;
                        ++_specificStats.dupsDropped;
                        return PlanStage::NEED_TIME;
                    } else {
                        // Otherwise, note that we've seen it.
                        _seen.insert(member->loc);
                        // We're going to use the result from the child, so we remove it from
                        // the queue of children without a result.
                        _noResultToMerge.pop();
                    }
                }
            } else {
                // Not deduping. We use any result we get from the child. Remove the child
                // from the queue of things without a result.
                _noResultToMerge.pop();
            }

            // Store the result in our list.
            StageWithValue value;
            value.id = id;
            value.stage = child;
            _mergingData.push_front(value);

            // Insert the result (indirectly) into our priority queue.
            _merging.push(_mergingData.begin());

            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        } else if (PlanStage::IS_EOF == code) {
            // There are no more results possible from this child. Don't bother with it
            // anymore.
            _noResultToMerge.pop();
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        } else if (PlanStage::FAILURE == code) {
            *out = id;
            return code;
        } else {
            if (PlanStage::NEED_FETCH == code) {
                *out = id;
                ++_commonStats.needFetch;
            } else if (PlanStage::NEED_TIME == code) {
                ++_commonStats.needTime;
            }
            return code;
        }
    }

    // If we're here, for each non-EOF child, we have a valid WSID.
    verify(!_merging.empty());

    // Get the 'min' WSID. _merging is a priority queue so its top is the smallest.
    MergingRef top = _merging.top();
    _merging.pop();

    // Since we're returning the WSID that came from top->stage, we need to work(...) it again
    // to get a new result.
    _noResultToMerge.push(top->stage);

    // Save the ID that we're returning and remove the returned result from our data.
    WorkingSetID idToTest = top->id;
    _mergingData.erase(top);

    // Return the min.
    *out = idToTest;

    // But don't return it if it's flagged. The stats must reflect the state we actually
    // report, so a flagged result is counted as NEED_TIME rather than as ADVANCED.
    if (_ws->isFlagged(*out)) {
        // NOTE(review): freeing a member that is still flagged for review may leave a stale ID
        // in the working set's flagged list -- confirm against WorkingSet::free().
        _ws->free(*out);
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    }

    ++_commonStats.advanced;
    return PlanStage::ADVANCED;
}
/**
 * Performs one unit of work for the hashed AND stage.
 *
 * The stage moves through these phases across calls:
 *  1. Look-ahead: on the first call each child is worked up to 'kLookAheadWorks' times. A child
 *     that reports EOF (or DEAD) ends the whole stage, a child that reports FAILURE fails the
 *     stage, and a produced result is cached in '_lookAheadResults'.
 *  2. Hashing: the first child is read via readFirstChild() and the middle children via
 *     hashOtherChildren().
 *  3. Probing: results of the last child are looked up in '_dataMap'. Hits are merged into the
 *     stored member and returned if they pass '_filter'.
 *
 * @param out receives the matched result on ADVANCED, or the failing child's WorkingSetID on a
 *            look-ahead FAILURE (which may be WorkingSet::INVALID_ID if the child set none).
 * @return IS_EOF, ADVANCED, NEED_TIME, FAILURE, or a state passed through from a helper/child.
 */
PlanStage::StageState AndHashStage::work(WorkingSetID* out) {
    ++_commonStats.works;

    if (isEOF()) {
        return PlanStage::IS_EOF;
    }

    // Fast-path for one of our children being EOF immediately. We work each child a few times.
    // If it hits EOF, the AND cannot output anything. If it produces a result, we stash that
    // result in _lookAheadResults.
    if (_lookAheadResults.empty()) {
        // INVALID_ID means that the child didn't produce a valid result.
        _lookAheadResults.resize(_children.size(), WorkingSet::INVALID_ID);

        // Work each child some number of times until it's either EOF or produces
        // a result. If it's EOF this whole stage will be EOF. If it produces a
        // result we cache it for later.
        for (size_t i = 0; i < _children.size(); ++i) {
            PlanStage* child = _children[i];
            for (size_t j = 0; j < kLookAheadWorks; ++j) {
                StageState childStatus = child->work(&_lookAheadResults[i]);

                if (PlanStage::IS_EOF == childStatus || PlanStage::DEAD == childStatus) {
                    // A child went right to EOF. Bail out.
                    _hashingChildren = false;
                    _dataMap.clear();
                    return PlanStage::IS_EOF;
                } else if (PlanStage::FAILURE == childStatus) {
                    // A failing child must not be reported as a clean EOF, otherwise the
                    // error is silently lost. Propagate the failure to the parent along
                    // with whatever ID the child handed back.
                    *out = _lookAheadResults[i];
                    _hashingChildren = false;
                    _dataMap.clear();
                    return PlanStage::FAILURE;
                } else if (PlanStage::ADVANCED == childStatus) {
                    // We have a result cached in _lookAheadResults[i]. Stop looking at this
                    // child.
                    break;
                }
                // We ignore NEED_TIME. TODO: What do we want to do if the child provides
                // NEED_FETCH?
            }
        }

        // We did a bunch of work above, return NEED_TIME to be fair.
        return PlanStage::NEED_TIME;
    }

    // An AND is either reading the first child into the hash table, probing against the hash
    // table with subsequent children, or checking the last child's results to see if they're
    // in the hash table.

    // We read the first child into our hash table.
    if (_hashingChildren) {
        if (0 == _currentChild) {
            return readFirstChild(out);
        } else if (_currentChild < _children.size() - 1) {
            return hashOtherChildren(out);
        } else {
            _hashingChildren = false;
            // We don't hash our last child. Instead, we probe the table created from the
            // previous children, returning results in the order of the last child.
            // Fall through to below.
        }
    }

    // Returning results. We read from the last child and return the results that are in our
    // hash map.

    // We should be EOF if we're not hashing results and the dataMap is empty.
    verify(!_dataMap.empty());

    // We probe _dataMap with the last child.
    verify(_currentChild == _children.size() - 1);

    // Get the next result for the (_children.size() - 1)-th child.
    StageState childStatus = workChild(_children.size() - 1, out);
    if (PlanStage::ADVANCED != childStatus) {
        return childStatus;
    }

    // We know that we've ADVANCED. See if the WSM is in our table.
    WorkingSetMember* member = _ws->get(*out);

    // Maybe the child had an invalidation. We intersect DiskLoc(s) so we can't do anything
    // with this WSM.
    if (!member->hasLoc()) {
        _ws->flagForReview(*out);
        return PlanStage::NEED_TIME;
    }

    DataMap::iterator it = _dataMap.find(member->loc);
    if (_dataMap.end() == it) {
        // Child's output wasn't in every previous child. Throw it out.
        _ws->free(*out);
        ++_commonStats.needTime;
        return PlanStage::NEED_TIME;
    } else {
        // Child's output was in every previous child. Merge any key data in
        // the child's output and free the child's just-outputted WSM.
        WorkingSetID hashID = it->second;
        _dataMap.erase(it);

        WorkingSetMember* olderMember = _ws->get(hashID);
        AndCommon::mergeFrom(olderMember, *member);
        _ws->free(*out);

        // We should check for matching at the end so the matcher can use information in the
        // indices of all our children.
        if (Filter::passes(olderMember, _filter)) {
            *out = hashID;
            ++_commonStats.advanced;
            return PlanStage::ADVANCED;
        } else {
            _ws->free(hashID);
            ++_commonStats.needTime;
            return PlanStage::NEED_TIME;
        }
    }
}