// Constructs the bulk-build helper for an index build. All state is set up in
// the initializer list: an external sorter that buffers index keys in memory
// (up to 100MB) and spills sorted runs to <dbpath>/_tmp when the cap is
// exceeded, plus a back-pointer to the owning access method.
IndexAccessMethod::BulkBuilder::BulkBuilder(const IndexAccessMethod* index, const IndexDescriptor* descriptor)
    : _sorter(Sorter::make(
          SortOptions()
              .TempDir(storageGlobalParams.dbpath + "/_tmp")
              .ExtSortAllowed()
              .MaxMemoryUsageBytes(100 * 1024 * 1024),
          // Keys are ordered according to the index's key pattern and version.
          BtreeExternalSortComparison(descriptor->keyPattern(), descriptor->version()))),
      _real(index) {}
// Sets up bulk-load state for a btree-backed index build: records the
// operation context and the underlying access method / btree interface,
// zeroes the insertion counters, and allocates the external sorter used to
// stage keys before the final ordered insertion pass.
BtreeBasedBulkAccessMethod::BtreeBasedBulkAccessMethod(OperationContext* txn, BtreeBasedAccessMethod* real, BtreeInterface* interface, const IndexDescriptor* descriptor) {
    _txn = txn;
    _real = real;
    _interface = interface;

    // Nothing processed yet; multikey status is discovered as keys are inserted.
    _docsInserted = 0;
    _keysInserted = 0;
    _isMultiKey = false;

    // External sorter: keep up to 100MB of keys in memory, spilling sorted
    // runs to <dbpath>/_tmp beyond that.
    SortOptions opts = SortOptions()
                           .TempDir(storageGlobalParams.dbpath + "/_tmp")
                           .ExtSortAllowed()
                           .MaxMemoryUsageBytes(100 * 1024 * 1024);
    _sorter.reset(BSONObjExternalSorter::make(
        opts,
        // Keys compare according to the index's key pattern and version.
        BtreeExternalSortComparison(descriptor->keyPattern(), descriptor->version())));
}
// Writes the current in-memory groups to a sorted on-disk run and clears the
// map, returning an iterator over the spilled (id, serialized-accumulators)
// pairs. Used when $group exceeds its memory budget with allowDiskUse:true.
shared_ptr<Sorter<Value, Value>::Iterator> DocumentSourceGroup::spill() {
    // Sort pointers into the map instead of copying the entries themselves.
    vector<const GroupsMap::value_type*> entries;
    entries.reserve(groups.size());
    for (GroupsMap::const_iterator it = groups.begin(); it != groups.end(); ++it) {
        entries.push_back(&*it);
    }
    stable_sort(entries.begin(), entries.end(), SpillSTLComparator());

    SortedFileWriter<Value, Value> writer(SortOptions().TempDir(pExpCtx->tempDir));

    // Same as entries[idx]->second.size() for every entry.
    const size_t nAccumulators = vpAccumulatorFactory.size();
    if (nAccumulators == 0) {
        // No accumulators: this is essentially a distinct on the group key.
        for (size_t idx = 0; idx < entries.size(); idx++) {
            writer.addAlreadySorted(entries[idx]->first, Value());
        }
    } else if (nAccumulators == 1) {
        // Single accumulator: use the optimized single-Value serialization.
        for (size_t idx = 0; idx < entries.size(); idx++) {
            writer.addAlreadySorted(entries[idx]->first,
                                    entries[idx]->second[0]->getValue(/*toBeMerged=*/true));
        }
    } else {
        // Multiple accumulators: serialize their states as an array-typed Value.
        for (size_t idx = 0; idx < entries.size(); idx++) {
            vector<Value> states;
            for (size_t j = 0; j < entries[idx]->second.size(); j++) {
                states.push_back(entries[idx]->second[j]->getValue(/*toBeMerged=*/true));
            }
            writer.addAlreadySorted(entries[idx]->first, Value(std::move(states)));
        }
    }

    // The spilled data now owns these groups; free the in-memory copies.
    groups.clear();
    return shared_ptr<Sorter<Value, Value>::Iterator>(writer.done());
}
// Consumes the entire upstream source, bucketing each document by its computed
// _id expression and feeding each group's accumulators. If the memory budget
// is exceeded (and external sort is allowed), the in-memory groups are spilled
// to sorted files which are later merged; otherwise results are served
// straight from the in-memory map once input is exhausted.
void DocumentSourceGroup::populate() {
    const size_t numAccumulators = vpAccumulatorFactory.size();
    dassert(numAccumulators == vpExpression.size());

    // pushed to on spill()
    vector<shared_ptr<Sorter<Value, Value>::Iterator> > sortedFiles;
    // Running estimate of the map's footprint; reset to 0 after each spill.
    // NOTE(review): this is an int compared against _maxMemoryUsageBytes —
    // presumably the limit fits in int; confirm against the member's type.
    int memoryUsageBytes = 0;

    // This loop consumes all input from pSource and buckets it based on pIdExpression.
    while (boost::optional<Document> input = pSource->getNext()) {
        if (memoryUsageBytes > _maxMemoryUsageBytes) {
            // Over budget: either spill to disk or fail the operation.
            uassert(16945,
                    "Exceeded memory limit for $group, but didn't allow external sort."
                    " Pass allowDiskUse:true to opt in.",
                    _extSortAllowed);
            sortedFiles.push_back(spill());
            memoryUsageBytes = 0;  // spill() emptied the map
        }

        _variables->setRoot(*input);

        /* get the _id value */
        Value id = computeId(_variables.get());

        /* treat missing values the same as NULL SERVER-4674 */
        if (id.missing())
            id = Value(BSONNULL);

        /* Look for the _id value in the map; if it's not there, add a new entry with a blank accumulator. */
        const size_t oldSize = groups.size();
        vector<intrusive_ptr<Accumulator> >& group = groups[id];
        // operator[] default-inserts, so a size change means this _id is new.
        const bool inserted = groups.size() != oldSize;

        if (inserted) {
            memoryUsageBytes += id.getApproximateSize();

            // Add the accumulators
            group.reserve(numAccumulators);
            for (size_t i = 0; i < numAccumulators; i++) {
                group.push_back(vpAccumulatorFactory[i]());
            }
        } else {
            for (size_t i = 0; i < numAccumulators; i++) {
                // subtract old mem usage. New usage added back after processing.
                memoryUsageBytes -= group[i]->memUsageForSorter();
            }
        }

        /* tickle all the accumulators for the group we found */
        dassert(numAccumulators == group.size());
        for (size_t i = 0; i < numAccumulators; i++) {
            group[i]->process(vpExpression[i]->evaluate(_variables.get()), _doingMerge);
            memoryUsageBytes += group[i]->memUsageForSorter();
        }

        // We are done with the ROOT document so release it.
        _variables->clearRoot();

        DEV {
            // In debug mode, spill every time we have a duplicate id to stress merge logic.
            if (!inserted  // is a dup
                && !pExpCtx->inRouter  // can't spill to disk in router
                && !_extSortAllowed  // don't change behavior when testing external sort
                && sortedFiles.size() < 20  // don't open too many FDs
                ) {
                sortedFiles.push_back(spill());
            }
        }
    }

    // These blocks do any final steps necessary to prepare to output results.
    if (!sortedFiles.empty()) {
        // At least one spill happened: results come from merging the sorted
        // files, so spill whatever remains in memory and merge everything.
        _spilled = true;
        if (!groups.empty()) {
            sortedFiles.push_back(spill());
        }

        // We won't be using groups again so free its memory.
        GroupsMap().swap(groups);

        _sorterIterator.reset(
            Sorter<Value,Value>::Iterator::merge(
                sortedFiles, SortOptions(), SorterComparator()));

        // prepare current to accumulate data
        _currentAccumulators.reserve(numAccumulators);
        for (size_t i = 0; i < numAccumulators; i++) {
            _currentAccumulators.push_back(vpAccumulatorFactory[i]());
        }

        verify(_sorterIterator->more()); // we put data in, we should get something out.
        _firstPartOfNextGroup = _sorterIterator->next();
    } else {
        // start the group iterator
        groupsIterator = groups.begin();
    }

    populated = true;
}
// Performs all up-front work for $group. If the input is already sorted in a
// way that covers the group key (findRelevantInputSort), switches to streaming
// mode and loads only the first document. Otherwise consumes the entire
// upstream source, bucketing documents by their computed _id and feeding each
// group's accumulators, spilling to sorted files when over the memory budget
// (allowDiskUse:true) and merging those files at the end.
void DocumentSourceGroup::initialize() {
    _initialized = true;
    const size_t numAccumulators = vpAccumulatorFactory.size();

    boost::optional<BSONObj> inputSort = findRelevantInputSort();
    if (inputSort) {
        // We can convert to streaming.
        _streaming = true;
        _inputSort = *inputSort;

        // Set up accumulators.
        _currentAccumulators.reserve(numAccumulators);
        for (size_t i = 0; i < numAccumulators; i++) {
            _currentAccumulators.push_back(vpAccumulatorFactory[i]());
            _currentAccumulators.back()->injectExpressionContext(pExpCtx);
        }

        // We only need to load the first document.
        _firstDocOfNextGroup = pSource->getNext();
        if (!_firstDocOfNextGroup) {
            // Empty input: nothing to stream.
            return;
        }

        _variables->setRoot(*_firstDocOfNextGroup);

        // Compute the _id value.
        _currentId = computeId(_variables.get());
        return;
    }

    dassert(numAccumulators == vpExpression.size());

    // pushed to on spill()
    vector<shared_ptr<Sorter<Value, Value>::Iterator>> sortedFiles;
    // Running estimate of the map's footprint; reset to 0 after each spill.
    // NOTE(review): int compared against _maxMemoryUsageBytes — presumably the
    // limit fits in int; confirm against the member's type.
    int memoryUsageBytes = 0;

    // This loop consumes all input from pSource and buckets it based on pIdExpression.
    while (boost::optional<Document> input = pSource->getNext()) {
        if (memoryUsageBytes > _maxMemoryUsageBytes) {
            // Over budget: either spill to disk or fail the operation.
            uassert(16945,
                    "Exceeded memory limit for $group, but didn't allow external sort."
                    " Pass allowDiskUse:true to opt in.",
                    _extSortAllowed);
            sortedFiles.push_back(spill());
            memoryUsageBytes = 0;  // spill() emptied the map
        }

        _variables->setRoot(*input);

        /* get the _id value */
        Value id = computeId(_variables.get());

        /* Look for the _id value in the map; if it's not there, add a new
         * entry with a blank accumulator. */
        const size_t oldSize = _groups->size();
        vector<intrusive_ptr<Accumulator>>& group = (*_groups)[id];
        // operator[] default-inserts, so a size change means this _id is new.
        const bool inserted = _groups->size() != oldSize;

        if (inserted) {
            memoryUsageBytes += id.getApproximateSize();

            // Add the accumulators
            group.reserve(numAccumulators);
            for (size_t i = 0; i < numAccumulators; i++) {
                group.push_back(vpAccumulatorFactory[i]());
                group.back()->injectExpressionContext(pExpCtx);
            }
        } else {
            for (size_t i = 0; i < numAccumulators; i++) {
                // subtract old mem usage. New usage added back after processing.
                memoryUsageBytes -= group[i]->memUsageForSorter();
            }
        }

        /* tickle all the accumulators for the group we found */
        dassert(numAccumulators == group.size());
        for (size_t i = 0; i < numAccumulators; i++) {
            group[i]->process(vpExpression[i]->evaluate(_variables.get()), _doingMerge);
            memoryUsageBytes += group[i]->memUsageForSorter();
        }

        // We are done with the ROOT document so release it.
        _variables->clearRoot();

        if (kDebugBuild && !storageGlobalParams.readOnly) {
            // In debug mode, spill every time we have a duplicate id to stress merge logic.
            if (!inserted  // is a dup
                && !pExpCtx->inRouter  // can't spill to disk in router
                && !_extSortAllowed  // don't change behavior when testing external sort
                && sortedFiles.size() < 20  // don't open too many FDs
                ) {
                sortedFiles.push_back(spill());
            }
        }
    }

    // These blocks do any final steps necessary to prepare to output results.
    if (!sortedFiles.empty()) {
        // At least one spill happened: results come from merging the sorted
        // files, so spill whatever remains in memory and merge everything.
        _spilled = true;
        if (!_groups->empty()) {
            sortedFiles.push_back(spill());
        }

        // We won't be using groups again so free its memory.
        _groups = pExpCtx->getValueComparator().makeUnorderedValueMap<Accumulators>();

        _sorterIterator.reset(Sorter<Value, Value>::Iterator::merge(
            sortedFiles, SortOptions(), SorterComparator(pExpCtx->getValueComparator())));

        // prepare current to accumulate data
        _currentAccumulators.reserve(numAccumulators);
        for (size_t i = 0; i < numAccumulators; i++) {
            _currentAccumulators.push_back(vpAccumulatorFactory[i]());
            _currentAccumulators.back()->injectExpressionContext(pExpCtx);
        }

        verify(_sorterIterator->more());  // we put data in, we should get something out.
        _firstPartOfNextGroup = _sorterIterator->next();
    } else {
        // start the group iterator
        groupsIterator = _groups->begin();
    }
}