// Example no. 1
// 0
/**
 * Constructs a bulk builder bound to the given index access method.
 *
 * The sorter is created through Sorter::make() from two pieces:
 *   - a SortOptions whose temp directory is storageGlobalParams.dbpath + "/_tmp", with the
 *     ExtSortAllowed option applied and MaxMemoryUsageBytes set to 100 * 1024 * 1024;
 *   - a BtreeExternalSortComparison built from the descriptor's keyPattern() and version().
 *
 * @param index       stored in _real.
 * @param descriptor  dereferenced unconditionally for keyPattern() and version(), so it must
 *                    not be null.
 */
IndexAccessMethod::BulkBuilder::BulkBuilder(const IndexAccessMethod* index,
                                            const IndexDescriptor* descriptor)
    : _sorter(Sorter::make(
          SortOptions()
              .TempDir(storageGlobalParams.dbpath + "/_tmp")
              .ExtSortAllowed()
              // 100 * 1024 * 1024 bytes == 100 MiB.
              .MaxMemoryUsageBytes(100 * 1024 * 1024),
          BtreeExternalSortComparison(descriptor->keyPattern(), descriptor->version()))),
      _real(index) {}
    /**
     * Builds a bulk-loading wrapper around an existing btree access method.
     *
     * Stores the operation context, the wrapped access method and the btree interface,
     * zeroes the document/key counters, clears the multikey flag, and creates the external
     * sorter. The sorter keeps its temporary files under "<dbpath>/_tmp", has the
     * ExtSortAllowed option applied, is limited to 100 * 1024 * 1024 bytes of memory, and
     * compares with a BtreeExternalSortComparison built from the descriptor's key pattern
     * and version.
     *
     * 'descriptor' is dereferenced unconditionally and therefore must not be null.
     */
    BtreeBasedBulkAccessMethod::BtreeBasedBulkAccessMethod(OperationContext* txn,
                                                           BtreeBasedAccessMethod* real,
                                                           BtreeInterface* interface,
                                                           const IndexDescriptor* descriptor) {
        // Bookkeeping starts from a clean slate.
        _isMultiKey = false;
        _keysInserted = 0;
        _docsInserted = 0;

        // Collaborators supplied by the caller.
        _txn = txn;
        _interface = interface;
        _real = real;

        // Inputs for the external sorter.
        const int sorterMemoryLimitBytes = 100 * 1024 * 1024;
        const BtreeExternalSortComparison keyComparison(descriptor->keyPattern(),
                                                        descriptor->version());

        _sorter.reset(BSONObjExternalSorter::make(
            SortOptions()
                .TempDir(storageGlobalParams.dbpath + "/_tmp")
                .ExtSortAllowed()
                .MaxMemoryUsageBytes(sorterMemoryLimitBytes),
            keyComparison));
    }
    /**
     * Moves the current contents of 'groups' out of the map and into a SortedFileWriter.
     *
     * The map entries are ordered with SpillSTLComparator and then fed to the writer, which
     * is configured with the expression context's temp directory. The value written for each
     * group depends on how many accumulators the $group has:
     *   - none:     an empty Value (the group is effectively a "distinct");
     *   - one:      that accumulator's to-be-merged value, written directly;
     *   - several:  an array-typed Value holding every accumulator's to-be-merged value.
     *
     * 'groups' is cleared before returning.
     *
     * @return a shared_ptr owning the iterator produced by the writer's done().
     */
    shared_ptr<Sorter<Value, Value>::Iterator> DocumentSourceGroup::spill() {
        // Sort pointers to the entries instead of the entries themselves; it is cheaper.
        vector<const GroupsMap::value_type*> entries;
        entries.reserve(groups.size());
        for (const GroupsMap::value_type& entry : groups) {
            entries.push_back(&entry);
        }
        stable_sort(entries.begin(), entries.end(), SpillSTLComparator());

        SortedFileWriter<Value, Value> writer(SortOptions().TempDir(pExpCtx->tempDir));

        // Every group carries exactly this many accumulators.
        const size_t accumulatorCount = vpAccumulatorFactory.size();
        if (accumulatorCount == 0) {
            // No accumulated values at all: only the keys matter.
            for (const GroupsMap::value_type* entry : entries) {
                writer.addAlreadySorted(entry->first, Value());
            }
        } else if (accumulatorCount == 1) {
            // A single accumulator: serialize its value directly, skipping the array wrapper.
            for (const GroupsMap::value_type* entry : entries) {
                writer.addAlreadySorted(entry->first,
                                        entry->second[0]->getValue(/*toBeMerged=*/true));
            }
        } else {
            // Several accumulators: bundle their values into one array-typed Value.
            for (const GroupsMap::value_type* entry : entries) {
                vector<Value> partials;
                for (const intrusive_ptr<Accumulator>& accumulator : entry->second) {
                    partials.push_back(accumulator->getValue(/*toBeMerged=*/true));
                }
                writer.addAlreadySorted(entry->first, Value(std::move(partials)));
            }
        }

        // Everything has been handed to the writer; drop the in-memory copies.
        groups.clear();

        return shared_ptr<Sorter<Value, Value>::Iterator>(writer.done());
    }
    void DocumentSourceGroup::populate() {
        const size_t numAccumulators = vpAccumulatorFactory.size();
        dassert(numAccumulators == vpExpression.size());

        // pushed to on spill()
        vector<shared_ptr<Sorter<Value, Value>::Iterator> > sortedFiles;
        int memoryUsageBytes = 0;

        // This loop consumes all input from pSource and buckets it based on pIdExpression.
        while (boost::optional<Document> input = pSource->getNext()) {
            if (memoryUsageBytes > _maxMemoryUsageBytes) {
                uassert(16945, "Exceeded memory limit for $group, but didn't allow external sort."
                               " Pass allowDiskUse:true to opt in.",
                        _extSortAllowed);
                sortedFiles.push_back(spill());
                memoryUsageBytes = 0;
            }

            _variables->setRoot(*input);

            /* get the _id value */
            Value id = computeId(_variables.get());

            /* treat missing values the same as NULL SERVER-4674 */
            if (id.missing())
                id = Value(BSONNULL);

            /*
              Look for the _id value in the map; if it's not there, add a
              new entry with a blank accumulator.
            */
            const size_t oldSize = groups.size();
            vector<intrusive_ptr<Accumulator> >& group = groups[id];
            const bool inserted = groups.size() != oldSize;

            if (inserted) {
                memoryUsageBytes += id.getApproximateSize();

                // Add the accumulators
                group.reserve(numAccumulators);
                for (size_t i = 0; i < numAccumulators; i++) {
                    group.push_back(vpAccumulatorFactory[i]());
                }
            } else {
                for (size_t i = 0; i < numAccumulators; i++) {
                    // subtract old mem usage. New usage added back after processing.
                    memoryUsageBytes -= group[i]->memUsageForSorter();
                }
            }

            /* tickle all the accumulators for the group we found */
            dassert(numAccumulators == group.size());
            for (size_t i = 0; i < numAccumulators; i++) {
                group[i]->process(vpExpression[i]->evaluate(_variables.get()), _doingMerge);
                memoryUsageBytes += group[i]->memUsageForSorter();
            }

            // We are done with the ROOT document so release it.
            _variables->clearRoot();

            DEV {
                // In debug mode, spill every time we have a duplicate id to stress merge logic.
                if (!inserted // is a dup
                        && !pExpCtx->inRouter // can't spill to disk in router
                        && !_extSortAllowed // don't change behavior when testing external sort
                        && sortedFiles.size() < 20 // don't open too many FDs
                        ) {
                    sortedFiles.push_back(spill());
                }
            }
        }

        // These blocks do any final steps necessary to prepare to output results.
        if (!sortedFiles.empty()) {
            _spilled = true;
            if (!groups.empty()) {
                sortedFiles.push_back(spill());
            }

            // We won't be using groups again so free its memory.
            GroupsMap().swap(groups);

            _sorterIterator.reset(
                    Sorter<Value,Value>::Iterator::merge(
                        sortedFiles, SortOptions(), SorterComparator()));

            // prepare current to accumulate data
            _currentAccumulators.reserve(numAccumulators);
            for (size_t i = 0; i < numAccumulators; i++) {
                _currentAccumulators.push_back(vpAccumulatorFactory[i]());
            }

            verify(_sorterIterator->more()); // we put data in, we should get something out.
            _firstPartOfNextGroup = _sorterIterator->next();
        } else {
            // start the group iterator
            groupsIterator = groups.begin();
        }

        populated = true;
    }
// Example no. 5
// 0
void DocumentSourceGroup::initialize() {
    _initialized = true;
    const size_t numAccumulators = vpAccumulatorFactory.size();

    boost::optional<BSONObj> inputSort = findRelevantInputSort();
    if (inputSort) {
        // We can convert to streaming.
        _streaming = true;
        _inputSort = *inputSort;

        // Set up accumulators.
        _currentAccumulators.reserve(numAccumulators);
        for (size_t i = 0; i < numAccumulators; i++) {
            _currentAccumulators.push_back(vpAccumulatorFactory[i]());
            _currentAccumulators.back()->injectExpressionContext(pExpCtx);
        }

        // We only need to load the first document.
        _firstDocOfNextGroup = pSource->getNext();

        if (!_firstDocOfNextGroup) {
            return;
        }

        _variables->setRoot(*_firstDocOfNextGroup);

        // Compute the _id value.
        _currentId = computeId(_variables.get());
        return;
    }

    dassert(numAccumulators == vpExpression.size());

    // pushed to on spill()
    vector<shared_ptr<Sorter<Value, Value>::Iterator>> sortedFiles;
    int memoryUsageBytes = 0;

    // This loop consumes all input from pSource and buckets it based on pIdExpression.
    while (boost::optional<Document> input = pSource->getNext()) {
        if (memoryUsageBytes > _maxMemoryUsageBytes) {
            uassert(16945,
                    "Exceeded memory limit for $group, but didn't allow external sort."
                    " Pass allowDiskUse:true to opt in.",
                    _extSortAllowed);
            sortedFiles.push_back(spill());
            memoryUsageBytes = 0;
        }

        _variables->setRoot(*input);

        /* get the _id value */
        Value id = computeId(_variables.get());

        /*
          Look for the _id value in the map; if it's not there, add a
          new entry with a blank accumulator.
        */
        const size_t oldSize = _groups->size();
        vector<intrusive_ptr<Accumulator>>& group = (*_groups)[id];
        const bool inserted = _groups->size() != oldSize;

        if (inserted) {
            memoryUsageBytes += id.getApproximateSize();

            // Add the accumulators
            group.reserve(numAccumulators);
            for (size_t i = 0; i < numAccumulators; i++) {
                group.push_back(vpAccumulatorFactory[i]());
                group.back()->injectExpressionContext(pExpCtx);
            }
        } else {
            for (size_t i = 0; i < numAccumulators; i++) {
                // subtract old mem usage. New usage added back after processing.
                memoryUsageBytes -= group[i]->memUsageForSorter();
            }
        }

        /* tickle all the accumulators for the group we found */
        dassert(numAccumulators == group.size());
        for (size_t i = 0; i < numAccumulators; i++) {
            group[i]->process(vpExpression[i]->evaluate(_variables.get()), _doingMerge);
            memoryUsageBytes += group[i]->memUsageForSorter();
        }

        // We are done with the ROOT document so release it.
        _variables->clearRoot();

        if (kDebugBuild && !storageGlobalParams.readOnly) {
            // In debug mode, spill every time we have a duplicate id to stress merge logic.
            if (!inserted  // is a dup
                &&
                !pExpCtx->inRouter  // can't spill to disk in router
                &&
                !_extSortAllowed  // don't change behavior when testing external sort
                &&
                sortedFiles.size() < 20  // don't open too many FDs
                ) {
                sortedFiles.push_back(spill());
            }
        }
    }

    // These blocks do any final steps necessary to prepare to output results.
    if (!sortedFiles.empty()) {
        _spilled = true;
        if (!_groups->empty()) {
            sortedFiles.push_back(spill());
        }

        // We won't be using groups again so free its memory.
        _groups = pExpCtx->getValueComparator().makeUnorderedValueMap<Accumulators>();

        _sorterIterator.reset(Sorter<Value, Value>::Iterator::merge(
            sortedFiles, SortOptions(), SorterComparator(pExpCtx->getValueComparator())));

        // prepare current to accumulate data
        _currentAccumulators.reserve(numAccumulators);
        for (size_t i = 0; i < numAccumulators; i++) {
            _currentAccumulators.push_back(vpAccumulatorFactory[i]());
            _currentAccumulators.back()->injectExpressionContext(pExpCtx);
        }

        verify(_sorterIterator->more());  // we put data in, we should get something out.
        _firstPartOfNextGroup = _sorterIterator->next();
    } else {
        // start the group iterator
        groupsIterator = _groups->begin();
    }
}