Example #1
void RecordStoreV1Base::deleteRecord(OperationContext* txn, const RecordId& rid) {
    const DiskLoc dl = DiskLoc::fromRecordId(rid);

    MmapV1RecordHeader* todelete = recordFor(dl);
    invariant(todelete->netLength() >= 4);  // required so the defensive 4-byte overwrite below is safe

    /* remove ourself from the record next/prev chain */
    {
        if (todelete->prevOfs() != DiskLoc::NullOfs) {
            DiskLoc prev = getPrevRecordInExtent(txn, dl);
            MmapV1RecordHeader* prevRecord = recordFor(prev);
            txn->recoveryUnit()->writingInt(prevRecord->nextOfs()) = todelete->nextOfs();
        }

        if (todelete->nextOfs() != DiskLoc::NullOfs) {
            DiskLoc next = getNextRecord(txn, dl);
            MmapV1RecordHeader* nextRecord = recordFor(next);
            txn->recoveryUnit()->writingInt(nextRecord->prevOfs()) = todelete->prevOfs();
        }
    }

    /* remove ourself from extent pointers */
    {
        DiskLoc extentLoc = todelete->myExtentLoc(dl);
        Extent* e = _getExtent(txn, extentLoc);
        if (e->firstRecord == dl) {
            txn->recoveryUnit()->writing(&e->firstRecord);
            if (todelete->nextOfs() == DiskLoc::NullOfs)
                e->firstRecord.Null();
            else
                e->firstRecord.set(dl.a(), todelete->nextOfs());
        }
        if (e->lastRecord == dl) {
            txn->recoveryUnit()->writing(&e->lastRecord);
            if (todelete->prevOfs() == DiskLoc::NullOfs)
                e->lastRecord.Null();
            else
                e->lastRecord.set(dl.a(), todelete->prevOfs());
        }
    }

    /* add to the free list */
    {
        _details->incrementStats(txn, -1 * todelete->netLength(), -1);

        if (_isSystemIndexes) {
            /* Temp: if this is system.indexes, don't reuse the space; zero it out
               instead. We want to be careful until this is validated further, since
               IndexDetails holds pointers to this disk location, so an incorrectly
               done remove would cause a lot of problems.
            */
            memset(txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders()),
                   0,
                   todelete->lengthWithHeaders());
        } else {
            // this is defensive so we can detect if we are still using a location
            // that was deleted
            memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4);
            addDeletedRec(txn, dl);
        }
    }
}
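
The chain surgery in deleteRecord() is easier to see in isolation: each record stores extent-relative prev/next offsets, and deletion patches the neighbors and the extent's first/last pointers before poisoning the freed bytes. Below is a minimal standalone sketch of that idea; RecordHeader, Extent, and unlinkRecord here are hypothetical simplifications (plain int offsets, no recovery unit or journaling), not MongoDB types.

// Minimal sketch of unlinking a record from an extent-relative prev/next chain.
#include <cassert>
#include <cstring>
#include <vector>

static const int NullOfs = -1;

struct RecordHeader {
    int prevOfs;  // offset of the previous record in the extent, or NullOfs
    int nextOfs;  // offset of the next record in the extent, or NullOfs
};

struct Extent {
    std::vector<char> data;  // backing storage for record headers
    int firstRecord = NullOfs;
    int lastRecord = NullOfs;

    RecordHeader* at(int ofs) {
        return reinterpret_cast<RecordHeader*>(&data[ofs]);
    }
};

// Mirrors the "remove ourself from the record next/prev chain" and
// "remove ourself from extent pointers" blocks of deleteRecord().
void unlinkRecord(Extent& e, int ofs) {
    RecordHeader* rec = e.at(ofs);

    if (rec->prevOfs != NullOfs)
        e.at(rec->prevOfs)->nextOfs = rec->nextOfs;
    if (rec->nextOfs != NullOfs)
        e.at(rec->nextOfs)->prevOfs = rec->prevOfs;

    if (e.firstRecord == ofs)
        e.firstRecord = rec->nextOfs;
    if (e.lastRecord == ofs)
        e.lastRecord = rec->prevOfs;

    // Defensive poisoning, in the spirit of the 0xee memset above.
    std::memset(rec, 0xee, sizeof(RecordHeader));
}

int main() {
    Extent e;
    e.data.resize(3 * sizeof(RecordHeader));
    const int ofs[3] = {0, (int)sizeof(RecordHeader), (int)(2 * sizeof(RecordHeader))};

    // Build the chain r0 <-> r1 <-> r2.
    for (int i = 0; i < 3; i++) {
        e.at(ofs[i])->prevOfs = (i == 0) ? NullOfs : ofs[i - 1];
        e.at(ofs[i])->nextOfs = (i == 2) ? NullOfs : ofs[i + 1];
    }
    e.firstRecord = ofs[0];
    e.lastRecord = ofs[2];

    unlinkRecord(e, ofs[1]);  // delete the middle record
    assert(e.at(ofs[0])->nextOfs == ofs[2]);
    assert(e.at(ofs[2])->prevOfs == ofs[0]);
    return 0;
}

The real code routes every mutation through txn->recoveryUnit()->writing()/writingInt()/writingPtr() so the change is journaled; the sketch skips durability entirely.
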
Example #2
void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
                                         const DiskLoc extentLoc,
                                         int extentNumber,
                                         RecordStoreCompactAdaptor* adaptor,
                                         const CompactOptions* compactOptions,
                                         CompactStats* stats) {
    log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " "
          << extentLoc;

    unsigned oldObjSize = 0;  // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent* const sourceExtent = _extentManager->getExtent(extentLoc);
    sourceExtent->assertOk();
    fassert(17437, sourceExtent->validates(extentLoc));

    {
        // The next/prev MmapV1RecordHeader pointers within the Extent might not be in order so we
        // first page in the whole Extent sequentially.
        // TODO benchmark on slow storage to verify this is measurably faster.
        log() << "compact paging in len=" << sourceExtent->length / 1000000.0 << "MB" << endl;
        Timer t;
        size_t length = sourceExtent->length;

        touch_pages(reinterpret_cast<const char*>(sourceExtent), length);
        int ms = t.millis();
        if (ms > 1000)
            log() << "compact end paging in " << ms << "ms "
                  << sourceExtent->length / 1000000.0 / t.seconds() << "MB/sec" << endl;
    }

    {
        // Move each MmapV1RecordHeader out of this extent and insert it in to the "new" extents.
        log() << "compact copying records" << endl;
        long long totalNetSize = 0;
        long long nrecords = 0;
        DiskLoc nextSourceLoc = sourceExtent->firstRecord;
        while (!nextSourceLoc.isNull()) {
            txn->checkForInterrupt();

            WriteUnitOfWork wunit(txn);
            MmapV1RecordHeader* recOld = recordFor(nextSourceLoc);
            RecordData oldData = recOld->toRecordData();
            nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc);

            if (compactOptions->validateDocuments && !adaptor->isDataValid(oldData)) {
                // object is corrupt!
                log() << "compact removing corrupt document!";
                stats->corruptDocuments++;
            } else {
                // How much data is in the record. Excludes padding and MmapV1RecordHeader headers.
                const unsigned rawDataSize = adaptor->dataSize(oldData);

                nrecords++;
                oldObjSize += rawDataSize;
                oldObjSizeWithPadding += recOld->netLength();

                // Allocation sizes include the headers and possibly some padding.
                const unsigned minAllocationSize = rawDataSize + MmapV1RecordHeader::HeaderSize;
                unsigned allocationSize = minAllocationSize;
                switch (compactOptions->paddingMode) {
                    case CompactOptions::NONE:  // default padding
                        if (shouldPadInserts()) {
                            allocationSize = quantizeAllocationSpace(minAllocationSize);
                        }
                        break;

                    case CompactOptions::PRESERVE:  // keep original padding
                        allocationSize = recOld->lengthWithHeaders();
                        break;

                    case CompactOptions::MANUAL:  // user specified how much padding to use
                        allocationSize = compactOptions->computeRecordSize(minAllocationSize);
                        if (allocationSize < minAllocationSize ||
                            allocationSize > BSONObjMaxUserSize / 2) {
                            allocationSize = minAllocationSize;
                        }
                        break;
                }
                invariant(allocationSize >= minAllocationSize);

                // Copy the data to a new record. Because we orphaned the record freelist at the
                // start of the compact, this insert will allocate a record in a new extent.
                // See the comment in compact() for more details.
                CompactDocWriter writer(recOld, rawDataSize, allocationSize);
                StatusWith<RecordId> status = insertRecordWithDocWriter(txn, &writer);
                uassertStatusOK(status.getStatus());
                const MmapV1RecordHeader* newRec =
                    recordFor(DiskLoc::fromRecordId(status.getValue()));
                invariant(unsigned(newRec->netLength()) >= rawDataSize);
                totalNetSize += newRec->netLength();

                // Tells the caller that the record has been moved, so it can do things such as
                // add it to indexes.
                adaptor->inserted(newRec->toRecordData(), status.getValue());
            }

            // Remove the old record from the linked list of records within the sourceExtent.
            // The old record is not added to the freelist as we will be freeing the whole
            // extent at the end.
            *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc;
            if (nextSourceLoc.isNull()) {
                // Just moved the last record out of the extent. Mark extent as empty.
                *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc();
            } else {
                MmapV1RecordHeader* newFirstRecord = recordFor(nextSourceLoc);
                txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs;
            }

            // Adjust the stats to reflect the removal of the old record. The insert above
            // handled adjusting the stats for the new record.
            _details->incrementStats(txn, -(recOld->netLength()), -1);

            wunit.commit();
        }

        // The extent must now be empty.
        invariant(sourceExtent->firstRecord.isNull());
        invariant(sourceExtent->lastRecord.isNull());

        // The source extent is still the first extent, but it must not be the only extent.
        invariant(_details->firstExtent(txn) == extentLoc);
        invariant(_details->lastExtent(txn) != extentLoc);

        // Remove the newly emptied sourceExtent from the extent linked list and return it to
        // the extent manager.
        WriteUnitOfWork wunit(txn);
        const DiskLoc newFirst = sourceExtent->xnext;
        _details->setFirstExtent(txn, newFirst);
        *txn->recoveryUnit()->writing(&_extentManager->getExtent(newFirst)->xprev) = DiskLoc();
        _extentManager->freeExtent(txn, extentLoc);
        wunit.commit();

        {
            const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize
                                                 : 1.0;  // defining 0/0 as 1 for this.

            log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                  << " documents (" << totalNetSize / (1024 * 1024.0) << "MB)"
                  << " oldPadding: " << oldPadding;
        }
    }
}
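
The most decision-heavy step in _compactExtent() is choosing the new record's allocation size from compactOptions->paddingMode. The sketch below isolates that switch under stated assumptions: quantize() is a made-up 64-byte round-up standing in for quantizeAllocationSpace(), the NONE case always quantizes (the shouldPadInserts() check is omitted), and the header-size and max-object constants are placeholders rather than MongoDB's values.

// Standalone sketch of the allocation-size decision in the paddingMode switch.
#include <cassert>

enum class PaddingMode { NONE, PRESERVE, MANUAL };

const unsigned kHeaderSize = 16;                 // assumed record header size
const unsigned kMaxUserSize = 16 * 1024 * 1024;  // stand-in for BSONObjMaxUserSize

// Round up to the next multiple of 64 bytes (illustrative bucket size only).
unsigned quantize(unsigned size) {
    return (size + 63) & ~63u;
}

unsigned chooseAllocationSize(PaddingMode mode,
                              unsigned rawDataSize,
                              unsigned oldLengthWithHeaders,
                              double manualPaddingFactor) {
    const unsigned minAllocationSize = rawDataSize + kHeaderSize;
    unsigned allocationSize = minAllocationSize;

    switch (mode) {
        case PaddingMode::NONE:      // default: round the allocation up to a bucket boundary
            allocationSize = quantize(minAllocationSize);
            break;
        case PaddingMode::PRESERVE:  // keep the record's original on-disk footprint
            allocationSize = oldLengthWithHeaders;
            break;
        case PaddingMode::MANUAL:    // user-chosen padding, rejected if out of bounds
            allocationSize = static_cast<unsigned>(minAllocationSize * manualPaddingFactor);
            if (allocationSize < minAllocationSize || allocationSize > kMaxUserSize / 2)
                allocationSize = minAllocationSize;
            break;
    }
    assert(allocationSize >= minAllocationSize);  // mirrors the invariant in the copy loop
    return allocationSize;
}

int main() {
    assert(chooseAllocationSize(PaddingMode::NONE, 100, 0, 0.0) == 128);      // 116 rounds to 128
    assert(chooseAllocationSize(PaddingMode::PRESERVE, 100, 256, 0.0) == 256);
    assert(chooseAllocationSize(PaddingMode::MANUAL, 100, 0, 1.5) == 174);    // 116 * 1.5
    return 0;
}

In every mode the result must be at least rawDataSize plus the header, which is exactly what the invariant after the switch in the original loop enforces.
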
Example #3
void initializeV1RS(OperationContext* opCtx,
                    const LocAndSize* records,
                    const LocAndSize* drecs,
                    const LocAndSize* legacyGrabBag,
                    DummyExtentManager* em,
                    DummyRecordStoreV1MetaData* md) {
    invariant(records || drecs);  // if both are NULL nothing is being created...

    // Need to start with a blank slate
    invariant(em->numFiles() == 0);
    invariant(md->firstExtent(opCtx).isNull());

    // pre-allocate extents (even extents that aren't part of this RS)
    {
        typedef std::map<int, size_t> ExtentSizes;
        ExtentSizes extentSizes;
        accumulateExtentSizeRequirements(records, &extentSizes);
        accumulateExtentSizeRequirements(drecs, &extentSizes);
        accumulateExtentSizeRequirements(legacyGrabBag, &extentSizes);
        invariant(!extentSizes.empty());

        const int maxExtent = extentSizes.rbegin()->first;
        for (int i = 0; i <= maxExtent; i++) {
            const size_t size = extentSizes.count(i) ? extentSizes[i] : 0;
            const DiskLoc loc = em->allocateExtent(opCtx, md->isCapped(), size, 0);

            // This function and assertState depend on these details of DummyExtentManager
            invariant(loc.a() == i);
            invariant(loc.getOfs() == 0);
        }

        // link together extents that should be part of this RS
        md->setFirstExtent(opCtx, DiskLoc(extentSizes.begin()->first, 0));
        md->setLastExtent(opCtx, DiskLoc(extentSizes.rbegin()->first, 0));
        for (ExtentSizes::iterator it = extentSizes.begin(); boost::next(it) != extentSizes.end();
             /* ++it */) {
            const int a = it->first;
            ++it;
            const int b = it->first;
            em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0);
            em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0);
        }

        // This signals "done allocating new extents".
        if (md->isCapped())
            md->setDeletedListEntry(opCtx, 1, DiskLoc());
    }

    if (records && !records[0].loc.isNull()) {
        int recIdx = 0;
        DiskLoc extLoc = md->firstExtent(opCtx);
        while (!extLoc.isNull()) {
            Extent* ext = em->getExtent(extLoc);
            int prevOfs = DiskLoc::NullOfs;
            while (extLoc.a() == records[recIdx].loc.a()) {  // for all records in this extent
                const DiskLoc loc = records[recIdx].loc;
                const int size = records[recIdx].size;
                invariant(size >= MmapV1RecordHeader::HeaderSize);

                md->incrementStats(opCtx, size - MmapV1RecordHeader::HeaderSize, 1);

                if (ext->firstRecord.isNull())
                    ext->firstRecord = loc;

                MmapV1RecordHeader* rec = em->recordForV1(loc);
                rec->lengthWithHeaders() = size;
                rec->extentOfs() = 0;

                rec->prevOfs() = prevOfs;
                prevOfs = loc.getOfs();

                const DiskLoc nextLoc = records[recIdx + 1].loc;
                if (nextLoc.a() == loc.a()) {  // if next is in same extent
                    rec->nextOfs() = nextLoc.getOfs();
                } else {
                    rec->nextOfs() = DiskLoc::NullOfs;
                    ext->lastRecord = loc;
                }

                recIdx++;
            }
            extLoc = ext->xnext;
        }
        invariant(records[recIdx].loc.isNull());
    }

    if (drecs && !drecs[0].loc.isNull()) {
        int drecIdx = 0;
        DiskLoc* prevNextPtr = NULL;
        int lastBucket = -1;
        while (!drecs[drecIdx].loc.isNull()) {
            const DiskLoc loc = drecs[drecIdx].loc;
            const int size = drecs[drecIdx].size;
            invariant(size >= MmapV1RecordHeader::HeaderSize);
            const int bucket = RecordStoreV1Base::bucket(size);

            if (md->isCapped()) {
                // All drecs form a single list in bucket 0
                if (prevNextPtr == NULL) {
                    md->setDeletedListEntry(opCtx, 0, loc);
                } else {
                    *prevNextPtr = loc;
                }

                if (loc.a() < md->capExtent().a() &&
                    drecs[drecIdx + 1].loc.a() == md->capExtent().a()) {
                    // Bucket 1 is known as cappedLastDelRecLastExtent
                    md->setDeletedListEntry(opCtx, 1, loc);
                }
            } else if (bucket != lastBucket) {
                invariant(bucket > lastBucket);  // if this fails, drecs weren't sorted by bucket
                md->setDeletedListEntry(opCtx, bucket, loc);
                lastBucket = bucket;
            } else {
                *prevNextPtr = loc;
            }

            DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
            drec->lengthWithHeaders() = size;
            drec->extentOfs() = 0;
            drec->nextDeleted() = DiskLoc();
            prevNextPtr = &drec->nextDeleted();

            drecIdx++;
        }
    }

    if (legacyGrabBag && !legacyGrabBag[0].loc.isNull()) {
        invariant(!md->isCapped());  // capped should have an empty legacy grab bag.

        int grabBagIdx = 0;
        DiskLoc* prevNextPtr = NULL;
        while (!legacyGrabBag[grabBagIdx].loc.isNull()) {
            const DiskLoc loc = legacyGrabBag[grabBagIdx].loc;
            const int size = legacyGrabBag[grabBagIdx].size;
            invariant(size >= MmapV1RecordHeader::HeaderSize);

            if (grabBagIdx == 0) {
                md->setDeletedListLegacyGrabBag(opCtx, loc);
            } else {
                *prevNextPtr = loc;
            }

            DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
            drec->lengthWithHeaders() = size;
            drec->extentOfs() = 0;
            drec->nextDeleted() = DiskLoc();
            prevNextPtr = &drec->nextDeleted();

            grabBagIdx++;
        }
    }

    // Make sure we set everything up as requested.
    assertStateV1RS(opCtx, records, drecs, legacyGrabBag, em, md);
}
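
Both the drecs loop and the legacy grab bag loop in initializeV1RS() rely on the same pointer-to-pointer trick: prevNextPtr remembers where the next link of the list under construction must be written, so appending needs no special case once the list head is set. A minimal standalone sketch of that pattern follows; DeletedRec, bucket(), and the bucket sizes are hypothetical stand-ins, not MongoDB's DeletedRecord or RecordStoreV1Base::bucket().

// Standalone sketch of building per-bucket freelists with a prevNextPtr.
#include <cassert>
#include <vector>

struct DeletedRec {
    int size;
    DeletedRec* nextDeleted;
};

// Index of the freelist a deleted record of this size belongs to
// (illustrative only: one bucket per power of two starting at 64 bytes).
int bucket(int size) {
    int b = 0;
    while ((64 << b) < size)
        b++;
    return b;
}

int main() {
    // Deleted records, pre-sorted by bucket as initializeV1RS() requires.
    std::vector<DeletedRec> drecs = {
        {70, nullptr}, {100, nullptr}, {130, nullptr}, {300, nullptr}};
    std::vector<DeletedRec*> bucketHeads(16, nullptr);

    DeletedRec** prevNextPtr = nullptr;
    int lastBucket = -1;
    for (DeletedRec& d : drecs) {
        const int b = bucket(d.size);
        if (b != lastBucket) {
            assert(b > lastBucket);    // input must be sorted by bucket
            bucketHeads[b] = &d;       // start a new per-bucket list
            lastBucket = b;
        } else {
            *prevNextPtr = &d;         // append to the list under construction
        }
        d.nextDeleted = nullptr;       // this record is (for now) the tail
        prevNextPtr = &d.nextDeleted;  // where the next link must be written
    }

    assert(bucketHeads[1] == &drecs[0]);        // 70 and 100 share bucket 1
    assert(drecs[0].nextDeleted == &drecs[1]);
    assert(bucketHeads[2] == &drecs[2]);        // 130 falls into bucket 2
    assert(bucketHeads[3] == &drecs[3]);        // 300 falls into bucket 3
    return 0;
}

The assert(b > lastBucket) mirrors the original's requirement that drecs arrive sorted by bucket; the capped path, which chains everything through bucket 0, is omitted from the sketch.
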