void RecordStoreV1Base::deleteRecord(OperationContext* txn, const RecordId& rid) {
    const DiskLoc dl = DiskLoc::fromRecordId(rid);

    MmapV1RecordHeader* todelete = recordFor(dl);
    invariant(todelete->netLength() >= 4);  // this is required for the defensive code below

    /* remove ourself from the record next/prev chain */
    {
        if (todelete->prevOfs() != DiskLoc::NullOfs) {
            DiskLoc prev = getPrevRecordInExtent(txn, dl);
            MmapV1RecordHeader* prevRecord = recordFor(prev);
            txn->recoveryUnit()->writingInt(prevRecord->nextOfs()) = todelete->nextOfs();
        }

        if (todelete->nextOfs() != DiskLoc::NullOfs) {
            DiskLoc next = getNextRecord(txn, dl);
            MmapV1RecordHeader* nextRecord = recordFor(next);
            txn->recoveryUnit()->writingInt(nextRecord->prevOfs()) = todelete->prevOfs();
        }
    }

    /* remove ourself from extent pointers */
    {
        DiskLoc extentLoc = todelete->myExtentLoc(dl);
        Extent* e = _getExtent(txn, extentLoc);

        if (e->firstRecord == dl) {
            txn->recoveryUnit()->writing(&e->firstRecord);
            if (todelete->nextOfs() == DiskLoc::NullOfs)
                e->firstRecord.Null();
            else
                e->firstRecord.set(dl.a(), todelete->nextOfs());
        }

        if (e->lastRecord == dl) {
            txn->recoveryUnit()->writing(&e->lastRecord);
            if (todelete->prevOfs() == DiskLoc::NullOfs)
                e->lastRecord.Null();
            else
                e->lastRecord.set(dl.a(), todelete->prevOfs());
        }
    }

    /* add to the free list */
    {
        _details->incrementStats(txn, -1 * todelete->netLength(), -1);

        if (_isSystemIndexes) {
            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
               careful until validated more, as IndexDetails has pointers to this
               disk location. so an incorrectly done remove would cause a lot of
               problems.
            */
            memset(txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders()),
                   0,
                   todelete->lengthWithHeaders());
        } else {
            // this is defensive so we can detect if we are still using a location
            // that was deleted
            memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4);
            addDeletedRec(txn, dl);
        }
    }
}
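// A minimal sketch (hypothetical toy types, not part of the storage engine) of
// the unlink step above: records within an extent form a doubly-linked list
// keyed by file offsets rather than raw pointers, with a null-offset sentinel
// standing in for DiskLoc::NullOfs. deleteRecord() performs the same two
// splice writes, but routes them through recoveryUnit()->writingInt() so they
// are journaled before the memory-mapped pages change.
#include <map>

namespace toy {
constexpr int kNullOfs = -1;  // stand-in for DiskLoc::NullOfs

struct Record {
    int prevOfs = kNullOfs;
    int nextOfs = kNullOfs;
};

// Splice the record at 'ofs' out of the offset-keyed doubly-linked list.
void unlink(std::map<int, Record>& extent, int ofs) {
    Record& doomed = extent.at(ofs);
    if (doomed.prevOfs != kNullOfs)
        extent.at(doomed.prevOfs).nextOfs = doomed.nextOfs;  // prev skips over doomed
    if (doomed.nextOfs != kNullOfs)
        extent.at(doomed.nextOfs).prevOfs = doomed.prevOfs;  // next points back past doomed
}
}  // namespace toy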
void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
                                         const DiskLoc extentLoc,
                                         int extentNumber,
                                         RecordStoreCompactAdaptor* adaptor,
                                         const CompactOptions* compactOptions,
                                         CompactStats* stats) {
    log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " "
          << extentLoc;

    unsigned oldObjSize = 0;  // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent* const sourceExtent = _extentManager->getExtent(extentLoc);
    sourceExtent->assertOk();
    fassert(17437, sourceExtent->validates(extentLoc));

    {
        // The next/prev MmapV1RecordHeader pointers within the Extent might not be in order
        // so we first page in the whole Extent sequentially.
        // TODO benchmark on slow storage to verify this is measurably faster.
        log() << "compact paging in len=" << sourceExtent->length / 1000000.0 << "MB" << endl;
        Timer t;
        size_t length = sourceExtent->length;

        touch_pages(reinterpret_cast<const char*>(sourceExtent), length);
        int ms = t.millis();
        if (ms > 1000)
            log() << "compact end paging in " << ms << "ms "
                  << sourceExtent->length / 1000000.0 / t.seconds() << "MB/sec" << endl;
    }

    {
        // Move each MmapV1RecordHeader out of this extent and insert it into the "new" extents.
        log() << "compact copying records" << endl;
        long long totalNetSize = 0;
        long long nrecords = 0;
        DiskLoc nextSourceLoc = sourceExtent->firstRecord;
        while (!nextSourceLoc.isNull()) {
            txn->checkForInterrupt();

            WriteUnitOfWork wunit(txn);
            MmapV1RecordHeader* recOld = recordFor(nextSourceLoc);
            RecordData oldData = recOld->toRecordData();
            nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc);

            if (compactOptions->validateDocuments && !adaptor->isDataValid(oldData)) {
                // object is corrupt!
                log() << "compact removing corrupt document!";
                stats->corruptDocuments++;
            } else {
                // How much data is in the record. Excludes padding and MmapV1RecordHeader
                // headers.
                const unsigned rawDataSize = adaptor->dataSize(oldData);

                nrecords++;
                oldObjSize += rawDataSize;
                oldObjSizeWithPadding += recOld->netLength();

                // Allocation sizes include the headers and possibly some padding.
                const unsigned minAllocationSize = rawDataSize + MmapV1RecordHeader::HeaderSize;
                unsigned allocationSize = minAllocationSize;
                switch (compactOptions->paddingMode) {
                    case CompactOptions::NONE:  // default padding
                        if (shouldPadInserts()) {
                            allocationSize = quantizeAllocationSpace(minAllocationSize);
                        }
                        break;

                    case CompactOptions::PRESERVE:  // keep original padding
                        allocationSize = recOld->lengthWithHeaders();
                        break;

                    case CompactOptions::MANUAL:  // user specified how much padding to use
                        allocationSize = compactOptions->computeRecordSize(minAllocationSize);
                        if (allocationSize < minAllocationSize ||
                            allocationSize > BSONObjMaxUserSize / 2) {
                            allocationSize = minAllocationSize;
                        }
                        break;
                }
                invariant(allocationSize >= minAllocationSize);

                // Copy the data to a new record. Because we orphaned the record freelist at the
                // start of the compact, this insert will allocate a record in a new extent.
                // See the comment in compact() for more details.
                CompactDocWriter writer(recOld, rawDataSize, allocationSize);
                StatusWith<RecordId> status = insertRecordWithDocWriter(txn, &writer);
                uassertStatusOK(status.getStatus());
                const MmapV1RecordHeader* newRec =
                    recordFor(DiskLoc::fromRecordId(status.getValue()));
                invariant(unsigned(newRec->netLength()) >= rawDataSize);
                totalNetSize += newRec->netLength();

                // Tells the caller that the record has been moved, so it can do things such as
                // add it to indexes.
                adaptor->inserted(newRec->toRecordData(), status.getValue());
            }

            // Remove the old record from the linked list of records within the sourceExtent.
            // The old record is not added to the freelist as we will be freeing the whole
            // extent at the end.
            *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc;
            if (nextSourceLoc.isNull()) {
                // Just moved the last record out of the extent. Mark extent as empty.
                *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc();
            } else {
                MmapV1RecordHeader* newFirstRecord = recordFor(nextSourceLoc);
                txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs;
            }

            // Adjust the stats to reflect the removal of the old record. The insert above
            // handled adjusting the stats for the new record.
            _details->incrementStats(txn, -(recOld->netLength()), -1);

            wunit.commit();
        }

        // The extent must now be empty.
        invariant(sourceExtent->firstRecord.isNull());
        invariant(sourceExtent->lastRecord.isNull());

        // We are still the first extent, but we must not be the only extent.
        invariant(_details->firstExtent(txn) == extentLoc);
        invariant(_details->lastExtent(txn) != extentLoc);

        // Remove the newly emptied sourceExtent from the extent linked list and return it to
        // the extent manager.
        WriteUnitOfWork wunit(txn);
        const DiskLoc newFirst = sourceExtent->xnext;
        _details->setFirstExtent(txn, newFirst);
        *txn->recoveryUnit()->writing(&_extentManager->getExtent(newFirst)->xprev) = DiskLoc();
        _extentManager->freeExtent(txn, extentLoc);
        wunit.commit();

        {
            const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize
                                                 : 1.0;  // defining 0/0 as 1 for this.

            log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                  << " documents (" << totalNetSize / (1024 * 1024.0) << "MB)"
                  << " oldPadding: " << oldPadding;
        }
    }
}
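// A minimal sketch of how the three CompactOptions padding modes above choose
// an allocation size. Names here are hypothetical, toyQuantize() rounds up to
// the next power of two purely for illustration (it is not the real
// quantizeAllocationSpace() policy), and the MANUAL upper-bound check against
// BSONObjMaxUserSize / 2 is elided.
namespace toy {
enum class PaddingMode { kNone, kPreserve, kManual };

unsigned toyQuantize(unsigned minSize) {
    unsigned q = 32;  // assumed smallest allocation unit for this sketch
    while (q < minSize)
        q *= 2;
    return q;
}

unsigned chooseAllocationSize(PaddingMode mode,
                              unsigned minAllocationSize,     // data + header
                              unsigned oldLengthWithHeaders,  // what PRESERVE keeps
                              unsigned manualRequest) {       // what MANUAL asks for
    switch (mode) {
        case PaddingMode::kNone:      // quantize, as when shouldPadInserts() is true
            return toyQuantize(minAllocationSize);
        case PaddingMode::kPreserve:  // keep whatever padding the old record had
            return oldLengthWithHeaders;
        case PaddingMode::kManual:    // honor the request, but never go below the minimum
            return manualRequest < minAllocationSize ? minAllocationSize : manualRequest;
    }
    return minAllocationSize;  // unreachable; keeps compilers happy
}
}  // namespace toy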
void initializeV1RS(OperationContext* opCtx,
                    const LocAndSize* records,
                    const LocAndSize* drecs,
                    const LocAndSize* legacyGrabBag,
                    DummyExtentManager* em,
                    DummyRecordStoreV1MetaData* md) {
    invariant(records || drecs);  // if both are NULL nothing is being created...

    // Need to start with a blank slate
    invariant(em->numFiles() == 0);
    invariant(md->firstExtent(opCtx).isNull());

    // pre-allocate extents (even extents that aren't part of this RS)
    {
        typedef std::map<int, size_t> ExtentSizes;
        ExtentSizes extentSizes;
        accumulateExtentSizeRequirements(records, &extentSizes);
        accumulateExtentSizeRequirements(drecs, &extentSizes);
        accumulateExtentSizeRequirements(legacyGrabBag, &extentSizes);
        invariant(!extentSizes.empty());

        const int maxExtent = extentSizes.rbegin()->first;
        for (int i = 0; i <= maxExtent; i++) {
            const size_t size = extentSizes.count(i) ? extentSizes[i] : 0;
            const DiskLoc loc = em->allocateExtent(opCtx, md->isCapped(), size, 0);

            // This function and assertState depend on these details of DummyExtentManager
            invariant(loc.a() == i);
            invariant(loc.getOfs() == 0);
        }

        // link together extents that should be part of this RS
        md->setFirstExtent(opCtx, DiskLoc(extentSizes.begin()->first, 0));
        md->setLastExtent(opCtx, DiskLoc(extentSizes.rbegin()->first, 0));
        for (ExtentSizes::iterator it = extentSizes.begin();
             boost::next(it) != extentSizes.end();
             /* ++it */) {
            const int a = it->first;
            ++it;
            const int b = it->first;
            em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0);
            em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0);
        }

        // This signals "done allocating new extents".
        if (md->isCapped())
            md->setDeletedListEntry(opCtx, 1, DiskLoc());
    }

    if (records && !records[0].loc.isNull()) {
        int recIdx = 0;
        DiskLoc extLoc = md->firstExtent(opCtx);
        while (!extLoc.isNull()) {
            Extent* ext = em->getExtent(extLoc);
            int prevOfs = DiskLoc::NullOfs;
            while (extLoc.a() == records[recIdx].loc.a()) {  // for all records in this extent
                const DiskLoc loc = records[recIdx].loc;
                const int size = records[recIdx].size;
                invariant(size >= MmapV1RecordHeader::HeaderSize);

                md->incrementStats(opCtx, size - MmapV1RecordHeader::HeaderSize, 1);

                if (ext->firstRecord.isNull())
                    ext->firstRecord = loc;

                MmapV1RecordHeader* rec = em->recordForV1(loc);
                rec->lengthWithHeaders() = size;
                rec->extentOfs() = 0;

                rec->prevOfs() = prevOfs;
                prevOfs = loc.getOfs();

                const DiskLoc nextLoc = records[recIdx + 1].loc;
                if (nextLoc.a() == loc.a()) {  // if next is in same extent
                    rec->nextOfs() = nextLoc.getOfs();
                } else {
                    rec->nextOfs() = DiskLoc::NullOfs;
                    ext->lastRecord = loc;
                }

                recIdx++;
            }
            extLoc = ext->xnext;
        }
        invariant(records[recIdx].loc.isNull());
    }

    if (drecs && !drecs[0].loc.isNull()) {
        int drecIdx = 0;
        DiskLoc* prevNextPtr = NULL;
        int lastBucket = -1;
        while (!drecs[drecIdx].loc.isNull()) {
            const DiskLoc loc = drecs[drecIdx].loc;
            const int size = drecs[drecIdx].size;
            invariant(size >= MmapV1RecordHeader::HeaderSize);
            const int bucket = RecordStoreV1Base::bucket(size);

            if (md->isCapped()) {
                // All drecs form a single list in bucket 0
                if (prevNextPtr == NULL) {
                    md->setDeletedListEntry(opCtx, 0, loc);
                } else {
                    *prevNextPtr = loc;
                }

                if (loc.a() < md->capExtent().a() &&
                    drecs[drecIdx + 1].loc.a() == md->capExtent().a()) {
                    // Bucket 1 is known as cappedLastDelRecLastExtent
                    md->setDeletedListEntry(opCtx, 1, loc);
                }
            } else if (bucket != lastBucket) {
                invariant(bucket > lastBucket);  // if this fails, drecs weren't sorted by bucket
                md->setDeletedListEntry(opCtx, bucket, loc);
                lastBucket = bucket;
            } else {
                *prevNextPtr = loc;
            }

            DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
            drec->lengthWithHeaders() = size;
            drec->extentOfs() = 0;
            drec->nextDeleted() = DiskLoc();
            prevNextPtr = &drec->nextDeleted();

            drecIdx++;
        }
    }

    if (legacyGrabBag && !legacyGrabBag[0].loc.isNull()) {
        invariant(!md->isCapped());  // capped should have an empty legacy grab bag.

        int grabBagIdx = 0;
        DiskLoc* prevNextPtr = NULL;
        while (!legacyGrabBag[grabBagIdx].loc.isNull()) {
            const DiskLoc loc = legacyGrabBag[grabBagIdx].loc;
            const int size = legacyGrabBag[grabBagIdx].size;
            invariant(size >= MmapV1RecordHeader::HeaderSize);

            if (grabBagIdx == 0) {
                md->setDeletedListLegacyGrabBag(opCtx, loc);
            } else {
                *prevNextPtr = loc;
            }

            DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
            drec->lengthWithHeaders() = size;
            drec->extentOfs() = 0;
            drec->nextDeleted() = DiskLoc();
            prevNextPtr = &drec->nextDeleted();

            grabBagIdx++;
        }
    }

    // Make sure we set everything up as requested.
    assertStateV1RS(opCtx, records, drecs, legacyGrabBag, em, md);
}
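// A minimal sketch (toy types, not the test helper's real API) of the
// prevNextPtr idiom used throughout initializeV1RS() to chain deleted records:
// keeping a pointer to the previous node's next-link lets each node be
// appended in O(1) with no list walk. The helper above special-cases the list
// head only because its head lives behind setDeletedListEntry() /
// setDeletedListLegacyGrabBag() rather than behind a raw DiskLoc*.
#include <vector>

namespace toy {
struct DeletedRec {
    int size = 0;
    DeletedRec* nextDeleted = nullptr;
};

// Chain every record into one singly-linked free list and return its head.
DeletedRec* buildFreeList(std::vector<DeletedRec>& recs) {
    DeletedRec* head = nullptr;
    DeletedRec** prevNextPtr = &head;  // initially points at the head slot itself
    for (DeletedRec& r : recs) {
        *prevNextPtr = &r;             // patch the previous link (or the head) to r
        r.nextDeleted = nullptr;       // r is the tail until the next iteration
        prevNextPtr = &r.nextDeleted;  // the next node will be linked through r
    }
    return head;
}
}  // namespace toy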