Status RecordStoreV1Base::validate(OperationContext* txn,
                                   bool full,
                                   bool scanData,
                                   ValidateAdaptor* adaptor,
                                   ValidateResults* results,
                                   BSONObjBuilder* output) {
    // 1) basic stats that require no iteration
    // 2) extent level info
    // 3) check extent start and end
    // 4) check each non-deleted record
    // 5) check deleted list
    // -------------

    // 1111111111111111111
    if (isCapped()) {
        output->appendBool("capped", true);
        output->appendNumber("max", _details->maxCappedDocs());
    }

    output->appendNumber("datasize", _details->dataSize());
    output->appendNumber("nrecords", _details->numRecords());
    output->appendNumber("lastExtentSize", _details->lastExtentSize(txn));

    if (_details->firstExtent(txn).isNull())
        output->append("firstExtent", "null");
    else
        output->append("firstExtent",
                       str::stream()
                           << _details->firstExtent(txn).toString() << " ns:"
                           << _getExtent(txn, _details->firstExtent(txn))->nsDiagnostic.toString());
    if (_details->lastExtent(txn).isNull())
        output->append("lastExtent", "null");
    else
        output->append("lastExtent",
                       str::stream()
                           << _details->lastExtent(txn).toString() << " ns:"
                           << _getExtent(txn, _details->lastExtent(txn))->nsDiagnostic.toString());

    // 22222222222222222222222222
    {
        // validate extent basics
        BSONArrayBuilder extentData;
        int extentCount = 0;
        DiskLoc extentDiskLoc;
        try {
            if (!_details->firstExtent(txn).isNull()) {
                _getExtent(txn, _details->firstExtent(txn))->assertOk();
                _getExtent(txn, _details->lastExtent(txn))->assertOk();
            }

            extentDiskLoc = _details->firstExtent(txn);
            while (!extentDiskLoc.isNull()) {
                Extent* thisExtent = _getExtent(txn, extentDiskLoc);
                if (full) {
                    extentData << thisExtent->dump();
                }
                if (!thisExtent->validates(extentDiskLoc, &results->errors)) {
                    results->valid = false;
                }
                DiskLoc nextDiskLoc = thisExtent->xnext;

                if (extentCount > 0 && !nextDiskLoc.isNull() &&
                    _getExtent(txn, nextDiskLoc)->xprev != extentDiskLoc) {
                    StringBuilder sb;
                    sb << "'xprev' pointer " << _getExtent(txn, nextDiskLoc)->xprev.toString()
                       << " in extent " << nextDiskLoc.toString() << " does not point to extent "
                       << extentDiskLoc.toString();
                    results->errors.push_back(sb.str());
                    results->valid = false;
                }
                if (nextDiskLoc.isNull() && extentDiskLoc != _details->lastExtent(txn)) {
                    StringBuilder sb;
                    sb << "'lastExtent' pointer " << _details->lastExtent(txn).toString()
                       << " does not point to last extent in list " << extentDiskLoc.toString();
                    results->errors.push_back(sb.str());
                    results->valid = false;
                }
                extentDiskLoc = nextDiskLoc;
                extentCount++;
                txn->checkForInterrupt();
            }
        } catch (const DBException& e) {
            StringBuilder sb;
            sb << "exception validating extent " << extentCount << ": " << e.what();
            results->errors.push_back(sb.str());
            results->valid = false;
            return Status::OK();
        }
        output->append("extentCount", extentCount);

        if (full)
            output->appendArray("extents", extentData.arr());
    }

    try {
        // 333333333333333333333333333
        bool testingLastExtent = false;
        try {
            DiskLoc firstExtentLoc = _details->firstExtent(txn);
            if (firstExtentLoc.isNull()) {
                // this is ok
            } else {
                output->append("firstExtentDetails", _getExtent(txn, firstExtentLoc)->dump());
                if (!_getExtent(txn, firstExtentLoc)->xprev.isNull()) {
                    StringBuilder sb;
                    sb << "'xprev' pointer in 'firstExtent' "
                       << _details->firstExtent(txn).toString() << " is "
                       << _getExtent(txn, firstExtentLoc)->xprev.toString() << ", should be null";
                    results->errors.push_back(sb.str());
                    results->valid = false;
                }
            }
            testingLastExtent = true;
            DiskLoc lastExtentLoc = _details->lastExtent(txn);
            if (lastExtentLoc.isNull()) {
                // this is ok
            } else {
                if (firstExtentLoc != lastExtentLoc) {
                    output->append("lastExtentDetails", _getExtent(txn, lastExtentLoc)->dump());
                    if (!_getExtent(txn, lastExtentLoc)->xnext.isNull()) {
                        StringBuilder sb;
                        sb << "'xnext' pointer in 'lastExtent' " << lastExtentLoc.toString()
                           << " is " << _getExtent(txn, lastExtentLoc)->xnext.toString()
                           << ", should be null";
                        results->errors.push_back(sb.str());
                        results->valid = false;
                    }
                }
            }
        } catch (const DBException& e) {
            StringBuilder sb;
            sb << "exception processing '" << (testingLastExtent ? "lastExtent" : "firstExtent")
               << "': " << e.what();
            results->errors.push_back(sb.str());
            results->valid = false;
        }

        // 4444444444444444444444444
        set<DiskLoc> recs;
        if (scanData) {
            int n = 0;
            int nInvalid = 0;
            long long nQuantizedSize = 0;
            long long len = 0;
            long long nlen = 0;
            long long bsonLen = 0;
            int outOfOrder = 0;
            DiskLoc dl_last;

            auto cursor = getCursor(txn);
            while (auto record = cursor->next()) {
                const auto dl = DiskLoc::fromRecordId(record->id);
                n++;

                if (n < 1000000)
                    recs.insert(dl);

                if (isCapped()) {
                    if (dl < dl_last)
                        outOfOrder++;
                    dl_last = dl;
                }

                MmapV1RecordHeader* r = recordFor(dl);
                len += r->lengthWithHeaders();
                nlen += r->netLength();

                if (isQuantized(r->lengthWithHeaders())) {
                    // Count the number of records having a size consistent with
                    // the quantizeAllocationSpace quantization implementation.
                    ++nQuantizedSize;
                }

                if (full) {
                    size_t dataSize = 0;
                    const Status status = adaptor->validate(r->toRecordData(), &dataSize);
                    if (!status.isOK()) {
                        results->valid = false;
                        if (nInvalid == 0)  // only log once
                            results->errors.push_back("invalid object detected (see logs)");

                        nInvalid++;
                        log() << "Invalid object detected in " << _ns << ": " << status.reason();
                    } else {
                        bsonLen += dataSize;
                    }
                }
            }

            if (isCapped() && !_details->capLooped()) {
                output->append("cappedOutOfOrder", outOfOrder);
                if (outOfOrder > 1) {
                    results->valid = false;
                    results->errors.push_back("too many out of order records");
                }
            }
            output->append("objectsFound", n);

            if (full) {
                output->append("invalidObjects", nInvalid);
            }

            output->appendNumber("nQuantizedSize", nQuantizedSize);
            output->appendNumber("bytesWithHeaders", len);
            output->appendNumber("bytesWithoutHeaders", nlen);

            if (full) {
                output->appendNumber("bytesBson", bsonLen);
            }
        }  // end scanData

        // 55555555555555555555555555
        BSONArrayBuilder deletedListArray;
        for (int i = 0; i < Buckets; i++) {
            deletedListArray << _details->deletedListEntry(i).isNull();
        }

        int ndel = 0;
        long long delSize = 0;
        BSONArrayBuilder delBucketSizes;
        int incorrect = 0;
        for (int i = 0; i < Buckets; i++) {
            DiskLoc loc = _details->deletedListEntry(i);
            try {
                int k = 0;
                while (!loc.isNull()) {
                    if (recs.count(loc))
                        incorrect++;
                    ndel++;

                    if (loc.questionable()) {
                        if (isCapped() && !loc.isValid() && i == 1) {
                            /* the constructor for NamespaceDetails intentionally sets
                             * deletedList[1] to invalid; see comments in namespace.h
                             */
                            break;
                        }

                        string err(str::stream()
                                   << "bad pointer in deleted record list: " << loc.toString()
                                   << " bucket: " << i << " k: " << k);
                        results->errors.push_back(err);
                        results->valid = false;
                        break;
                    }

                    const DeletedRecord* d = deletedRecordFor(loc);
                    delSize += d->lengthWithHeaders();
                    loc = d->nextDeleted();
                    k++;
                    txn->checkForInterrupt();
                }
                delBucketSizes << k;
            } catch (...) {
                results->errors.push_back((string) "exception in deleted chain for bucket " +
                                          BSONObjBuilder::numStr(i));
                results->valid = false;
            }
        }
        output->appendNumber("deletedCount", ndel);
        output->appendNumber("deletedSize", delSize);
        if (full) {
            output->append("delBucketSizes", delBucketSizes.arr());
        }

        if (incorrect) {
            results->errors.push_back(BSONObjBuilder::numStr(incorrect) +
                                      " records from datafile are in deleted list");
            results->valid = false;
        }

    } catch (const AssertionException&) {
        results->errors.push_back("exception during validate");
        results->valid = false;
    }

    return Status::OK();
}
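// Copies every record out of 'extentLoc' into newly allocated extents, then unlinks the
// now-empty extent from the extent list and returns it to the extent manager. Called once
// per extent during compact(); relies on the record freelist having been orphaned at the
// start of the compact so the re-inserts below land in fresh extents.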
void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
                                         const DiskLoc extentLoc,
                                         int extentNumber,
                                         RecordStoreCompactAdaptor* adaptor,
                                         const CompactOptions* compactOptions,
                                         CompactStats* stats) {
    log() << "compact begin extent #" << extentNumber << " for namespace " << _ns << " "
          << extentLoc;

    unsigned oldObjSize = 0;  // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent* const sourceExtent = _extentManager->getExtent(extentLoc);
    sourceExtent->assertOk();
    fassert(17437, sourceExtent->validates(extentLoc));

    {
        // The next/prev MmapV1RecordHeader pointers within the Extent might not be in order
        // so we first page in the whole Extent sequentially.
        // TODO benchmark on slow storage to verify this is measurably faster.
        log() << "compact paging in len=" << sourceExtent->length / 1000000.0 << "MB" << endl;
        Timer t;
        size_t length = sourceExtent->length;

        touch_pages(reinterpret_cast<const char*>(sourceExtent), length);
        int ms = t.millis();
        if (ms > 1000)
            log() << "compact end paging in " << ms << "ms "
                  << sourceExtent->length / 1000000.0 / t.seconds() << "MB/sec" << endl;
    }

    {
        // Move each MmapV1RecordHeader out of this extent and insert it into the "new" extents.
        log() << "compact copying records" << endl;
        long long totalNetSize = 0;
        long long nrecords = 0;
        DiskLoc nextSourceLoc = sourceExtent->firstRecord;
        while (!nextSourceLoc.isNull()) {
            txn->checkForInterrupt();

            WriteUnitOfWork wunit(txn);
            MmapV1RecordHeader* recOld = recordFor(nextSourceLoc);
            RecordData oldData = recOld->toRecordData();
            nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc);

            if (compactOptions->validateDocuments && !adaptor->isDataValid(oldData)) {
                // object is corrupt!
                log() << "compact removing corrupt document!";
                stats->corruptDocuments++;
            } else {
                // How much data is in the record. Excludes padding and MmapV1RecordHeader headers.
                const unsigned rawDataSize = adaptor->dataSize(oldData);

                nrecords++;
                oldObjSize += rawDataSize;
                oldObjSizeWithPadding += recOld->netLength();

                // Allocation sizes include the headers and possibly some padding.
                const unsigned minAllocationSize = rawDataSize + MmapV1RecordHeader::HeaderSize;
                unsigned allocationSize = minAllocationSize;
                switch (compactOptions->paddingMode) {
                    case CompactOptions::NONE:  // default padding
                        if (shouldPadInserts()) {
                            allocationSize = quantizeAllocationSpace(minAllocationSize);
                        }
                        break;

                    case CompactOptions::PRESERVE:  // keep original padding
                        allocationSize = recOld->lengthWithHeaders();
                        break;

                    case CompactOptions::MANUAL:  // user specified how much padding to use
                        allocationSize = compactOptions->computeRecordSize(minAllocationSize);
                        if (allocationSize < minAllocationSize ||
                            allocationSize > BSONObjMaxUserSize / 2) {
                            allocationSize = minAllocationSize;
                        }
                        break;
                }
                invariant(allocationSize >= minAllocationSize);

                // Copy the data to a new record. Because we orphaned the record freelist at the
                // start of the compact, this insert will allocate a record in a new extent.
                // See the comment in compact() for more details.
                CompactDocWriter writer(recOld, rawDataSize, allocationSize);
                StatusWith<RecordId> status = insertRecordWithDocWriter(txn, &writer);
                uassertStatusOK(status.getStatus());
                const MmapV1RecordHeader* newRec =
                    recordFor(DiskLoc::fromRecordId(status.getValue()));
                invariant(unsigned(newRec->netLength()) >= rawDataSize);
                totalNetSize += newRec->netLength();

                // Tell the caller that the record has been moved, so it can do things such as
                // add it to indexes.
                adaptor->inserted(newRec->toRecordData(), status.getValue());
            }

            // Remove the old record from the linked list of records within the sourceExtent.
            // The old record is not added to the freelist as we will be freeing the whole
            // extent at the end.
            *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc;
            if (nextSourceLoc.isNull()) {
                // Just moved the last record out of the extent. Mark extent as empty.
                *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc();
            } else {
                MmapV1RecordHeader* newFirstRecord = recordFor(nextSourceLoc);
                txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs;
            }

            // Adjust the stats to reflect the removal of the old record. The insert above
            // handled adjusting the stats for the new record.
            _details->incrementStats(txn, -(recOld->netLength()), -1);

            wunit.commit();
        }

        // The extent must now be empty.
        invariant(sourceExtent->firstRecord.isNull());
        invariant(sourceExtent->lastRecord.isNull());

        // We are still the first extent, but we must not be the only extent.
        invariant(_details->firstExtent(txn) == extentLoc);
        invariant(_details->lastExtent(txn) != extentLoc);

        // Remove the newly emptied sourceExtent from the extent linked list and return it to
        // the extent manager.
        WriteUnitOfWork wunit(txn);
        const DiskLoc newFirst = sourceExtent->xnext;
        _details->setFirstExtent(txn, newFirst);
        *txn->recoveryUnit()->writing(&_extentManager->getExtent(newFirst)->xprev) = DiskLoc();
        _extentManager->freeExtent(txn, extentLoc);
        wunit.commit();

        {
            const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize
                                                 : 1.0;  // defining 0/0 as 1 for this.
            log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                  << " documents (" << totalNetSize / (1024 * 1024.0) << "MB)"
                  << " oldPadding: " << oldPadding;
        }
    }
}
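// Removes the record identified by 'rid' from the record store: unlinks it from its extent's
// next/prev record chain, fixes up the extent's firstRecord/lastRecord pointers, decrements
// the collection stats, and (except for system.indexes, whose records are zeroed and never
// reused) stamps the record and returns its space to the deleted-record freelist.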
void RecordStoreV1Base::deleteRecord(OperationContext* txn, const RecordId& rid) {
    const DiskLoc dl = DiskLoc::fromRecordId(rid);

    MmapV1RecordHeader* todelete = recordFor(dl);
    invariant(todelete->netLength() >= 4);  // this is required for defensive code

    /* remove ourself from the record next/prev chain */
    {
        if (todelete->prevOfs() != DiskLoc::NullOfs) {
            DiskLoc prev = getPrevRecordInExtent(txn, dl);
            MmapV1RecordHeader* prevRecord = recordFor(prev);
            txn->recoveryUnit()->writingInt(prevRecord->nextOfs()) = todelete->nextOfs();
        }

        if (todelete->nextOfs() != DiskLoc::NullOfs) {
            DiskLoc next = getNextRecord(txn, dl);
            MmapV1RecordHeader* nextRecord = recordFor(next);
            txn->recoveryUnit()->writingInt(nextRecord->prevOfs()) = todelete->prevOfs();
        }
    }

    /* remove ourself from extent pointers */
    {
        DiskLoc extentLoc = todelete->myExtentLoc(dl);
        Extent* e = _getExtent(txn, extentLoc);
        if (e->firstRecord == dl) {
            txn->recoveryUnit()->writing(&e->firstRecord);
            if (todelete->nextOfs() == DiskLoc::NullOfs)
                e->firstRecord.Null();
            else
                e->firstRecord.set(dl.a(), todelete->nextOfs());
        }
        if (e->lastRecord == dl) {
            txn->recoveryUnit()->writing(&e->lastRecord);
            if (todelete->prevOfs() == DiskLoc::NullOfs)
                e->lastRecord.Null();
            else
                e->lastRecord.set(dl.a(), todelete->prevOfs());
        }
    }

    /* add to the free list */
    {
        _details->incrementStats(txn, -1 * todelete->netLength(), -1);

        if (_isSystemIndexes) {
            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
               careful until validated more, as IndexDetails has pointers to this
               disk location. so an incorrectly done remove would cause a lot of problems.
            */
            memset(txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders()),
                   0,
                   todelete->lengthWithHeaders());
        } else {
            // this is defensive so we can detect if we are still using a location
            // that was deleted
            memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4);
            addDeletedRec(txn, dl);
        }
    }
}