// Removes the record at 'dl' from its extent and retires its storage.
//
// Order of operations:
//   1. splice the previous/next records in the chain around the record,
//   2. repoint the extent's firstRecord/lastRecord if either referenced 'dl',
//   3. decrement the collection stats by the record's net length and by one record,
//   4. zero the whole record (system.indexes) or pass it to addDeletedRec().
void RecordStoreV1Base::deleteRecord( TransactionExperiment* txn, const DiskLoc& dl ) {
    Record* victim = recordFor( dl );

    // 1. Unlink from the next/prev chain.
    if ( DiskLoc::NullOfs != victim->prevOfs() ) {
        Record* before = recordFor( getPrevRecordInExtent( dl ) );
        txn->writingInt( before->nextOfs() ) = victim->nextOfs();
    }
    if ( DiskLoc::NullOfs != victim->nextOfs() ) {
        Record* after = recordFor( getNextRecord( dl ) );
        txn->writingInt( after->prevOfs() ) = victim->prevOfs();
    }

    // 2. Fix up the owning extent's end pointers.
    Extent* owner = txn->writing( _getExtent( _getExtentLocForRecord( dl ) ) );
    if ( owner->firstRecord == dl ) {
        if ( victim->nextOfs() != DiskLoc::NullOfs )
            owner->firstRecord.set( dl.a(), victim->nextOfs() );
        else
            owner->firstRecord.Null();
    }
    if ( owner->lastRecord == dl ) {
        if ( victim->prevOfs() != DiskLoc::NullOfs )
            owner->lastRecord.set( dl.a(), victim->prevOfs() );
        else
            owner->lastRecord.Null();
    }

    // 3. One record fewer, netLength() bytes fewer.
    _details->incrementStats( txn, -victim->netLength(), -1 );

    // 4. Retire the space.
    if ( !_isSystemIndexes ) {
        DEV {
            // Debug builds clear the first 8 bytes of the payload.
            unsigned long long* firstWord =
                reinterpret_cast<unsigned long long*>( victim->data() );
            *txn->writing( firstWord ) = 0;
        }
        addDeletedRec( txn, dl );
        return;
    }

    /* temp: if in system.indexes, don't reuse, and zero out: we want to be
       careful until validated more, as IndexDetails has pointers
       to this disk location.  so an incorrectly done remove would cause
       a lot of problems.
    */
    memset( txn->writingPtr( victim, victim->lengthWithHeaders() ),
            0,
            victim->lengthWithHeaders() );
}
// Overwrites the record stored at 'loc' with a copy of data[0, len) and returns 'loc'.
//
// In a capped store, an update that would enlarge the record is refused with
// ErrorCodes::InternalError (code 10003). The previous contents are registered with the
// recovery unit as a RemoveChange before they are replaced.
// 'enforceQuota' and 'notifier' are not read by this implementation.
StatusWith<DiskLoc> HeapRecordStore::updateRecord(OperationContext* txn,
                                                  const DiskLoc& loc,
                                                  const char* data,
                                                  int len,
                                                  bool enforceQuota,
                                                  UpdateMoveNotifier* notifier ) {
    HeapRecord* current = recordFor( loc );
    const int previousLen = current->size;

    const bool wouldGrow = len > previousLen;
    if ( wouldGrow && _isCapped ) {
        return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                    "failing update: objects in a capped ns cannot grow",
                                    10003 );
    }

    // Build the replacement in a fresh record.
    HeapRecord replacement( len );
    memcpy( replacement.data.get(), data, len );

    // The change must capture the old contents, so register it before overwriting.
    txn->recoveryUnit()->registerChange( new RemoveChange( _data, loc, *current ) );

    _data->dataSize += len - previousLen;
    *current = replacement;

    cappedDeleteAsNeeded( txn );

    return StatusWith<DiskLoc>( loc );
}
// Allocates a record big enough for 'len' payload bytes plus the record header, copies
// 'data' into it, links it at the tail of its extent and bumps the collection stats.
//
// Returns the new location, or the failed status from allocRecord().
StatusWith<DiskLoc> RecordStoreV1Base::_insertRecord( OperationContext* txn,
                                                      const char* data,
                                                      int len,
                                                      bool enforceQuota ) {
    const int allocLen = getRecordAllocationSize( len + Record::HeaderSize );
    fassert( 17208, allocLen >= ( len + Record::HeaderSize ) );

    StatusWith<DiskLoc> allocated = allocRecord( txn, allocLen, enforceQuota );
    if ( !allocated.isOK() ) {
        return allocated;
    }

    Record* rec = recordFor( allocated.getValue() );
    fassert( 17210, rec->lengthWithHeaders() >= allocLen );

    // Declare the whole allocation as about to be written, then copy the payload in.
    rec = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr( rec, allocLen ) );
    memcpy( rec->data(), data, len );

    _addRecordToRecListInExtent( txn, rec, allocated.getValue() );
    _details->incrementStats( txn, rec->netLength(), 1 );

    return allocated;
}
// Inserts a document produced by 'doc' and returns where it was stored.
//
// Documents smaller than 4 bytes are rejected with InvalidLength. The allocation is
// documentSize() + header, run through getRecordAllocationSize() only when the writer
// asks for padding. On success the record is linked into its extent, the stats are
// incremented and _paddingFits() is called.
StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( OperationContext* txn,
                                                     const DocWriter* doc,
                                                     bool enforceQuota ) {
    const int payloadLen = doc->documentSize();
    if ( payloadLen < 4 ) {
        return StatusWith<DiskLoc>( ErrorCodes::InvalidLength,
                                    "record has to be >= 4 bytes" );
    }

    int allocLen = payloadLen + Record::HeaderSize;
    if ( doc->addPadding() ) {
        allocLen = getRecordAllocationSize( allocLen );
    }

    StatusWith<DiskLoc> allocated = allocRecord( txn, allocLen, enforceQuota );
    if ( !allocated.isOK() ) {
        return allocated;
    }

    Record* rec = recordFor( allocated.getValue() );
    fassert( 17319, rec->lengthWithHeaders() >= allocLen );

    // Declare write intent over the allocation and let the writer fill the payload.
    rec = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr( rec, allocLen ) );
    doc->writeDocument( rec->data() );

    _addRecordToRecListInExtent( txn, rec, allocated.getValue() );
    _details->incrementStats( txn, rec->netLength(), 1 );
    _paddingFits( txn );

    return allocated;
}
// Applies a list of byte-range patches ("damages") to the record at 'loc'.
//
// A same-sized copy of the record is made, the old contents are registered with the
// recovery unit as a RemoveChange, and each damage event copies 'size' bytes from
// damageSource + sourceOffset to the copy at targetOffset. 'oldRec' is not read.
// Always returns Status::OK().
Status HeapRecordStore::updateWithDamages( OperationContext* txn,
                                           const DiskLoc& loc,
                                           const RecordData& oldRec,
                                           const char* damageSource,
                                           const mutablebson::DamageVector& damages ) {
    HeapRecord* stored = recordFor( loc );
    const int size = stored->size;

    HeapRecord patched( size );
    memcpy( patched.data.get(), stored->data.get(), size );

    // Register the pre-image before the stored record is touched.
    txn->recoveryUnit()->registerChange( new RemoveChange( _data, loc, *stored ) );

    // NOTE(review): the record is assigned here and again after patching, and
    // cappedDeleteAsNeeded() runs even though the size is unchanged. Both are kept as-is;
    // confirm whether the first assignment and the capped check are needed.
    *stored = patched;
    cappedDeleteAsNeeded( txn );

    char* const base = patched.data.get();
    typedef mutablebson::DamageVector::const_iterator DamageIter;
    DamageIter event = damages.begin();
    const DamageIter stop = damages.end();
    while ( event != stop ) {
        std::memcpy( base + event->targetOffset,
                     damageSource + event->sourceOffset,
                     event->size );
        ++event;
    }

    *stored = patched;

    return Status::OK();
}
// Stores 'len' bytes from 'data' in a newly allocated record and returns its RecordId.
//
// The allocation is len + header, quantized only when shouldPadInserts() is true.
// A failed allocation is returned as the corresponding error status.
StatusWith<RecordId> RecordStoreV1Base::_insertRecord(OperationContext* txn,
                                                      const char* data,
                                                      int len,
                                                      bool enforceQuota) {
    const int needed = len + MmapV1RecordHeader::HeaderSize;

    int toAllocate = needed;
    if (shouldPadInserts()) {
        toAllocate = quantizeAllocationSpace(needed);
    }
    fassert(17208, toAllocate >= needed);

    StatusWith<DiskLoc> allocated = allocRecord(txn, toAllocate, enforceQuota);
    if (!allocated.isOK()) {
        return StatusWith<RecordId>(allocated.getStatus());
    }

    MmapV1RecordHeader* header = recordFor(allocated.getValue());
    fassert(17210, header->lengthWithHeaders() >= needed);

    // Write intent is declared over the unpadded length only; then copy the payload.
    header = reinterpret_cast<MmapV1RecordHeader*>(
        txn->recoveryUnit()->writingPtr(header, needed));
    memcpy(header->data(), data, len);

    _addRecordToRecListInExtent(txn, header, allocated.getValue());
    _details->incrementStats(txn, header->netLength(), 1);

    return StatusWith<RecordId>(allocated.getValue().toRecordId());
}
// Inserts the document produced by 'doc' and returns its RecordId.
//
// Rejects documents under 4 bytes and records whose header-inclusive length exceeds
// MaxAllowedAllocation (both InvalidLength). Padding is applied only when both the
// writer and the store ask for it.
StatusWith<RecordId> RecordStoreV1Base::insertRecord(OperationContext* txn,
                                                     const DocWriter* doc,
                                                     bool enforceQuota) {
    const int payloadLen = doc->documentSize();
    if (payloadLen < 4) {
        return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be >= 4 bytes");
    }

    const int needed = payloadLen + MmapV1RecordHeader::HeaderSize;
    if (needed > MaxAllowedAllocation) {
        return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be <= 16.5MB");
    }

    int toAllocate = needed;
    if (doc->addPadding() && shouldPadInserts()) {
        toAllocate = quantizeAllocationSpace(needed);
    }

    StatusWith<DiskLoc> allocated = allocRecord(txn, toAllocate, enforceQuota);
    if (!allocated.isOK()) {
        return StatusWith<RecordId>(allocated.getStatus());
    }

    MmapV1RecordHeader* header = recordFor(allocated.getValue());
    fassert(17319, header->lengthWithHeaders() >= needed);

    // Declare write intent, then have the writer serialize straight into the record.
    header = reinterpret_cast<MmapV1RecordHeader*>(
        txn->recoveryUnit()->writingPtr(header, needed));
    doc->writeDocument(header->data());

    _addRecordToRecListInExtent(txn, header, allocated.getValue());
    _details->incrementStats(txn, header->netLength(), 1);

    return StatusWith<RecordId>(allocated.getValue().toRecordId());
}
void RecordStoreV1Base::IntraExtentIterator::advance() { if (_curr.isNull()) return; const MmapV1RecordHeader* rec = recordFor(_curr); const int nextOfs = _forward ? rec->nextOfs() : rec->prevOfs(); _curr = (nextOfs == DiskLoc::NullOfs ? DiskLoc() : DiskLoc(_curr.a(), nextOfs)); }
// Appends record 'r' (located at 'loc') to the tail of its extent's record chain.
//
// The extent pointers and the old tail's nextOfs are written through the recovery unit.
// r's own link fields are assigned directly -- presumably the caller has already
// declared write intent over 'r'; confirm against callers.
void RecordStoreV1Base::_addRecordToRecListInExtent(OperationContext* txn,
                                                    Record *r,
                                                    DiskLoc loc) {
    dassert( recordFor(loc) == r );
    Extent* extent = _getExtent( _getExtentLocForRecord( loc ) );

    if ( !extent->lastRecord.isNull() ) {
        // Non-empty extent: hang 'r' after the current tail.
        Record* tail = recordFor( extent->lastRecord );
        r->prevOfs() = extent->lastRecord.getOfs();
        r->nextOfs() = DiskLoc::NullOfs;
        txn->recoveryUnit()->writingInt( tail->nextOfs() ) = loc.getOfs();
        *txn->recoveryUnit()->writing( &extent->lastRecord ) = loc;
        return;
    }

    // Empty extent: 'r' becomes both the first and the last record.
    *txn->recoveryUnit()->writing( &extent->firstRecord ) = loc;
    *txn->recoveryUnit()->writing( &extent->lastRecord ) = loc;
    r->nextOfs() = DiskLoc::NullOfs;
    r->prevOfs() = DiskLoc::NullOfs;
}
// Returns the location of the record following 'loc' within the same extent, or a
// null DiskLoc when 'loc' is the last record in its chain.
DiskLoc RecordStoreV1Base::getNextRecordInExtent( const DiskLoc& loc ) const {
    const int ofs = recordFor( loc )->nextOfs();

    if ( ofs != DiskLoc::NullOfs ) {
        // defensive: a real offset never has magnitude below 8
        fassert( 17441, abs(ofs) >= 8 );
        return DiskLoc( loc.a(), ofs );
    }

    return DiskLoc();
}
// Follows the nextOfs link of the record at 'loc'. The result shares loc's file number;
// a null DiskLoc is returned when there is no next record.
DiskLoc ExtentManager::getNextRecordInExtent( const DiskLoc& loc ) const {
    const int followingOfs = recordFor( loc )->nextOfs();
    const bool atEnd = ( followingOfs == DiskLoc::NullOfs );

    if ( atEnd ) {
        return DiskLoc();
    }

    fassert( 16967, abs(followingOfs) >= 8 ); // defensive
    return DiskLoc( loc.a(), followingOfs );
}
// Follows the prevOfs link of the record at 'loc'. The result shares loc's file number;
// a null DiskLoc is returned when there is no previous record.
DiskLoc ExtentManager::getPrevRecordInExtent( const DiskLoc& loc ) const {
    const int precedingOfs = recordFor( loc )->prevOfs();
    const bool atStart = ( precedingOfs == DiskLoc::NullOfs );

    if ( atStart ) {
        return DiskLoc();
    }

    fassert( 16968, abs(precedingOfs) >= 8 ); // defensive
    return DiskLoc( loc.a(), precedingOfs );
}
// Returns the location of the record preceding 'loc' within the same extent, or a null
// DiskLoc when 'loc' is the first record in its chain. 'txn' is not used here.
DiskLoc RecordStoreV1Base::getPrevRecordInExtent( OperationContext* txn,
                                                  const DiskLoc& loc ) const {
    const int ofs = recordFor( loc )->prevOfs();

    if ( ofs != DiskLoc::NullOfs ) {
        // defensive: a real offset never has magnitude below 8
        fassert( 17442, abs(ofs) >= 8 );
        return DiskLoc( loc.a(), ofs );
    }

    return DiskLoc();
}
// Returns the location of the record preceding 'loc' within the same extent, or a null
// DiskLoc when 'loc' is the first record in its chain.
DiskLoc RecordStoreV1Base::getPrevRecordInExtent( const DiskLoc& loc ) const {
    const int precedingOfs = recordFor( loc )->prevOfs();

    if ( DiskLoc::NullOfs == precedingOfs ) {
        return DiskLoc();
    }

    // defensive: a real offset never has magnitude below 8
    fassert( 17442, abs(precedingOfs) >= 8 );

    DiskLoc preceding( loc.a(), precedingOfs );
    return preceding;
}
// Returns the current position and then steps to the neighbouring record in the
// iteration direction. Returns a null DiskLoc once the chain is exhausted.
DiskLoc RecordStoreV1Base::IntraExtentIterator::getNext() {
    if (_curr.isNull()) {
        return DiskLoc();
    }

    // We hand back where we were, not where we are going.
    const DiskLoc visited = _curr;

    const Record* here = recordFor(visited);
    int linkOfs;
    if (_forward) {
        linkOfs = here->nextOfs();
    } else {
        linkOfs = here->prevOfs();
    }

    if (linkOfs == DiskLoc::NullOfs) {
        _curr = DiskLoc();
    } else {
        _curr = DiskLoc(visited.a(), linkOfs);
    }

    return visited;
}
// Fills '*rd' with the data of the record at 'loc'.
//
// Returns false only when recordFor() yields a null pointer. As the original author
// noted, this store cannot actually check whether 'loc' refers to a live record: using
// this storage engine implies that it does, so the best is assumed.
bool RecordStoreV1Base::findRecord( OperationContext* txn,
                                    const DiskLoc& loc,
                                    RecordData* rd ) const {
    Record* const found = recordFor( loc );

    if ( found ) {
        *rd = found->toRecordData();
        return true;
    }

    return false;
}
// Patches the record at 'loc' in place: each damage event copies 'size' bytes from
// damangeSource + sourceOffset to the record payload at targetOffset.
// Always returns Status::OK().
Status HeapRecordStore::updateWithDamages( OperationContext* txn,
                                           const DiskLoc& loc,
                                           const char* damangeSource,
                                           const mutablebson::DamageVector& damages ) {
    char* const payload = recordFor( loc )->data();

    // All updates are in place; walk the damage list and apply each one.
    typedef mutablebson::DamageVector::const_iterator DamageIter;
    DamageIter event = damages.begin();
    const DamageIter stop = damages.end();
    while ( event != stop ) {
        std::memcpy( payload + event->targetOffset,
                     damangeSource + event->sourceOffset,
                     event->size );
        ++event;
    }

    return Status::OK();
}
// Replaces the contents of the record at 'oldLocation' with data[0, dataSize).
//
// If the existing record has room, the bytes are overwritten in place (after notifying
// 'notifier', whose failure aborts the update) and 'oldLocation' is returned. Otherwise
// the data is inserted as a new record, the notifier is told about the move, the old
// record is deleted and the new location is returned. The move path requires a
// non-capped store and a header-inclusive size within MaxAllowedAllocation.
StatusWith<RecordId> RecordStoreV1Base::updateRecord(OperationContext* txn,
                                                     const RecordId& oldLocation,
                                                     const char* data,
                                                     int dataSize,
                                                     bool enforceQuota,
                                                     UpdateNotifier* notifier) {
    MmapV1RecordHeader* existing = recordFor(DiskLoc::fromRecordId(oldLocation));
    const bool fitsInPlace = existing->netLength() >= dataSize;

    if (!fitsInPlace) {
        // We enforce the restriction of unchanging capped doc sizes above the storage layer.
        invariant(!isCapped());

        // The record has to move.
        if (dataSize + MmapV1RecordHeader::HeaderSize > MaxAllowedAllocation) {
            return StatusWith<RecordId>(ErrorCodes::InvalidLength, "record has to be <= 16.5MB");
        }

        StatusWith<RecordId> relocated = _insertRecord(txn, data, dataSize, enforceQuota);
        if (!relocated.isOK()) {
            return relocated;
        }

        // The insert worked, so retire the old record.
        if (notifier) {
            Status moveStatus = notifier->recordStoreGoingToMove(
                txn, oldLocation, existing->data(), existing->netLength());
            if (!moveStatus.isOK()) {
                return StatusWith<RecordId>(moveStatus);
            }
        }

        deleteRecord(txn, oldLocation);
        return relocated;
    }

    // Make sure to notify other queries before we do an in-place update.
    if (notifier) {
        Status callbackStatus = notifier->recordStoreGoingToUpdateInPlace(txn, oldLocation);
        if (!callbackStatus.isOK()) {
            return StatusWith<RecordId>(callbackStatus);
        }
    }

    memcpy(txn->recoveryUnit()->writingPtr(existing->data(), dataSize), data, dataSize);
    return StatusWith<RecordId>(oldLocation);
}
// Patches the record at 'loc' in place and returns its data afterwards.
//
// Each damage event copies 'size' bytes from damageSource + sourceOffset to the record
// payload at targetOffset; every target range is obtained through the recovery unit's
// writingPtr() before it is written. 'oldRec' is not read.
StatusWith<RecordData> RecordStoreV1Base::updateWithDamages(
    OperationContext* txn,
    const RecordId& loc,
    const RecordData& oldRec,
    const char* damageSource,
    const mutablebson::DamageVector& damages) {
    MmapV1RecordHeader* header = recordFor(DiskLoc::fromRecordId(loc));
    char* const payload = header->data();

    typedef mutablebson::DamageVector::const_iterator DamageIter;
    DamageIter event = damages.begin();
    const DamageIter stop = damages.end();
    while (event != stop) {
        const char* from = damageSource + event->sourceOffset;
        void* to = txn->recoveryUnit()->writingPtr(payload + event->targetOffset, event->size);
        std::memcpy(to, from, event->size);
        ++event;
    }

    return header->toRecordData();
}
// Replaces the contents of the record at 'oldLocation' with data[0, dataSize).
//
// If the existing record has room, _paddingFits() is called, the bytes are overwritten
// in place and 'oldLocation' is returned. Otherwise capped stores fail with
// InternalError (code 10003); for other stores _paddingTooSmall() is called, the data is
// inserted as a new record, 'notifier' (if any) is told about the move, the old record
// is deleted and the new location is returned.
StatusWith<DiskLoc> RecordStoreV1Base::updateRecord( OperationContext* txn,
                                                     const DiskLoc& oldLocation,
                                                     const char* data,
                                                     int dataSize,
                                                     bool enforceQuota,
                                                     UpdateMoveNotifier* notifier ) {
    Record* existing = recordFor( oldLocation );
    const bool fitsInPlace = existing->netLength() >= dataSize;

    if ( fitsInPlace ) {
        _paddingFits( txn );
        memcpy( txn->recoveryUnit()->writingPtr( existing->data(), dataSize ),
                data,
                dataSize );
        return StatusWith<DiskLoc>( oldLocation );
    }

    if ( isCapped() ) {
        return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                    "failing update: objects in a capped ns cannot grow",
                                    10003 );
    }

    // The record has to move.
    _paddingTooSmall( txn );

    StatusWith<DiskLoc> relocated = _insertRecord( txn, data, dataSize, enforceQuota );
    if ( !relocated.isOK() ) {
        return relocated;
    }

    // The insert worked, so retire the old record.
    if ( notifier ) {
        Status moveStatus = notifier->recordStoreGoingToMove( txn,
                                                              oldLocation,
                                                              existing->data(),
                                                              existing->netLength() );
        if ( !moveStatus.isOK() ) {
            return StatusWith<DiskLoc>( moveStatus );
        }
    }

    deleteRecord( txn, oldLocation );
    return relocated;
}
// Replaces the contents of the record at 'oldLocation' with data[0, len). The record
// always keeps its location, which is what is returned on success.
//
// - len <= current net length: the bytes are copied over the existing buffer. The
//   record's lengthWithHeaders() is left as it was, so its netLength() does not change.
// - len  > current net length: capped stores fail with InternalError (code 10003);
//   otherwise a new buffer of len + HeaderSize bytes replaces the old one in _records,
//   _dataSize grows by the difference and cappedDeleteAsNeeded() is run.
//
// 'enforceQuota' and 'notifier' are not read by this implementation.
StatusWith<DiskLoc> HeapRecordStore::updateRecord(OperationContext* txn,
                                                  const DiskLoc& oldLocation,
                                                  const char* data,
                                                  int len,
                                                  bool enforceQuota,
                                                  UpdateMoveNotifier* notifier ) {
    HeapRecord* oldRecord = recordFor( oldLocation );
    int oldLen = oldRecord->netLength();

    // If the length of the new data is <= the length of the old data then just
    // memcopy into the old space.
    if ( len <= oldLen ) {
        memcpy(oldRecord->data(), data, len);
        // _dataSize is deliberately NOT adjusted here. The record still reports the old
        // netLength(), and deleteRecord() subtracts netLength() when the record goes
        // away; subtracting (oldLen - len) now as well would count the shrink twice and
        // let _dataSize drift below the real total.
        return StatusWith<DiskLoc>(oldLocation);
    }

    if ( _isCapped ) {
        return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                    "failing update: objects in a capped ns cannot grow",
                                    10003 );
    }

    // If the length of the new data exceeds the size of the old Record, we need to
    // allocate a new Record, and delete the old one.
    const int lengthWithHeaders = len + HeapRecord::HeaderSize;
    boost::shared_array<char> buf(new char[lengthWithHeaders]);
    HeapRecord* rec = reinterpret_cast<HeapRecord*>(buf.get());
    rec->lengthWithHeaders() = lengthWithHeaders;
    memcpy(rec->data(), data, len);

    // Replacing the map entry drops the store's reference to the old buffer.
    _records[oldLocation] = buf;
    _dataSize += len - oldLen;

    cappedDeleteAsNeeded(txn);

    return StatusWith<DiskLoc>(oldLocation);
}
// Inserts the document produced by 'doc' and returns where it was stored.
//
// The allocation is documentSize() + header, run through getRecordAllocationSize() only
// when the writer asks for padding. 'quotaMax' is forwarded to allocRecord(). A failed
// allocation is returned unchanged.
StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( TransactionExperiment* txn,
                                                     const DocWriter* doc,
                                                     int quotaMax ) {
    int allocLen = doc->documentSize() + Record::HeaderSize;
    if ( doc->addPadding() ) {
        allocLen = getRecordAllocationSize( allocLen );
    }

    StatusWith<DiskLoc> allocated = allocRecord( txn, allocLen, quotaMax );
    if ( !allocated.isOK() ) {
        return allocated;
    }

    Record* rec = recordFor( allocated.getValue() );
    fassert( 17319, rec->lengthWithHeaders() >= allocLen );

    // Declare write intent over the allocation and let the writer fill the payload.
    rec = reinterpret_cast<Record*>( txn->writingPtr( rec, allocLen ) );
    doc->writeDocument( rec->data() );

    _addRecordToRecListInExtent( txn, rec, allocated.getValue() );
    _details->incrementStats( txn, rec->netLength(), 1 );

    return allocated;
}
// Removes the record at 'dl' from its extent and retires its storage.
//
// Order of operations:
//   1. splice the previous/next records in the chain around the record,
//   2. repoint the extent's firstRecord/lastRecord if either referenced 'dl',
//   3. decrement the collection stats by the record's net length and by one record,
//   4. zero the whole record (system.indexes) or stamp its first 4 payload bytes with
//      0xee and pass it to addDeletedRec().
void RecordStoreV1Base::deleteRecord( OperationContext* txn, const DiskLoc& dl ) {
    Record* victim = recordFor( dl );

    // Step 4 overwrites 4 payload bytes, so the payload must be at least that large.
    invariant( victim->netLength() >= 4 );

    // 1. Unlink from the next/prev chain.
    if ( DiskLoc::NullOfs != victim->prevOfs() ) {
        Record* before = recordFor( getPrevRecordInExtent( txn, dl ) );
        txn->recoveryUnit()->writingInt( before->nextOfs() ) = victim->nextOfs();
    }
    if ( DiskLoc::NullOfs != victim->nextOfs() ) {
        Record* after = recordFor( getNextRecord( txn, dl ) );
        txn->recoveryUnit()->writingInt( after->prevOfs() ) = victim->prevOfs();
    }

    // 2. Fix up the owning extent's end pointers. Write intent is declared on each
    //    pointer before the member is modified.
    Extent* owner = _getExtent( txn, victim->myExtentLoc( dl ) );
    if ( owner->firstRecord == dl ) {
        txn->recoveryUnit()->writing( &owner->firstRecord );
        if ( victim->nextOfs() != DiskLoc::NullOfs )
            owner->firstRecord.set( dl.a(), victim->nextOfs() );
        else
            owner->firstRecord.Null();
    }
    if ( owner->lastRecord == dl ) {
        txn->recoveryUnit()->writing( &owner->lastRecord );
        if ( victim->prevOfs() != DiskLoc::NullOfs )
            owner->lastRecord.set( dl.a(), victim->prevOfs() );
        else
            owner->lastRecord.Null();
    }

    // 3. One record fewer, netLength() bytes fewer.
    _details->incrementStats( txn, -victim->netLength(), -1 );

    // 4. Retire the space.
    if ( !_isSystemIndexes ) {
        // this is defensive so we can detect if we are still using a location
        // that was deleted
        memset( txn->recoveryUnit()->writingPtr( victim->data(), 4 ), 0xee, 4 );
        addDeletedRec( txn, dl );
        return;
    }

    /* temp: if in system.indexes, don't reuse, and zero out: we want to be
       careful until validated more, as IndexDetails has pointers
       to this disk location.  so an incorrectly done remove would cause
       a lot of problems.
    */
    memset( txn->recoveryUnit()->writingPtr( victim, victim->lengthWithHeaders() ),
            0,
            victim->lengthWithHeaders() );
}
// Returns the extent that holds the record at 'loc'. The extent's location is built
// from loc's file number and the extentOfs() stored in the record.
Extent* ExtentManager::extentFor( const DiskLoc& loc ) const {
    return getExtent( DiskLoc( loc.a(), recordFor( loc )->extentOfs() ) );
}
// Compacts a single extent: every record in the extent at 'extentLoc' is re-inserted
// through insertRecord() and the emptied extent is handed back to the extent manager.
//
// Parameters:
//   txn             operation context; checked for interrupt once per record
//   extentLoc       location of the extent to empty; the invariants below require it to
//                   be the collection's first extent and not its last
//   extentNumber    only used in log messages
//   adaptor         validates documents, reports their data size and is told about
//                   each re-inserted record
//   compactOptions  supplies validateDocuments and the padding mode
//   stats           corruptDocuments is incremented for each document dropped as invalid
//
// Each record is moved inside its own WriteUnitOfWork; unlinking the extent from the
// extent list and freeing it happens in a final, separate WriteUnitOfWork.
void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
                                         const DiskLoc extentLoc,
                                         int extentNumber,
                                         RecordStoreCompactAdaptor* adaptor,
                                         const CompactOptions* compactOptions,
                                         CompactStats* stats ) {

    log() << "compact begin extent #" << extentNumber
          << " for namespace " << _ns << " " << extentLoc;

    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent* const sourceExtent = _extentManager->getExtent( extentLoc );
    sourceExtent->assertOk();
    fassert( 17437, sourceExtent->validates(extentLoc) );

    {
        // The next/prev Record pointers within the Extent might not be in order so we first
        // page in the whole Extent sequentially.
        // TODO benchmark on slow storage to verify this is measurably faster.
        log() << "compact paging in len=" << sourceExtent->length/1000000.0 << "MB" << endl;
        Timer t;
        size_t length = sourceExtent->length;

        touch_pages( reinterpret_cast<const char*>(sourceExtent), length );
        int ms = t.millis();
        // Only report throughput when paging took more than a second.
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << sourceExtent->length/1000000.0/t.seconds() << "MB/sec" << endl;
    }

    {
        // Move each Record out of this extent and insert it in to the "new" extents.
        log() << "compact copying records" << endl;
        long long totalNetSize = 0;
        long long nrecords = 0;
        DiskLoc nextSourceLoc = sourceExtent->firstRecord;
        while (!nextSourceLoc.isNull()) {
            txn->checkForInterrupt();

            WriteUnitOfWork wunit(txn);
            Record* recOld = recordFor(nextSourceLoc);
            RecordData oldData = recOld->toRecordData();
            // Advance before the record is unlinked below; from here on nextSourceLoc
            // refers to the record after recOld (or is null).
            nextSourceLoc = getNextRecordInExtent(txn, nextSourceLoc);

            if ( compactOptions->validateDocuments && !adaptor->isDataValid( oldData ) ) {
                // object is corrupt!
                // It is not re-inserted, but it is still unlinked and subtracted from
                // the stats below.
                log() << "compact removing corrupt document!";
                stats->corruptDocuments++;
            }
            else {
                // How much data is in the record. Excludes padding and Record headers.
                const unsigned rawDataSize = adaptor->dataSize( oldData );

                nrecords++;
                oldObjSize += rawDataSize;
                oldObjSizeWithPadding += recOld->netLength();

                // Allocation sizes include the headers and possibly some padding.
                const unsigned minAllocationSize = rawDataSize + Record::HeaderSize;
                unsigned allocationSize = minAllocationSize;
                switch( compactOptions->paddingMode ) {
                case CompactOptions::NONE: // no padding, unless using powerOf2Sizes
                    if ( _details->isUserFlagSet(Flag_UsePowerOf2Sizes) )
                        allocationSize = quantizePowerOf2AllocationSpace(minAllocationSize);
                    else
                        allocationSize = minAllocationSize;
                    break;

                case CompactOptions::PRESERVE: // keep original padding
                    allocationSize = recOld->lengthWithHeaders();
                    break;

                case CompactOptions::MANUAL: // user specified how much padding to use
                    allocationSize = compactOptions->computeRecordSize(minAllocationSize);
                    // Fall back to no padding when the computed size is too small or
                    // exceeds half of BSONObjMaxUserSize.
                    if (allocationSize < minAllocationSize
                            || allocationSize > BSONObjMaxUserSize / 2 ) {
                        allocationSize = minAllocationSize;
                    }
                    break;
                }
                invariant(allocationSize >= minAllocationSize);

                // Copy the data to a new record. Because we orphaned the record freelist at the
                // start of the compact, this insert will allocate a record in a new extent.
                // See the comment in compact() for more details.
                CompactDocWriter writer( recOld, rawDataSize, allocationSize );
                StatusWith<DiskLoc> status = insertRecord( txn, &writer, false );
                uassertStatusOK( status.getStatus() );
                const Record* newRec = recordFor(status.getValue());
                invariant(unsigned(newRec->netLength()) >= rawDataSize);
                totalNetSize += newRec->netLength();

                // Tells the caller that the record has been moved, so it can do things such as
                // add it to indexes.
                adaptor->inserted(newRec->toRecordData(), status.getValue());
            }

            // Remove the old record from the linked list of records within the sourceExtent.
            // The old record is not added to the freelist as we will be freeing the whole
            // extent at the end.
            *txn->recoveryUnit()->writing(&sourceExtent->firstRecord) = nextSourceLoc;
            if (nextSourceLoc.isNull()) {
                // Just moved the last record out of the extent. Mark extent as empty.
                *txn->recoveryUnit()->writing(&sourceExtent->lastRecord) = DiskLoc();
            }
            else {
                Record* newFirstRecord = recordFor(nextSourceLoc);
                txn->recoveryUnit()->writingInt(newFirstRecord->prevOfs()) = DiskLoc::NullOfs;
            }

            // Adjust the stats to reflect the removal of the old record. The insert above
            // handled adjusting the stats for the new record.
            _details->incrementStats(txn, -(recOld->netLength()), -1);

            wunit.commit();
        }

        // The extent must now be empty.
        invariant(sourceExtent->firstRecord.isNull());
        invariant(sourceExtent->lastRecord.isNull());

        // We are still the first extent, but we must not be the only extent.
        invariant( _details->firstExtent(txn) == extentLoc );
        invariant( _details->lastExtent(txn) != extentLoc );

        // Remove the newly emptied sourceExtent from the extent linked list and return it to
        // the extent manager.
        WriteUnitOfWork wunit(txn);
        const DiskLoc newFirst = sourceExtent->xnext;
        _details->setFirstExtent( txn, newFirst );
        *txn->recoveryUnit()->writing(&_extentManager->getExtent( newFirst )->xprev) = DiskLoc();
        _extentManager->freeExtent( txn, extentLoc );
        wunit.commit();

        {
            // Ratio of padded to raw bytes over the records that were moved.
            const double oldPadding = oldObjSize ? double(oldObjSizeWithPadding) / oldObjSize
                                                 : 1.0; // defining 0/0 as 1 for this.

            log() << "compact finished extent #" << extentNumber << " containing "
                  << nrecords << " documents (" << totalNetSize / (1024*1024.0) << "MB)"
                  << " oldPadding: " << oldPadding;
        }
    }

}
// Removes the record at 'loc' from the store. _dataSize is reduced by the record's
// netLength() before the entry is erased; exactly one entry must have been erased.
// 'txn' is not used.
void HeapRecordStore::deleteRecord(OperationContext* txn, const DiskLoc& loc) {
    // Read the length first: the record is not looked at again after the erase.
    _dataSize -= recordFor(loc)->netLength();

    invariant(_records.erase(loc) == 1);
}
// Returns the location of the extent holding the record at 'loc': loc's file number
// combined with the extentOfs() stored in the record.
DiskLoc ExtentManager::extentLocFor( const DiskLoc& loc ) const {
    return DiskLoc( loc.a(), recordFor( loc )->extentOfs() );
}
// Removes the record at 'loc' from the store.
//
// A RemoveChange holding a copy of the record is registered with the recovery unit
// first; then dataSize is reduced by the record's size and the entry is erased. Exactly
// one entry must have been erased.
void HeapRecordStore::deleteRecord(OperationContext* txn, const DiskLoc& loc) {
    HeapRecord* doomed = recordFor(loc);

    // Must happen before the erase: the change is built from the live record.
    txn->recoveryUnit()->registerChange(new RemoveChange(_data, loc, *doomed));

    _data->dataSize -= doomed->size;

    invariant(_data->records.erase(loc) == 1);
}
// Compacts a single extent: every record in the extent at 'diskloc' is re-inserted
// through insertRecord() and the extent is then handed back to the extent manager.
//
// Parameters:
//   txn             operation context; used for interrupt checks and commitIfNeeded()
//   diskloc         location of the extent to empty; the invariants below require it to
//                   be the collection's first extent and not its last
//   extentNumber    only used in log messages
//   adaptor         validates documents, reports their data size and is told about
//                   each re-inserted record
//   compactOptions  supplies validateDocuments and the padding mode
//   stats           corruptDocuments is incremented for each document skipped as invalid
void SimpleRecordStoreV1::_compactExtent(OperationContext* txn,
                                         const DiskLoc diskloc,
                                         int extentNumber,
                                         RecordStoreCompactAdaptor* adaptor,
                                         const CompactOptions* compactOptions,
                                         CompactStats* stats ) {

    log() << "compact begin extent #" << extentNumber
          << " for namespace " << _ns << " " << diskloc;

    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent *e = _extentManager->getExtent( diskloc );
    e->assertOk();
    fassert( 17437, e->validates(diskloc) );

    {
        // the next/prev pointers within the extent might not be in order so we first
        // page the whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        size_t length = e->length;

        touch_pages( reinterpret_cast<const char*>(e), length );
        int ms = t.millis();
        // Only report throughput when paging took more than a second.
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/t.seconds() << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0;
        long long nrecords = 0;
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = recordFor(L);
                // Advance first: from here on L is the record after recOld (or null).
                L = getNextRecordInExtent(L);

                if ( compactOptions->validateDocuments && !adaptor->isDataValid(recOld) ) {
                    // object is corrupt!
                    log() << "compact skipping corrupt document!";
                    stats->corruptDocuments++;
                }
                else {
                    unsigned dataSize = adaptor->dataSize( recOld );
                    unsigned docSize = dataSize;

                    nrecords++;
                    oldObjSize += docSize;
                    oldObjSizeWithPadding += recOld->netLength();

                    // lenWHdr is the minimum allocation; lenWPadding is what is requested.
                    unsigned lenWHdr = docSize + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;

                    switch( compactOptions->paddingMode ) {
                    case CompactOptions::NONE:
                        if ( _details->isUserFlagSet(Flag_UsePowerOf2Sizes) )
                            lenWPadding = quantizePowerOf2AllocationSpace(lenWPadding);
                        break;
                    case CompactOptions::PRESERVE:
                        // if we are preserving the padding, the record should not change size
                        lenWPadding = recOld->lengthWithHeaders();
                        break;
                    case CompactOptions::MANUAL:
                        lenWPadding = compactOptions->computeRecordSize(lenWPadding);
                        // Fall back to no padding when the computed size is too small or
                        // exceeds half of BSONObjMaxUserSize.
                        if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                            lenWPadding = lenWHdr;
                        }
                        break;
                    }

                    CompactDocWriter writer( recOld, dataSize, lenWPadding );
                    StatusWith<DiskLoc> status = insertRecord( txn, &writer, 0 );
                    uassertStatusOK( status.getStatus() );
                    datasize += recordFor( status.getValue() )->netLength();

                    // Let the adaptor know where the record now lives.
                    adaptor->inserted( recordFor( status.getValue() ), status.getValue() );
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent.  it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block doesn't get too large
                bool stopping = false;
                RARELY stopping = !txn->checkForInterruptNoAssert().isOK();
                if( stopping || txn->recoveryUnit()->isCommitNeeded() ) {
                    // Make L the new head of the extent's record chain before committing.
                    *txn->recoveryUnit()->writing(&e->firstRecord) = L;
                    Record *r = recordFor(L);
                    txn->recoveryUnit()->writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    txn->recoveryUnit()->commitIfNeeded();
                    txn->checkForInterrupt();
                }
            }
        } // if !L.isNull()

        // We are still the first extent, but we must not be the only extent.
        invariant( _details->firstExtent() == diskloc );
        invariant( _details->lastExtent() != diskloc );

        // Unlink this extent from the head of the extent list and free it.
        DiskLoc newFirst = e->xnext;
        _details->setFirstExtent( txn, newFirst );
        *txn->recoveryUnit()->writing(&_extentManager->getExtent( newFirst )->xprev) = DiskLoc();
        _extentManager->freeExtent( txn, diskloc );

        txn->recoveryUnit()->commitIfNeeded();

        {
            // Ratio of padded to raw bytes over the records that were moved; 1.0 when
            // nothing was moved.
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            // NOTE(review): the last logged value is an unsigned integer division, so it
            // prints op truncated to a whole number. If two-decimal rounding was
            // intended, the divisor would need to be 100.0 -- confirm before changing.
            log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                  << " documents (" << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100;
        }
    }

}
Status RecordStoreV1Base::validate( OperationContext* txn, bool full, bool scanData, ValidateAdaptor* adaptor, ValidateResults* results, BSONObjBuilder* output ) const { // 1) basic status that require no iteration // 2) extent level info // 3) check extent start and end // 4) check each non-deleted record // 5) check deleted list // ------------- // 1111111111111111111 if ( isCapped() ){ output->appendBool("capped", true); output->appendNumber("max", _details->maxCappedDocs()); } output->appendNumber("datasize", _details->dataSize()); output->appendNumber("nrecords", _details->numRecords()); output->appendNumber("lastExtentSize", _details->lastExtentSize(txn)); output->appendNumber("padding", _details->paddingFactor()); if ( _details->firstExtent(txn).isNull() ) output->append( "firstExtent", "null" ); else output->append( "firstExtent", str::stream() << _details->firstExtent(txn).toString() << " ns:" << _getExtent( txn, _details->firstExtent(txn) )->nsDiagnostic.toString()); if ( _details->lastExtent(txn).isNull() ) output->append( "lastExtent", "null" ); else output->append( "lastExtent", str::stream() << _details->lastExtent(txn).toString() << " ns:" << _getExtent( txn, _details->lastExtent(txn) )->nsDiagnostic.toString()); // 22222222222222222222222222 { // validate extent basics BSONArrayBuilder extentData; int extentCount = 0; DiskLoc extentDiskLoc; try { if ( !_details->firstExtent(txn).isNull() ) { _getExtent( txn, _details->firstExtent(txn) )->assertOk(); _getExtent( txn, _details->lastExtent(txn) )->assertOk(); } extentDiskLoc = _details->firstExtent(txn); while (!extentDiskLoc.isNull()) { Extent* thisExtent = _getExtent( txn, extentDiskLoc ); if (full) { extentData << thisExtent->dump(); } if (!thisExtent->validates(extentDiskLoc, &results->errors)) { results->valid = false; } DiskLoc nextDiskLoc = thisExtent->xnext; if (extentCount > 0 && !nextDiskLoc.isNull() && _getExtent( txn, nextDiskLoc )->xprev != extentDiskLoc) { StringBuilder sb; sb << "'xprev' 
pointer " << _getExtent( txn, nextDiskLoc )->xprev.toString() << " in extent " << nextDiskLoc.toString() << " does not point to extent " << extentDiskLoc.toString(); results->errors.push_back( sb.str() ); results->valid = false; } if (nextDiskLoc.isNull() && extentDiskLoc != _details->lastExtent(txn)) { StringBuilder sb; sb << "'lastExtent' pointer " << _details->lastExtent(txn).toString() << " does not point to last extent in list " << extentDiskLoc.toString(); results->errors.push_back( sb.str() ); results->valid = false; } extentDiskLoc = nextDiskLoc; extentCount++; txn->checkForInterrupt(); } } catch (const DBException& e) { StringBuilder sb; sb << "exception validating extent " << extentCount << ": " << e.what(); results->errors.push_back( sb.str() ); results->valid = false; return Status::OK(); } output->append("extentCount", extentCount); if ( full ) output->appendArray( "extents" , extentData.arr() ); } try { // 333333333333333333333333333 bool testingLastExtent = false; try { DiskLoc firstExtentLoc = _details->firstExtent(txn); if (firstExtentLoc.isNull()) { // this is ok } else { output->append("firstExtentDetails", _getExtent(txn, firstExtentLoc)->dump()); if (!_getExtent(txn, firstExtentLoc)->xprev.isNull()) { StringBuilder sb; sb << "'xprev' pointer in 'firstExtent' " << _details->firstExtent(txn).toString() << " is " << _getExtent(txn, firstExtentLoc)->xprev.toString() << ", should be null"; results->errors.push_back( sb.str() ); results->valid = false; } } testingLastExtent = true; DiskLoc lastExtentLoc = _details->lastExtent(txn); if (lastExtentLoc.isNull()) { // this is ok } else { if (firstExtentLoc != lastExtentLoc) { output->append("lastExtentDetails", _getExtent(txn, lastExtentLoc)->dump()); if (!_getExtent(txn, lastExtentLoc)->xnext.isNull()) { StringBuilder sb; sb << "'xnext' pointer in 'lastExtent' " << lastExtentLoc.toString() << " is " << _getExtent(txn, lastExtentLoc)->xnext.toString() << ", should be null"; results->errors.push_back( 
sb.str() ); results->valid = false; } } } } catch (const DBException& e) { StringBuilder sb; sb << "exception processing '" << (testingLastExtent ? "lastExtent" : "firstExtent") << "': " << e.what(); results->errors.push_back( sb.str() ); results->valid = false; } // 4444444444444444444444444 set<DiskLoc> recs; if( scanData ) { int n = 0; int nInvalid = 0; long long nQuantizedSize = 0; long long nPowerOf2QuantizedSize = 0; long long len = 0; long long nlen = 0; long long bsonLen = 0; int outOfOrder = 0; DiskLoc cl_last; scoped_ptr<RecordIterator> iterator( getIterator( txn, DiskLoc(), false, CollectionScanParams::FORWARD ) ); DiskLoc cl; while ( !( cl = iterator->getNext() ).isNull() ) { n++; if ( n < 1000000 ) recs.insert(cl); if ( isCapped() ) { if ( cl < cl_last ) outOfOrder++; cl_last = cl; } Record *r = recordFor(cl); len += r->lengthWithHeaders(); nlen += r->netLength(); if ( r->lengthWithHeaders() == quantizeAllocationSpace( r->lengthWithHeaders() ) ) { // Count the number of records having a size consistent with // the quantizeAllocationSpace quantization implementation. ++nQuantizedSize; } if ( r->lengthWithHeaders() == quantizePowerOf2AllocationSpace( r->lengthWithHeaders() ) ) { // Count the number of records having a size consistent with the // quantizePowerOf2AllocationSpace quantization implementation. 
++nPowerOf2QuantizedSize; } if (full){ size_t dataSize = 0; const Status status = adaptor->validate( r->toRecordData(), &dataSize ); if (!status.isOK()) { results->valid = false; if (nInvalid == 0) // only log once; results->errors.push_back( "invalid object detected (see logs)" ); nInvalid++; log() << "Invalid object detected in " << _ns << ": " << status.reason(); } else { bsonLen += dataSize; } } } if ( isCapped() && !_details->capLooped() ) { output->append("cappedOutOfOrder", outOfOrder); if ( outOfOrder > 1 ) { results->valid = false; results->errors.push_back( "too many out of order records" ); } } output->append("objectsFound", n); if (full) { output->append("invalidObjects", nInvalid); } output->appendNumber("nQuantizedSize", nQuantizedSize); output->appendNumber("nPowerOf2QuantizedSize", nPowerOf2QuantizedSize); output->appendNumber("bytesWithHeaders", len); output->appendNumber("bytesWithoutHeaders", nlen); if (full) { output->appendNumber("bytesBson", bsonLen); } } // end scanData // 55555555555555555555555555 BSONArrayBuilder deletedListArray; for ( int i = 0; i < Buckets; i++ ) { deletedListArray << _details->deletedListEntry(i).isNull(); } int ndel = 0; long long delSize = 0; BSONArrayBuilder delBucketSizes; int incorrect = 0; for ( int i = 0; i < Buckets; i++ ) { DiskLoc loc = _details->deletedListEntry(i); try { int k = 0; while ( !loc.isNull() ) { if ( recs.count(loc) ) incorrect++; ndel++; if ( loc.questionable() ) { if( isCapped() && !loc.isValid() && i == 1 ) { /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid see comments in namespace.h */ break; } string err( str::stream() << "bad pointer in deleted record list: " << loc.toString() << " bucket: " << i << " k: " << k ); results->errors.push_back( err ); results->valid = false; break; } const DeletedRecord* d = deletedRecordFor(loc); delSize += d->lengthWithHeaders(); loc = d->nextDeleted(); k++; txn->checkForInterrupt(); } delBucketSizes << k; } catch (...) 
{ results->errors.push_back( (string)"exception in deleted chain for bucket " + BSONObjBuilder::numStr(i) ); results->valid = false; } } output->appendNumber("deletedCount", ndel); output->appendNumber("deletedSize", delSize); if ( full ) { output->append( "delBucketSizes", delBucketSizes.arr() ); } if ( incorrect ) { results->errors.push_back( BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list" ); results->valid = false; } } catch (AssertionException) { results->errors.push_back( "exception during validate" ); results->valid = false; } return Status::OK(); }