/**
 * Deletes the record at 'dl' from this record store (TransactionExperiment era).
 *
 * Order of operations:
 *   1. unlink the record from its extent's doubly-linked record chain,
 *   2. fix the extent's firstRecord/lastRecord pointers if we were at either end,
 *   3. adjust collection stats and either zero the record (system.indexes) or
 *      push it onto the deleted-record free list.
 *
 * Every on-disk mutation is declared through txn (writing / writingInt /
 * writingPtr) so it participates in the durability machinery.
 */
void RecordStoreV1Base::deleteRecord( TransactionExperiment* txn, const DiskLoc& dl ) {

    Record* todelete = recordFor( dl );

    /* remove ourself from the record next/prev chain */
    {
        if ( todelete->prevOfs() != DiskLoc::NullOfs ) {
            // splice: previous record's forward link now skips us
            DiskLoc prev = getPrevRecordInExtent( dl );
            Record* prevRecord = recordFor( prev );
            txn->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs();
        }

        if ( todelete->nextOfs() != DiskLoc::NullOfs ) {
            // splice: next record's back link now skips us
            DiskLoc next = getNextRecord( dl );
            Record* nextRecord = recordFor( next );
            txn->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs();
        }
    }

    /* remove ourself from extent pointers */
    {
        Extent *e = txn->writing( _getExtent( _getExtentLocForRecord( dl ) ) );
        if ( e->firstRecord == dl ) {
            if ( todelete->nextOfs() == DiskLoc::NullOfs )
                e->firstRecord.Null(); // we were the only record in this extent
            else
                e->firstRecord.set(dl.a(), todelete->nextOfs() );
        }
        if ( e->lastRecord == dl ) {
            if ( todelete->prevOfs() == DiskLoc::NullOfs )
                e->lastRecord.Null();
            else
                e->lastRecord.set(dl.a(), todelete->prevOfs() );
        }
    }

    /* add to the free list */
    {
        // stats go down by the record's user-payload size and one record
        _details->incrementStats( txn, -1 * todelete->netLength(), -1 );

        if ( _isSystemIndexes ) {
            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
               careful until validated more, as IndexDetails has pointers to this
               disk location.  so an incorrectly done remove would cause a lot of
               problems. */
            memset( txn->writingPtr(todelete, todelete->lengthWithHeaders() ), 0, todelete->lengthWithHeaders() );
        }
        else {
            DEV {
                // debug builds: clobber the first 8 payload bytes so accidental
                // reuse of this location is more likely to be noticed
                unsigned long long *p = reinterpret_cast<unsigned long long *>( todelete->data() );
                *txn->writing(p) = 0;
            }
            addDeletedRec(txn, dl);
        }
    }
}
/* deletes a record, just the pdfile portion -- no index cleanup, no cursor
   cleanup, etc.  caller must check if capped.

   Same three-phase structure as the record-store versions: unlink from the
   record chain, fix extent first/last pointers, then account stats and free
   (or zero, for system.indexes) the space.  All writes go through getDur().
*/
void DataFileMgr::_deleteRecord(NamespaceDetails *d, const StringData& ns, Record *todelete, const DiskLoc& dl) {
    /* remove ourself from the record next/prev chain */
    {
        if ( todelete->prevOfs() != DiskLoc::NullOfs )
            getDur().writingInt( todelete->getPrev(dl).rec()->nextOfs() ) = todelete->nextOfs();
        if ( todelete->nextOfs() != DiskLoc::NullOfs )
            getDur().writingInt( todelete->getNext(dl).rec()->prevOfs() ) = todelete->prevOfs();
    }

    /* remove ourself from extent pointers */
    {
        Extent *e = getDur().writing( todelete->myExtent(dl) );
        if ( e->firstRecord == dl ) {
            if ( todelete->nextOfs() == DiskLoc::NullOfs )
                e->firstRecord.Null(); // extent is now empty
            else
                e->firstRecord.set(dl.a(), todelete->nextOfs() );
        }
        if ( e->lastRecord == dl ) {
            if ( todelete->prevOfs() == DiskLoc::NullOfs )
                e->lastRecord.Null();
            else
                e->lastRecord.set(dl.a(), todelete->prevOfs() );
        }
    }

    /* add to the free list */
    {
        d->incrementStats( -1 * todelete->netLength(), -1 );

        if ( nsToCollectionSubstring(ns) == "system.indexes") {
            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
               careful until validated more, as IndexDetails has pointers to this
               disk location.  so an incorrectly done remove would cause a lot of
               problems. */
            memset(getDur().writingPtr(todelete, todelete->lengthWithHeaders() ), 0, todelete->lengthWithHeaders() );
        }
        else {
            DEV {
                // debug builds: clobber the first 8 payload bytes to catch reuse
                unsigned long long *p = reinterpret_cast<unsigned long long *>( todelete->data() );
                *getDur().writing(p) = 0;
                //DEV memset(todelete->data, 0, todelete->netLength()); // attempt to notice invalid reuse.
            }
            d->addDeletedRec((DeletedRecord*)todelete, dl);
        }
    }
}
/** * analyzeDiskStorage helper which processes a single record. */ void processDeletedRecord(const DiskLoc& dl, const DeletedRecord* dr, const Extent* ex, const AnalyzeParams& params, int bucketNum, vector<DiskStorageData>& sliceData, BSONArrayBuilder* deletedRecordsArrayBuilder) { killCurrentOp.checkForInterrupt(); int extentOfs = ex->myLoc.getOfs(); if (! (dl.a() == ex->myLoc.a() && dl.getOfs() + dr->lengthWithHeaders() > extentOfs && dl.getOfs() < extentOfs + ex->length) ) { return; } RecPos pos = RecPos::from(dl.getOfs(), dr->lengthWithHeaders(), extentOfs, params); bool spansRequestedArea = false; for (RecPos::SliceIterator it = pos.iterateSlices(); !it.end(); ++it) { DiskStorageData& slice = sliceData[it->sliceNum]; slice.freeRecords[bucketNum] += it->ratioHere; spansRequestedArea = true; } if (deletedRecordsArrayBuilder != NULL && spansRequestedArea) { BSONObjBuilder(deletedRecordsArrayBuilder->subobjStart()) .append("ofs", dl.getOfs() - extentOfs) .append("recBytes", dr->lengthWithHeaders()); } }
/**
 * Repair helper: walks every record of the extent at 'eLoc' (forward or
 * backward), logging each DiskLoc and its object, and returns the next extent
 * to visit (xnext when forward, xprev otherwise).  Returns a null DiskLoc when
 * the extent offset is obviously invalid.  'ns' is currently unused here.
 */
DiskLoc _repairExtent( Database* db , string ns, bool forward , DiskLoc eLoc ){
    LogIndentLevel lil;

    if ( eLoc.getOfs() <= 0 ){
        error() << "invalid extent ofs: " << eLoc.getOfs() << endl;
        return DiskLoc();
    }

    MongoDataFile * mdf = db->getFile( eLoc.a() );

    Extent * e = mdf->debug_getExtent( eLoc );
    if ( ! e->isOk() ){
        // repair is best-effort: warn but keep walking the extent anyway
        warning() << "Extent not ok magic: " << e->magic << " going to try to continue" << endl;
    }

    log() << "length:" << e->length << endl;

    LogIndentLevel lil2;

    DiskLoc loc = forward ? e->firstRecord : e->lastRecord;
    while ( ! loc.isNull() ){
        if ( loc.getOfs() <= 0 ){
            error() << "offset is 0 for record which should be impossible" << endl;
            break;
        }
        log() << loc << endl;
        Record* rec = loc.rec();
        log() << loc.obj() << endl; // logs the full BSON object at this loc
        loc = forward ? rec->getNext( loc ) : rec->getPrev( loc );
    }
    return forward ? e->xnext : e->xprev;
}
// bypass standard alloc/insert routines to use the extent we want. static DiskLoc insert( DiskLoc ext, int i ) { BSONObjBuilder b; b.append( "a", i ); BSONObj o = b.done(); int len = o.objsize(); Extent *e = ext.ext(); int ofs; if ( e->lastRecord.isNull() ) ofs = ext.getOfs() + ( e->extentData - (char *)e ); else ofs = e->lastRecord.getOfs() + e->lastRecord.rec()->lengthWithHeaders; DiskLoc dl( ext.a(), ofs ); Record *r = dl.rec(); r->lengthWithHeaders = Record::HeaderSize + len; r->extentOfs = e->myLoc.getOfs(); r->nextOfs = DiskLoc::NullOfs; r->prevOfs = e->lastRecord.isNull() ? DiskLoc::NullOfs : e->lastRecord.getOfs(); memcpy( r->data, o.objdata(), len ); if ( e->firstRecord.isNull() ) e->firstRecord = dl; else e->lastRecord.rec()->nextOfs = ofs; e->lastRecord = dl; return dl; }
/**
 * Pre-faults ("touches") every extent of namespace 'ns' into memory.
 * Phase 1: under a read context, collect each extent's fd/offset/length.
 * Phase 2: take LockMongoFilesShared (so the data files stay mapped), then
 * temporarily release the db lock and touch the pages, reporting progress and
 * honoring interrupts.
 */
void touchNs( const std::string& ns ) {
    std::vector< touch_location > ranges;
    Client::ReadContext ctx(ns);
    {
        NamespaceDetails *nsd = nsdetails(ns.c_str());
        uassert( 16154, "namespace does not exist", nsd );

        for( DiskLoc L = nsd->firstExtent; !L.isNull(); L = L.ext()->xnext ) {
            MongoDataFile* mdf = cc().database()->getFile( L.a() );
            massert( 16238, "can't fetch extent file structure", mdf );
            touch_location tl;
            tl.fd = mdf->getFd();
            tl.offset = L.getOfs();
            tl.ext = L.ext();
            tl.length = tl.ext->length;
            ranges.push_back(tl);
        }
    }

    LockMongoFilesShared lk; // keep files mapped while we touch pages
    Lock::TempRelease tr;    // drop the db lock for the slow touch loop

    std::string progress_msg = "touch " + ns + " extents";
    ProgressMeterHolder pm( cc().curop()->setMessage( progress_msg.c_str() , ranges.size() ) );
    for ( std::vector< touch_location >::iterator it = ranges.begin(); it != ranges.end(); ++it ) {
        touch_pages( it->fd, it->offset, it->length, it->ext );
        pm.hit();
        killCurrentOp.checkForInterrupt(false);
    }
    pm.finished();
}
/**
 * analyzeDiskStorage helper which processes a single live record: distributes
 * its byte/entry counters across every slice it overlaps (weighted by the
 * fraction of the record in each slice) and, when requested, emits a
 * per-record subobject into 'recordsArrayBuilder'.
 */
void processRecord(const DiskLoc& dl, const DiskLoc& prevDl, const Record* r, int extentOfs, const AnalyzeParams& params, vector<DiskStorageData>& sliceData, BSONArrayBuilder* recordsArrayBuilder) {
    killCurrentOp.checkForInterrupt();

    BSONObj obj = dl.obj();
    int recBytes = r->lengthWithHeaders();
    double characteristicFieldValue = 0;
    bool hasCharacteristicField = extractCharacteristicFieldValue(obj, params, characteristicFieldValue);
    // NOTE(review): compares file numbers only, not offsets -- a record that is
    // earlier in the *same* file does not count as out of order here; confirm
    // that is intended.
    bool isLocatedBeforePrevious = dl.a() < prevDl.a();

    RecPos pos = RecPos::from(dl.getOfs(), recBytes, extentOfs, params);
    bool spansRequestedArea = false;
    for (RecPos::SliceIterator it = pos.iterateSlices(); !it.end(); ++it) {
        spansRequestedArea = true;
        DiskStorageData& slice = sliceData[it->sliceNum];
        // counters are apportioned by the fraction of the record in this slice
        slice.numEntries += it->ratioHere;
        slice.recBytes += it->sizeHere;
        slice.bsonBytes += static_cast<long long>(it->ratioHere * obj.objsize());
        if (hasCharacteristicField) {
            slice.characteristicCount += it->ratioHere;
            slice.characteristicSum += it->ratioHere * characteristicFieldValue;
        }
        if (isLocatedBeforePrevious) {
            slice.outOfOrderRecs += it->ratioHere;
        }
    }

    if (recordsArrayBuilder != NULL && spansRequestedArea) {
        DEV {
            // sanity check: the record must genuinely overlap the requested
            // [startOfs, endOfs) window in one of three ways
            int startsAt = dl.getOfs() - extentOfs;
            int endsAt = startsAt + recBytes;
            verify((startsAt < params.startOfs && endsAt > params.startOfs) ||
                   (startsAt < params.endOfs && endsAt >= params.endOfs) ||
                   (startsAt >= params.startOfs && endsAt < params.endOfs));
        }
        BSONObjBuilder recordBuilder(recordsArrayBuilder->subobjStart());
        recordBuilder.append("ofs", dl.getOfs() - extentOfs);
        recordBuilder.append("recBytes", recBytes);
        recordBuilder.append("bsonBytes", obj.objsize());
        recordBuilder.append("_id", obj["_id"]);
        if (hasCharacteristicField) {
            recordBuilder.append("characteristic", characteristicFieldValue);
        }
        recordBuilder.doneFast();
    }
}
/* combine adjacent deleted records *for the current extent* of the capped
   collection.

   this is O(n^2) but we call it for capped tables where typically n==1 or 2!
   (or 3...there will be a little unused sliver at the end of the extent.)
*/
void NamespaceDetails::compact() {
    DDD( "NamespaceDetails::compact enter" );

    verify( isCapped() );

    vector<DiskLoc> drecs;

    // Pull out capExtent's DRs from deletedList
    DiskLoc i = cappedFirstDeletedInCurExtent();
    for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted() ) {
        DDD( "\t" << i );
        drecs.push_back( i );
    }

    // 'i' is now the first deleted record NOT in the cap extent (or null);
    // it becomes the new head of that list.
    getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;

    // sort by disk position so adjacency can be detected by offset arithmetic
    std::sort( drecs.begin(), drecs.end() );
    DDD( "\t drecs.size(): " << drecs.size() );

    vector<DiskLoc>::const_iterator j = drecs.begin();
    verify( j != drecs.end() );
    DiskLoc a = *j;
    while ( 1 ) {
        j++;
        if ( j == drecs.end() ) {
            DDD( "\t compact adddelrec" );
            addDeletedRec(a.drec(), a);
            break;
        }
        DiskLoc b = *j;
        while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders() == b.getOfs() ) {
            // a & b are adjacent.  merge.
            getDur().writingInt( a.drec()->lengthWithHeaders() ) += b.drec()->lengthWithHeaders();
            j++;
            if ( j == drecs.end() ) {
                DDD( "\t compact adddelrec2" );
                addDeletedRec(a.drec(), a);
                return;
            }
            b = *j;
        }
        DDD( "\t compact adddelrec3" );
        addDeletedRec(a.drec(), a);
        a = b;
    }
}
DiskLoc ExtentManager::getNextRecordInExtent( const DiskLoc& loc ) const {
    // Follow the forward link stored in the record header; links never leave
    // the record's extent.  Returns a null DiskLoc at the end of the chain.
    const int ofs = recordFor( loc )->nextOfs();
    if ( ofs != DiskLoc::NullOfs ) {
        fassert( 16967, abs(ofs) >= 8 ); // defensive
        return DiskLoc( loc.a(), ofs );
    }
    return DiskLoc();
}
DiskLoc RecordStoreV1Base::getNextRecordInExtent( const DiskLoc& loc ) const {
    // Follow the forward link stored in the record header; links never leave
    // the record's extent.  Returns a null DiskLoc at the end of the chain.
    const int ofs = recordFor( loc )->nextOfs();
    if ( ofs != DiskLoc::NullOfs ) {
        fassert( 17441, abs(ofs) >= 8 ); // defensive
        return DiskLoc( loc.a(), ofs );
    }
    return DiskLoc();
}
DiskLoc ExtentManager::getPrevRecordInExtent( const DiskLoc& loc ) const {
    // Follow the backward link stored in the record header; links never leave
    // the record's extent.  Returns a null DiskLoc at the start of the chain.
    const int ofs = recordFor( loc )->prevOfs();
    if ( ofs != DiskLoc::NullOfs ) {
        fassert( 16968, abs(ofs) >= 8 ); // defensive
        return DiskLoc( loc.a(), ofs );
    }
    return DiskLoc();
}
DiskLoc RecordStoreV1Base::getPrevRecordInExtent( OperationContext* txn, const DiskLoc& loc ) const {
    // Follow the backward link stored in the record header; links never leave
    // the record's extent.  Returns a null DiskLoc at the start of the chain.
    // ('txn' is part of the interface but not needed for this read.)
    const int ofs = recordFor( loc )->prevOfs();
    if ( ofs != DiskLoc::NullOfs ) {
        fassert( 17442, abs(ofs) >= 8 ); // defensive
        return DiskLoc( loc.a(), ofs );
    }
    return DiskLoc();
}
DiskLoc RecordStoreV1Base::getPrevRecordInExtent( const DiskLoc& loc ) const {
    // Follow the backward link stored in the record header; links never leave
    // the record's extent.  Returns a null DiskLoc at the start of the chain.
    const int ofs = recordFor( loc )->prevOfs();
    if ( ofs != DiskLoc::NullOfs ) {
        fassert( 17442, abs(ofs) >= 8 ); // defensive
        return DiskLoc( loc.a(), ofs );
    }
    return DiskLoc();
}
/* combine adjacent deleted records *for the current extent* of the capped
   collection.

   this is O(n^2) but we call it for capped tables where typically n==1 or 2!
   (or 3...there will be a little unused sliver at the end of the extent.)

   (older list<>-based variant of compact; see the vector-based version for the
   same algorithm with accessor-style field access.)
*/
void NamespaceDetails::compact() {
    assert(capped);

    list<DiskLoc> drecs;

    // Pull out capExtent's DRs from deletedList
    DiskLoc i = cappedFirstDeletedInCurExtent();
    for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted )
        drecs.push_back( i );

    // 'i' is now the first deleted record NOT in the cap extent (or null)
    getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;

    // This is the O(n^2) part.
    drecs.sort();

    list<DiskLoc>::iterator j = drecs.begin();
    assert( j != drecs.end() );
    DiskLoc a = *j;
    while ( 1 ) {
        j++;
        if ( j == drecs.end() ) {
            DEBUGGING out() << "TEMP: compact adddelrec\n";
            addDeletedRec(a.drec(), a);
            break;
        }
        DiskLoc b = *j;
        while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
            // a & b are adjacent.  merge.
            getDur().writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders;
            j++;
            if ( j == drecs.end() ) {
                DEBUGGING out() << "temp: compact adddelrec2\n";
                addDeletedRec(a.drec(), a);
                return;
            }
            b = *j;
        }
        DEBUGGING out() << "temp: compact adddelrec3\n";
        addDeletedRec(a.drec(), a);
        a = b;
    }
}
// Returns true when 'dl' falls inside the capped collection's current cap
// extent: same data file, at or after the extent's start, and not past its end.
bool NamespaceDetails::inCapExtent( const DiskLoc &dl ) const {
    invariant( !dl.isNull() );
    // must be in the same data file as the cap extent
    if ( dl.a() != _capExtent.a() )
        return false;
    // and at or after the extent's start offset
    if ( dl.getOfs() < _capExtent.getOfs() )
        return false;
    const Extent* e = theCapExtent();
    int end = _capExtent.getOfs() + e->length;
    // NOTE(review): '<=' also accepts an offset exactly at 'end' (one byte past
    // the extent's last byte).  Looks deliberate/legacy -- confirm before
    // tightening to '<'.
    return dl.getOfs() <= end;
}
// Packs a DiskLoc into this 56-bit on-disk form: the 4-byte offset plus the
// low 3 bytes of the file number.
void DiskLoc56Bit::operator=(const DiskLoc& loc) {
    ofs = loc.getOfs();
    int la = loc.a();
    invariant( la <= 0xffffff ); // must fit in 3 bytes
    if( la < 0 ) {
        // only the null DiskLoc (a == -1) may be negative; encode it as
        // file 0 with the sentinel null offset
        if ( la != -1 ) {
            log() << "btree diskloc isn't negative 1: " << la << std::endl;
            invariant ( la == -1 );
        }
        la = 0;
        ofs = OurNullOfs;
    }
    // copies the low-order 3 bytes of 'la' -- NOTE(review): assumes a
    // little-endian host layout ("endian" in the original comment)
    memcpy(_a, &la, 3); // endian
}
/**
 * Implements the bulk of the "validate" command for one namespace: verifies
 * the extent chain, optionally scans every record (scandata/full), audits the
 * deleted-record free lists, and runs fullValidate on each index.  Findings
 * and error strings are appended to 'result'; the overall status is reported
 * under the "valid" key.
 */
void validateNS(const char *ns, NamespaceDetails *d, const BSONObj& cmdObj, BSONObjBuilder& result) {
    const bool full = cmdObj["full"].trueValue();
    const bool scanData = full || cmdObj["scandata"].trueValue();

    bool valid = true;
    BSONArrayBuilder errors; // explanation(s) for why valid = false

    if ( d->isCapped() ){
        result.append("capped", d->isCapped());
        result.appendNumber("max", d->maxCappedDocs());
    }

    result.append("firstExtent", str::stream() << d->firstExtent.toString()
                  << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString());
    result.append( "lastExtent", str::stream() << d->lastExtent.toString()
                  << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString());

    BSONArrayBuilder extentData;

    // Walk the extent chain, asserting each extent header as we go.
    try {
        d->firstExtent.ext()->assertOk();
        d->lastExtent.ext()->assertOk();

        DiskLoc el = d->firstExtent;
        int ne = 0;
        while( !el.isNull() ) {
            Extent *e = el.ext();
            e->assertOk();
            el = e->xnext;
            ne++;
            if ( full )
                extentData << e->dump();
            killCurrentOp.checkForInterrupt();
        }
        result.append("extentCount", ne);
    }
    catch (...) {
        valid=false;
        errors << "extent asserted";
    }

    if ( full )
        result.appendArray( "extents" , extentData.arr() );

    result.appendNumber("datasize", d->stats.datasize);
    result.appendNumber("nrecords", d->stats.nrecords);
    result.appendNumber("lastExtentSize", d->lastExtentSize);
    result.appendNumber("padding", d->paddingFactor());

    try {

        try {
            result.append("firstExtentDetails", d->firstExtent.ext()->dump());
            valid = valid && d->firstExtent.ext()->validates() &&
                d->firstExtent.ext()->xprev.isNull();
        }
        catch (...) {
            errors << "exception firstextent";
            valid = false;
        }

        set<DiskLoc> recs;
        if( scanData ) {
            // Full record scan: counts objects, sizes, and (for capped
            // collections) out-of-order locations; with {full:true} also
            // validates each BSON object.
            shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
            int n = 0;
            int nInvalid = 0;
            long long len = 0;
            long long nlen = 0;
            int outOfOrder = 0;
            DiskLoc cl_last;
            while ( c->ok() ) {
                n++;

                DiskLoc cl = c->currLoc();
                if ( n < 1000000 )
                    recs.insert(cl); // bound memory: remember at most ~1M locs
                if ( d->isCapped() ) {
                    if ( cl < cl_last )
                        outOfOrder++;
                    cl_last = cl;
                }

                Record *r = c->_current();
                len += r->lengthWithHeaders();
                nlen += r->netLength();

                if (full){
                    BSONObj obj = BSONObj::make(r);
                    if (!obj.isValid() || !obj.valid()){ // both fast and deep checks
                        valid = false;
                        if (nInvalid == 0) // only log once;
                            errors << "invalid bson object detected (see logs for more info)";

                        nInvalid++;
                        if (strcmp("_id", obj.firstElementFieldName()) == 0){
                            try {
                                obj.firstElement().validate(); // throws on error
                                log() << "Invalid bson detected in " << ns
                                      << " with _id: " << obj.firstElement().toString(false) << endl;
                            }
                            catch(...){
                                log() << "Invalid bson detected in " << ns
                                      << " with corrupt _id" << endl;
                            }
                        }
                        else {
                            log() << "Invalid bson detected in " << ns
                                  << " and couldn't find _id" << endl;
                        }
                    }
                }

                c->advance();
            }
            if ( d->isCapped() && !d->capLooped() ) {
                result.append("cappedOutOfOrder", outOfOrder);
                if ( outOfOrder > 1 ) {
                    valid = false;
                    errors << "too many out of order records";
                }
            }
            result.append("objectsFound", n);

            if (full) {
                result.append("invalidObjects", nInvalid);
            }

            result.appendNumber("bytesWithHeaders", len);
            result.appendNumber("bytesWithoutHeaders", nlen);
        }

        // NOTE(review): deletedListArray is built here but never appended to
        // 'result' -- confirm whether reporting it was intended.
        BSONArrayBuilder deletedListArray;
        for ( int i = 0; i < Buckets; i++ ) {
            deletedListArray << d->deletedList[i].isNull();
        }

        // Audit every deleted-record chain: count/size entries and detect
        // live records that also appear on a free list.
        int ndel = 0;
        long long delSize = 0;
        int incorrect = 0;
        for ( int i = 0; i < Buckets; i++ ) {
            DiskLoc loc = d->deletedList[i];
            try {
                int k = 0;
                while ( !loc.isNull() ) {
                    if ( recs.count(loc) )
                        incorrect++; // a scanned live record is on the free list
                    ndel++;

                    if ( loc.questionable() ) {
                        if( d->isCapped() && !loc.isValid() && i == 1 ) {
                            /* the constructor for NamespaceDetails intentionally
                               sets deletedList[1] to invalid
                               see comments in namespace.h */
                            break;
                        }

                        if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) {
                            string err (str::stream() << "bad deleted loc: " << loc.toString()
                                        << " bucket:" << i << " k:" << k);
                            errors << err;

                            valid = false;
                            break;
                        }
                    }

                    DeletedRecord *d = loc.drec();
                    delSize += d->lengthWithHeaders();
                    loc = d->nextDeleted();
                    k++;
                    killCurrentOp.checkForInterrupt();
                }
            }
            catch (...) {
                errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i));
                valid = false;
            }
        }
        result.appendNumber("deletedCount", ndel);
        result.appendNumber("deletedSize", delSize);

        if ( incorrect ) {
            errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list");
            valid = false;
        }

        // Validate each index, recording its key count.
        int idxn = 0;
        try {
            result.append("nIndexes", d->nIndexes);
            BSONObjBuilder indexes; // not using subObjStart to be exception safe
            NamespaceDetails::IndexIterator i = d->ii();
            while( i.more() ) {
                IndexDetails& id = i.next();
                log() << "validating index " << idxn << ": " << id.indexNamespace() << endl;
                long long keys = id.idxInterface().fullValidate(id.head, id.keyPattern());
                indexes.appendNumber(id.indexNamespace(), keys);
                idxn++;
            }
            result.append("keysPerIndex", indexes.done());
        }
        catch (...) {
            errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn));
            valid=false;
        }

    }
    catch (AssertionException) {
        errors << "exception during validate";
        valid = false;
    }

    result.appendBool("valid", valid);
    result.append("errors", errors.arr());

    if ( !full ){
        result.append("warning", "Some checks omitted for speed. use {full:true} option to do more thorough scan.");
    }

    if ( !valid ) {
        result.append("advice", "ns corrupt, requires repair");
    }
}
DiskLoc DummyExtentManager::extentLocForV1( const DiskLoc& loc ) const {
    // In the dummy manager every extent starts at offset 0 of its file, so the
    // extent location is simply (file number, 0).
    const int fileNum = loc.a();
    return DiskLoc( fileNum, 0 );
}
DiskLoc MmapV1ExtentManager::extentLocForV1( const DiskLoc& loc ) const {
    // The record header stores the offset of its owning extent within the same
    // data file; combine that with the record's file number.
    const int extentOfs = recordForV1( loc )->extentOfs();
    return DiskLoc( loc.a(), extentOfs );
}
/**
 * Deletes the record at 'dl' (OperationContext era).  Same three phases as the
 * older variants -- unlink from the record chain, fix extent first/last
 * pointers, account stats and free the space -- but all writes are routed
 * through txn->recoveryUnit().
 */
void RecordStoreV1Base::deleteRecord( OperationContext* txn, const DiskLoc& dl ) {
    Record* todelete = recordFor( dl );
    invariant( todelete->netLength() >= 4 ); // this is required for defensive code

    /* remove ourself from the record next/prev chain */
    {
        if ( todelete->prevOfs() != DiskLoc::NullOfs ) {
            DiskLoc prev = getPrevRecordInExtent( txn, dl );
            Record* prevRecord = recordFor( prev );
            txn->recoveryUnit()->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs();
        }

        if ( todelete->nextOfs() != DiskLoc::NullOfs ) {
            DiskLoc next = getNextRecord( txn, dl );
            Record* nextRecord = recordFor( next );
            txn->recoveryUnit()->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs();
        }
    }

    /* remove ourself from extent pointers */
    {
        DiskLoc extentLoc = todelete->myExtentLoc(dl);
        Extent *e = _getExtent( txn, extentLoc );
        if ( e->firstRecord == dl ) {
            // declare the write before mutating firstRecord in place
            txn->recoveryUnit()->writing(&e->firstRecord);
            if ( todelete->nextOfs() == DiskLoc::NullOfs )
                e->firstRecord.Null();
            else
                e->firstRecord.set(dl.a(), todelete->nextOfs() );
        }
        if ( e->lastRecord == dl ) {
            txn->recoveryUnit()->writing(&e->lastRecord);
            if ( todelete->prevOfs() == DiskLoc::NullOfs )
                e->lastRecord.Null();
            else
                e->lastRecord.set(dl.a(), todelete->prevOfs() );
        }
    }

    /* add to the free list */
    {
        _details->incrementStats( txn, -1 * todelete->netLength(), -1 );

        if ( _isSystemIndexes ) {
            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
               careful until validated more, as IndexDetails has pointers to this
               disk location.  so an incorrectly done remove would cause a lot of
               problems. */
            memset( txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders() ), 0, todelete->lengthWithHeaders() );
        }
        else {
            // this is defensive so we can detect if we are still using a location
            // that was deleted
            memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4);
            addDeletedRec(txn, dl);
        }
    }
}
/**
 * Tries to satisfy an allocation of 'lenToAllocRaw' bytes from the existing
 * deleted-record free lists (first entry of each bucket only).  Returns a null
 * DiskLoc when nothing fits, in which case the caller must grow the store.
 * Oversized deleted records are split and the remainder re-freed.
 */
DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* txn, int lenToAllocRaw) {
    // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the
    // correct deleted list each time we try to allocate a new record. This ensures we won't
    // orphan any data when upgrading from old versions, without needing a long upgrade phase.
    // This is done before we try to allocate the new record so we can take advantage of the new
    // space immediately.
    {
        const DiskLoc head = _details->deletedListLegacyGrabBag();
        if (!head.isNull()) {
            _details->setDeletedListLegacyGrabBag(txn, drec(head)->nextDeleted());
            addDeletedRec(txn, head);
        }
    }

    // align size up to a multiple of 4
    const int lenToAlloc = (lenToAllocRaw + (4 - 1)) & ~(4 - 1);

    freelistAllocs.increment();
    DiskLoc loc;
    DeletedRecord* dr = NULL;
    {
        int myBucket;
        for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) {
            // Only look at the first entry in each bucket. This works because we are either
            // quantizing or allocating fixed-size blocks.
            const DiskLoc head = _details->deletedListEntry(myBucket);
            if (head.isNull())
                continue;
            DeletedRecord* const candidate = drec(head);
            if (candidate->lengthWithHeaders() >= lenToAlloc) {
                loc = head;
                dr = candidate;
                break;
            }
        }

        if (!dr)
            return DiskLoc(); // no space

        // Unlink ourself from the deleted list
        _details->setDeletedListEntry(txn, myBucket, dr->nextDeleted());
        *txn->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid(); // defensive
    }

    invariant(dr->extentOfs() < loc.getOfs());

    // Split the deleted record if it has at least as much left over space as our smallest
    // allocation size. Otherwise, just take the whole DeletedRecord.
    const int remainingLength = dr->lengthWithHeaders() - lenToAlloc;
    if (remainingLength >= bucketSizes[0]) {
        // shrink the allocated record, then carve a new DeletedRecord out of
        // the tail and return it to the free lists
        txn->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc;
        const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc);
        DeletedRecord* newDel = txn->recoveryUnit()->writing(drec(newDelLoc));
        newDel->extentOfs() = dr->extentOfs();
        newDel->lengthWithHeaders() = remainingLength;
        newDel->nextDeleted().Null();
        addDeletedRec(txn, newDelLoc);
    }

    return loc;
}
/**
 * Legacy best-fit allocator: walks the deleted lists starting at the bucket
 * for 'lenToAlloc', looking for the smallest record that fits (checking a few
 * extra entries for a better fit), unlinks the winner, and splits off the
 * unused tail when it is big enough to be worth keeping.  Returns a null
 * DiskLoc when no free record fits (caller must allocate a new extent).
 */
DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn, int lenToAlloc ) {
    // align size up to a multiple of 4
    lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1);

    freelistAllocs.increment();
    DiskLoc loc;
    {
        DiskLoc *prev = 0;
        DiskLoc *bestprev = 0;
        DiskLoc bestmatch;
        int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough
        int b = bucket(lenToAlloc);
        DiskLoc cur = _details->deletedListEntry(b);

        int extra = 5; // look for a better fit, a little.
        int chain = 0;
        while ( 1 ) {
            { // defensive check: fatally assert on a corrupt free-list link
                int fileNumber = cur.a();
                int fileOffset = cur.getOfs();
                if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) {
                    StringBuilder sb;
                    sb << "Deleted record list corrupted in collection " << _ns
                       << ", bucket " << b
                       << ", link number " << chain
                       << ", invalid link is " << cur.toString()
                       << ", throwing Fatal Assertion";
                    log() << sb.str() << endl;
                    fassertFailed(16469);
                }
            }

            if ( cur.isNull() ) {
                // move to next bucket.  if we were doing "extra", just break
                if ( bestmatchlen < INT_MAX )
                    break;

                if ( chain > 0 ) {
                    // if we looked at things in the right bucket, but they were not suitable
                    freelistBucketExhausted.increment();
                }

                b++;
                if ( b > MaxBucket ) {
                    // out of space. alloc a new extent.
                    freelistIterations.increment( 1 + chain );
                    return DiskLoc();
                }
                cur = _details->deletedListEntry(b);
                prev = 0;
                continue;
            }

            DeletedRecord *r = drec(cur);
            if ( r->lengthWithHeaders() >= lenToAlloc &&
                 r->lengthWithHeaders() < bestmatchlen ) {
                // new best fit so far
                bestmatchlen = r->lengthWithHeaders();
                bestmatch = cur;
                bestprev = prev;
                if (r->lengthWithHeaders() == lenToAlloc)
                    // exact match, stop searching
                    break;
            }

            if ( bestmatchlen < INT_MAX && --extra <= 0 )
                break;

            if ( ++chain > 30 && b <= MaxBucket ) {
                // too slow, force move to next bucket to grab a big chunk
                //b++;
                freelistIterations.increment( chain );
                chain = 0;
                cur.Null();
            }
            else {
                cur = r->nextDeleted();
                prev = &r->nextDeleted();
            }
        }

        // unlink ourself from the deleted list
        DeletedRecord *bmr = drec(bestmatch);
        if ( bestprev ) {
            *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted();
        }
        else {
            // should be the front of a free-list
            int myBucket = bucket(bmr->lengthWithHeaders());
            invariant( _details->deletedListEntry(myBucket) == bestmatch );
            _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted());
        }
        *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive.
        invariant(bmr->extentOfs() < bestmatch.getOfs());

        freelistIterations.increment( 1 + chain );
        loc = bestmatch;
    }

    if ( loc.isNull() )
        return loc;

    // determine if we should chop up
    DeletedRecord *r = drec(loc);

    /* note we want to grab from the front so our next pointers on disk tend
       to go in a forward direction which is important for performance. */
    int regionlen = r->lengthWithHeaders();
    invariant( r->extentOfs() < loc.getOfs() );

    int left = regionlen - lenToAlloc;
    if ( left < 24 || left < (lenToAlloc / 8) ) {
        // you get the whole thing.
        return loc;
    }

    // don't quantize:
    //   - $ collections (indexes) as we already have those aligned the way we want SERVER-8425
    if ( _normalCollection ) {
        // we quantize here so that it only impacts newly sized records
        // this prevents oddities with older records and space re-use SERVER-8435
        lenToAlloc = std::min( r->lengthWithHeaders(),
                               quantizeAllocationSpace( lenToAlloc ) );
        left = regionlen - lenToAlloc;

        if ( left < 24 ) {
            // you get the whole thing.
            return loc;
        }
    }

    /* split off some for further use. */
    txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
    DiskLoc newDelLoc = loc;
    newDelLoc.inc(lenToAlloc);
    DeletedRecord* newDel = drec(newDelLoc);
    DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
    newDelW->extentOfs() = r->extentOfs();
    newDelW->lengthWithHeaders() = left;
    newDelW->nextDeleted().Null();

    addDeletedRec( txn, newDelLoc );

    return loc;
}
/**
 * Legacy validate implementation: returns a human-readable report string
 * instead of building a BSON result.  Checks the extent chain, optionally
 * scans records (scandata), audits the deleted lists, and validates each
 * index's btree.
 */
string validateNS(const char *ns, NamespaceDetails *d, BSONObj *cmdObj) {
    bool scanData = true;
    if( cmdObj && cmdObj->hasElement("scandata") && !cmdObj->getBoolField("scandata") )
        scanData = false;
    bool valid = true;
    stringstream ss;
    ss << "\nvalidate\n";
    //ss << "  details: " << hex << d << " ofs:" << nsindex(ns)->detailsOffset(d) << dec << endl;
    if ( d->capped )
        ss << "  capped:" << d->capped << " max:" << d->max << '\n';

    ss << "  firstExtent:" << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString()<< '\n';
    ss << "  lastExtent:" << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString() << '\n';
    // Walk and assert the extent chain.
    try {
        d->firstExtent.ext()->assertOk();
        d->lastExtent.ext()->assertOk();

        DiskLoc el = d->firstExtent;
        int ne = 0;
        while( !el.isNull() ) {
            Extent *e = el.ext();
            e->assertOk();
            el = e->xnext;
            ne++;
            killCurrentOp.checkForInterrupt();
        }
        ss << "  # extents:" << ne << '\n';
    }
    catch (...) {
        valid=false;
        ss << " extent asserted ";
    }

    ss << "  datasize?:" << d->stats.datasize << " nrecords?:" << d->stats.nrecords << " lastExtentSize:" << d->lastExtentSize << '\n';
    ss << "  padding:" << d->paddingFactor << '\n';
    try {

        try {
            ss << "  first extent:\n";
            d->firstExtent.ext()->dump(ss);
            valid = valid && d->firstExtent.ext()->validates();
        }
        catch (...) {
            ss << "\n    exception firstextent\n" << endl;
        }

        set<DiskLoc> recs;
        if( scanData ) {
            // Full record scan: count objects/bytes and (for capped
            // collections) out-of-order locations.
            shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
            int n = 0;
            long long len = 0;
            long long nlen = 0;
            int outOfOrder = 0;
            DiskLoc cl_last;
            while ( c->ok() ) {
                n++;

                DiskLoc cl = c->currLoc();
                if ( n < 1000000 )
                    recs.insert(cl); // bound memory: remember at most ~1M locs
                if ( d->capped ) {
                    if ( cl < cl_last )
                        outOfOrder++;
                    cl_last = cl;
                }

                Record *r = c->_current();
                len += r->lengthWithHeaders;
                nlen += r->netLength();
                c->advance();
            }
            if ( d->capped && !d->capLooped() ) {
                ss << "  capped outOfOrder:" << outOfOrder;
                if ( outOfOrder > 1 ) {
                    valid = false;
                    ss << " ???";
                }
                else ss << " (OK)";
                ss << '\n';
            }
            ss << "  " << n << " objects found, nobj:" << d->stats.nrecords << '\n';
            ss << "  " << len << " bytes data w/headers\n";
            ss << "  " << nlen << " bytes data wout/headers\n";
        }

        ss << "  deletedList: ";
        for ( int i = 0; i < Buckets; i++ ) {
            ss << (d->deletedList[i].isNull() ? '0' : '1');
        }
        ss << endl;
        // Audit each deleted-record chain; detect live records on a free list.
        int ndel = 0;
        long long delSize = 0;
        int incorrect = 0;
        for ( int i = 0; i < Buckets; i++ ) {
            DiskLoc loc = d->deletedList[i];
            try {
                int k = 0;
                while ( !loc.isNull() ) {
                    if ( recs.count(loc) )
                        incorrect++; // a scanned live record is on the free list
                    ndel++;

                    if ( loc.questionable() ) {
                        if( d->capped && !loc.isValid() && i == 1 ) {
                            /* the constructor for NamespaceDetails intentionally
                               sets deletedList[1] to invalid
                               see comments in namespace.h */
                            break;
                        }

                        if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) {
                            ss << "    ?bad deleted loc: " << loc.toString() << " bucket:" << i << " k:" << k << endl;
                            valid = false;
                            break;
                        }
                    }

                    DeletedRecord *d = loc.drec();
                    delSize += d->lengthWithHeaders;
                    loc = d->nextDeleted;
                    k++;
                    killCurrentOp.checkForInterrupt();
                }
            }
            catch (...) {
                ss <<"    ?exception in deleted chain for bucket " << i << endl;
                valid = false;
            }
        }
        ss << "  deleted: n: " << ndel << " size: " << delSize << endl;
        if ( incorrect ) {
            ss << "    ?corrupt: " << incorrect << " records from datafile are in deleted list\n";
            valid = false;
        }

        // Validate each index's btree.
        // NOTE(review): 'idxn' is never incremented inside the loop, so the
        // exception message always reports idxn:0 -- confirm intended.
        int idxn = 0;
        try {
            ss << "  nIndexes:" << d->nIndexes << endl;
            NamespaceDetails::IndexIterator i = d->ii();
            while( i.more() ) {
                IndexDetails& id = i.next();
                ss << "    " << id.indexNamespace() << " keys:" <<
                    id.head.btree()->fullValidate(id.head, id.keyPattern()) << endl;
            }
        }
        catch (...) {
            ss << "\n    exception during index validate idxn:" << idxn << endl;
            valid=false;
        }

    }
    catch (AssertionException) {
        ss << "\n    exception during validate\n" << endl;
        valid = false;
    }

    if ( !valid )
        ss << " ns corrupt, requires dbchk\n";

    return ss.str();
}
/**
 * Test helper: builds a RecordStoreV1 with exactly the record and deleted-record
 * layout described by the caller.
 *
 * @param records  NullLoc-terminated array of (loc, size) describing every live
 *                 record, grouped by extent in file order; may be NULL.
 * @param drecs    NullLoc-terminated array of (loc, size) describing every
 *                 deleted record; for non-capped stores they must be sorted by
 *                 bucket; may be NULL.
 * @param em       extent manager to allocate into; must be empty on entry.
 * @param md       metadata to populate; firstExtent must be Null on entry.
 *
 * At least one of records/drecs must be non-NULL. On exit the constructed state
 * is self-checked via assertStateV1RS().
 */
void initializeV1RS(OperationContext* txn,
                    const LocAndSize* records,
                    const LocAndSize* drecs,
                    DummyExtentManager* em,
                    DummyRecordStoreV1MetaData* md) {
    invariant(records || drecs); // if both are NULL nothing is being created...

    // Need to start with a blank slate
    invariant(em->numFiles() == 0);
    invariant(md->firstExtent().isNull());

    // pre-allocate extents (even extents that aren't part of this RS)
    {
        typedef std::map<int, size_t> ExtentSizes;
        ExtentSizes extentSizes;
        accumulateExtentSizeRequirements(records, &extentSizes);
        accumulateExtentSizeRequirements(drecs, &extentSizes);
        invariant(!extentSizes.empty());

        const int maxExtent = extentSizes.rbegin()->first;
        for (int i = 0; i <= maxExtent; i++) {
            // Allocate every file number up to the highest one referenced, even
            // extents that hold no records, so that loc.a() == file index holds.
            const size_t size = extentSizes.count(i) ? extentSizes[i] : 0;
            const DiskLoc loc = em->allocateExtent(txn, md->isCapped(), size, 0);

            // This function and assertState depend on these details of DummyExtentManager
            invariant(loc.a() == i);
            invariant(loc.getOfs() == 0);
        }

        // link together extents that should be part of this RS
        md->setFirstExtent(txn, DiskLoc(extentSizes.begin()->first, 0));
        md->setLastExtent(txn, DiskLoc(extentSizes.rbegin()->first, 0));
        for (ExtentSizes::iterator it = extentSizes.begin();
                boost::next(it) != extentSizes.end(); /* ++it */ ) {
            const int a = it->first;
            ++it;
            const int b = it->first;
            em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0);
            em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0);
        }

        // This signals "done allocating new extents".
        if (md->isCapped())
            md->setDeletedListEntry(txn, 1, DiskLoc());
    }

    if (records && !records[0].loc.isNull()) {
        int recIdx = 0;
        DiskLoc extLoc = md->firstExtent();
        while (!extLoc.isNull()) {
            Extent* ext = em->getExtent(extLoc);
            int prevOfs = DiskLoc::NullOfs;
            while (extLoc.a() == records[recIdx].loc.a()) { // for all records in this extent
                const DiskLoc loc = records[recIdx].loc;
                const int size = records[recIdx].size; // (was "size;;" — stray empty statement removed)
                invariant(size >= Record::HeaderSize);

                md->incrementStats(txn, size - Record::HeaderSize, 1);

                if (ext->firstRecord.isNull())
                    ext->firstRecord = loc;

                // Wire this record into the extent's doubly-linked record chain.
                Record* rec = em->recordForV1(loc);
                rec->lengthWithHeaders() = size;
                rec->extentOfs() = 0;

                rec->prevOfs() = prevOfs;
                prevOfs = loc.getOfs();

                const DiskLoc nextLoc = records[recIdx + 1].loc;
                if (nextLoc.a() == loc.a()) { // if next is in same extent
                    rec->nextOfs() = nextLoc.getOfs();
                }
                else {
                    rec->nextOfs() = DiskLoc::NullOfs;
                    ext->lastRecord = loc;
                }

                recIdx++;
            }
            extLoc = ext->xnext;
        }
        // the terminating Null entry must have been reached exactly at the end
        invariant(records[recIdx].loc.isNull());
    }

    if (drecs && !drecs[0].loc.isNull()) {
        int drecIdx = 0;
        DiskLoc* prevNextPtr = NULL; // points at the nextDeleted field to patch next
        int lastBucket = -1;
        while (!drecs[drecIdx].loc.isNull()) {
            const DiskLoc loc = drecs[drecIdx].loc;
            const int size = drecs[drecIdx].size;
            invariant(size >= Record::HeaderSize);
            const int bucket = RecordStoreV1Base::bucket(size);

            if (md->isCapped()) {
                // All drecs form a single list in bucket 0
                if (prevNextPtr == NULL) {
                    md->setDeletedListEntry(txn, 0, loc);
                }
                else {
                    *prevNextPtr = loc;
                }

                if (loc.a() < md->capExtent().a()
                        && drecs[drecIdx + 1].loc.a() == md->capExtent().a()) {
                    // Bucket 1 is known as cappedLastDelRecLastExtent
                    md->setDeletedListEntry(txn, 1, loc);
                }
            }
            else if (bucket != lastBucket) {
                invariant(bucket > lastBucket); // if this fails, drecs weren't sorted by bucket
                md->setDeletedListEntry(txn, bucket, loc);
                lastBucket = bucket;
            }
            else {
                *prevNextPtr = loc;
            }

            DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
            drec->lengthWithHeaders() = size;
            drec->extentOfs() = 0;
            drec->nextDeleted() = DiskLoc();
            prevNextPtr = &drec->nextDeleted();

            drecIdx++;
        }
    }

    // Make sure we set everything up as requested.
    assertStateV1RS(records, drecs, em, md);
}
DiskLoc _repairExtent( Database* db , string ns, bool forward , DiskLoc eLoc , Writer& w ){ LogIndentLevel lil; if ( eLoc.getOfs() <= 0 ){ error() << "invalid extent ofs: " << eLoc.getOfs() << endl; return DiskLoc(); } MongoDataFile * mdf = db->getFile( eLoc.a() ); Extent * e = mdf->debug_getExtent( eLoc ); if ( ! e->isOk() ){ warning() << "Extent not ok magic: " << e->magic << " going to try to continue" << endl; } log() << "length:" << e->length << endl; LogIndentLevel lil2; set<DiskLoc> seen; DiskLoc loc = forward ? e->firstRecord : e->lastRecord; while ( ! loc.isNull() ){ if ( ! seen.insert( loc ).second ) { error() << "infinite loop in extend, seen: " << loc << " before" << endl; break; } if ( loc.getOfs() <= 0 ){ error() << "offset is 0 for record which should be impossible" << endl; break; } log(1) << loc << endl; Record* rec = loc.rec(); BSONObj obj; try { obj = loc.obj(); assert( obj.valid() ); LOG(1) << obj << endl; w( obj ); } catch ( std::exception& e ) { log() << "found invalid document @ " << loc << " " << e.what() << endl; if ( ! obj.isEmpty() ) { try { BSONElement e = obj.firstElement(); stringstream ss; ss << "first element: " << e; log() << ss.str(); } catch ( std::exception& ) { } } } loc = forward ? rec->getNext( loc ) : rec->getPrev( loc ); } return forward ? e->xnext : e->xprev; }
/**
 * Test helper: asserts that the record store described by (em, md) matches the
 * expected layout exactly.
 *
 * @param records  NullLoc-terminated expected live records in traversal order;
 *                 may be NULL to skip record checking.
 * @param drecs    NullLoc-terminated expected deleted records in deleted-list
 *                 order; may be NULL to skip drec checking.
 *
 * At least one of records/drecs must be non-NULL. On any assertion failure the
 * actual record and drec lists are printed before the exception propagates.
 */
void assertStateV1RS(const LocAndSize* records,
                     const LocAndSize* drecs,
                     const ExtentManager* em,
                     const DummyRecordStoreV1MetaData* md) {
    invariant(records || drecs); // if both are NULL nothing is being asserted...

    try {
        if (records) {
            long long dataSize = 0; // net bytes seen while walking; compared to md at the end
            long long numRecs = 0;

            int recIdx = 0;

            DiskLoc extLoc = md->firstExtent();
            while (!extLoc.isNull()) { // for each Extent
                Extent* ext = em->getExtent(extLoc, true);
                int expectedPrevOfs = DiskLoc::NullOfs;
                DiskLoc actualLoc = ext->firstRecord;
                while (!actualLoc.isNull()) { // for each Record in this Extent
                    const Record* actualRec = em->recordForV1(actualLoc);
                    const int actualSize = actualRec->lengthWithHeaders();

                    dataSize += actualSize - Record::HeaderSize;
                    numRecs += 1;

                    // record must match the next expected entry exactly
                    ASSERT_EQUALS(actualLoc, records[recIdx].loc);
                    ASSERT_EQUALS(actualSize, records[recIdx].size);

                    // intra-extent back-pointers must form a consistent chain
                    ASSERT_EQUALS(actualRec->extentOfs(), extLoc.getOfs());
                    ASSERT_EQUALS(actualRec->prevOfs(), expectedPrevOfs);
                    expectedPrevOfs = actualLoc.getOfs();

                    recIdx++;
                    const int nextOfs = actualRec->nextOfs();
                    actualLoc = (nextOfs == DiskLoc::NullOfs ? DiskLoc()
                                                             : DiskLoc(actualLoc.a(), nextOfs));
                }

                if (ext->xnext.isNull()) {
                    // last extent in the chain must be the one recorded in metadata
                    ASSERT_EQUALS(md->lastExtent(), extLoc);
                }

                extLoc = ext->xnext;
            }

            // both the expected and actual record lists must be done at this point
            ASSERT_EQUALS(records[recIdx].loc, DiskLoc());

            // metadata stats must agree with what the walk observed
            ASSERT_EQUALS(dataSize, md->dataSize());
            ASSERT_EQUALS(numRecs, md->numRecords());
        }

        if (drecs) {
            int drecIdx = 0;
            for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) {
                DiskLoc actualLoc = md->deletedListEntry(bucketIdx);

                if (md->isCapped() && bucketIdx == 1) {
                    // In capped collections, the 2nd bucket (index 1) points to the drec before
                    // the first drec in the capExtent. If the capExtent is the first Extent,
                    // it should be Null.

                    if (md->capExtent() == md->firstExtent()) {
                        ASSERT_EQUALS(actualLoc, DiskLoc());
                    }
                    else {
                        ASSERT_NOT_EQUALS(actualLoc.a(), md->capExtent().a());
                        const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
                        ASSERT_EQUALS(actualDrec->nextDeleted().a(), md->capExtent().a());
                    }

                    // Don't do normal checking of bucket 1 in capped collections. Checking
                    // other buckets to verify that they are Null.
                    continue;
                }

                while (!actualLoc.isNull()) {
                    const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
                    const int actualSize = actualDrec->lengthWithHeaders();

                    ASSERT_EQUALS(actualLoc, drecs[drecIdx].loc);
                    ASSERT_EQUALS(actualSize, drecs[drecIdx].size);

                    // Make sure the drec is correct
                    ASSERT_EQUALS(actualDrec->extentOfs(), 0);

                    // in capped collections all drecs are linked into a single list in bucket 0
                    ASSERT_EQUALS(bucketIdx, md->isCapped() ? 0 : RecordStoreV1Base::bucket(actualSize));

                    drecIdx++;
                    actualLoc = actualDrec->nextDeleted();
                }
            }
            // both the expected and actual deleted lists must be done at this point
            ASSERT_EQUALS(drecs[drecIdx].loc, DiskLoc());
        }
    }
    catch (...) {
        // If a test fails, provide extra info to make debugging easier
        printRecList(em, md);
        printDRecList(em, md);
        throw;
    }
}
/**
 * Compacts a single extent: copies every valid record into newly allocated
 * space (applying the requested padding factor `pf` and padding bytes `pb`),
 * feeds each copied document's keys to the index-rebuild phase, then frees the
 * old extent. The extent being compacted must be the collection's firstExtent
 * and must not be its lastExtent (verified below).
 *
 * @return number of skipped (invalid) documents
 */
unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
            const scoped_array<IndexSpec> &indexSpecs,
            scoped_array<SortPhaseOne>& phase1, int nidx, bool validate,
            double pf, int pb)
{
    log() << "compact begin extent #" << n << " for namespace " << ns << endl;
    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;
    Extent *e = diskloc.ext();
    e->assertOk();
    verify( e->validates() );
    unsigned skipped = 0;

    {
        // the next/prev pointers within the extent might not be in order so we first page the whole thing in
        // sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        MongoDataFile* mdf = cc().database()->getFile( diskloc.a() );
        HANDLE fd = mdf->getFd();
        int offset = diskloc.getOfs();
        Extent* ext = diskloc.ext();
        size_t length = ext->length;

        touch_pages(fd, offset, length, ext);
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0; // net bytes of the copied records, added to d->stats below
        long long nrecords = 0;
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                // advance L before we reallocate, since recOld's links are about to go stale
                L = recOld->nextInExtent(L);
                BSONObj objOld = BSONObj::make(recOld);

                if( !validate || objOld.valid() ) {
                    nrecords++;
                    unsigned sz = objOld.objsize();

                    oldObjSize += sz;
                    oldObjSizeWithPadding += recOld->netLength();

                    unsigned lenWHdr = sz + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;
                    {
                        // apply padding factor and padding bytes, then quantize;
                        // fall back to the unpadded length if the result is
                        // nonsensical (underflow or absurdly large)
                        lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                        lenWPadding += pb;
                        lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                        if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                            lenWPadding = lenWHdr;
                        }
                    }
                    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                    uassert(14024, "compact error out of space during compaction", !loc.isNull());
                    Record *recNew = loc.rec();
                    datasize += recNew->netLength();
                    // declare the journal write before mutating the new record
                    recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                    addRecordToRecListInExtent(recNew, loc);
                    memcpy(recNew->data(), objOld.objdata(), sz);

                    {
                        // extract keys for all indexes we will be rebuilding
                        for( int x = 0; x < nidx; x++ ) {
                            phase1[x].addKeys(indexSpecs[x], objOld, loc);
                        }
                    }
                }
                else {
                    // cap the log spam for collections with many invalid objects
                    if( ++skipped <= 10 )
                        log() << "compact skipping invalid object" << endl;
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent. it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    // truncate the old extent's chain at L so already-copied
                    // records aren't replayed if we stop/commit here
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt(false);
                }
            }
        } // if !L.isNull()

        // unlink the now-empty extent from the front of the collection and free it
        verify( d->firstExtent == diskloc );
        verify( d->lastExtent != diskloc );
        DiskLoc newFirst = e->xnext;
        d->firstExtent.writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        freeExtents( diskloc, diskloc );

        // update datasize/record count for this namespace's extent
        {
            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
            s->datasize += datasize;
            s->nrecords += nrecords;
        }

        getDur().commitIfNeeded();

        {
            // report the old padding ratio (bytes-with-padding / raw object bytes)
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact finished extent #" << n << " containing " << nrecords << " documents ("
                  << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
                  << endl;
        }
    }

    return skipped;
}