Example #1
    DiskLoc _repairExtent( Database* db , string ns, bool forward , DiskLoc eLoc ){
        LogIndentLevel lil;
        
        if ( eLoc.getOfs() <= 0 ){
            error() << "invalid extent ofs: " << eLoc.getOfs() << endl;
            return DiskLoc();
        }
        

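        // a() selects the data file; getOfs() is the byte offset within that file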
        MongoDataFile * mdf = db->getFile( eLoc.a() );

        Extent * e = mdf->debug_getExtent( eLoc );
        if ( ! e->isOk() ){
            warning() << "Extent not ok magic: " << e->magic << " going to try to continue" << endl;
        }
        
        log() << "length:" << e->length << endl;
        
        LogIndentLevel lil2;
        
        DiskLoc loc = forward ? e->firstRecord : e->lastRecord;
        while ( ! loc.isNull() ){
            if ( loc.getOfs() <= 0 ){
                error() << "offset is 0 for record which should be impossible" << endl;
                break;
            }
            log() << loc << endl;
            Record* rec = loc.rec();
            log() << loc.obj() << endl;
            loc = forward ? rec->getNext( loc ) : rec->getPrev( loc );
        }
        return forward ? e->xnext : e->xprev;
        
    }
Example #2
    /**
     * analyzeDiskStorage helper which processes a single record.
     */
    void processDeletedRecord(const DiskLoc& dl, const DeletedRecord* dr, const Extent* ex,
                              const AnalyzeParams& params, int bucketNum,
                              vector<DiskStorageData>& sliceData,
                              BSONArrayBuilder* deletedRecordsArrayBuilder) {

        killCurrentOp.checkForInterrupt();

        int extentOfs = ex->myLoc.getOfs();

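        // ignore deleted records that do not fall inside this extent's file and byte range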
        if (! (dl.a() == ex->myLoc.a() &&
               dl.getOfs() + dr->lengthWithHeaders() > extentOfs &&
               dl.getOfs() < extentOfs + ex->length) ) {

            return;
        }

        RecPos pos = RecPos::from(dl.getOfs(), dr->lengthWithHeaders(), extentOfs, params);
        bool spansRequestedArea = false;
        for (RecPos::SliceIterator it = pos.iterateSlices(); !it.end(); ++it) {

            DiskStorageData& slice = sliceData[it->sliceNum];
            slice.freeRecords[bucketNum] += it->ratioHere;
            spansRequestedArea = true;
        }

        if (deletedRecordsArrayBuilder != NULL && spansRequestedArea) {
            BSONObjBuilder(deletedRecordsArrayBuilder->subobjStart())
                .append("ofs", dl.getOfs() - extentOfs)
                .append("recBytes", dr->lengthWithHeaders());
        }
    }
Example #3
    bool NamespaceDetails::inCapExtent( const DiskLoc &dl ) const {
        invariant( !dl.isNull() );

        if ( dl.a() != _capExtent.a() )
            return false;

        if ( dl.getOfs() < _capExtent.getOfs() )
            return false;

        const Extent* e = theCapExtent();
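        // the cap extent spans [getOfs(), getOfs() + length] within its data file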
        int end = _capExtent.getOfs() + e->length;
        return dl.getOfs() <= end;
    }
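For reference, a self-contained sketch of the same containment test; CapLoc, inCapExtentSketch, and the driver below are illustrative stand-ins, not MongoDB types:

    #include <cassert>

    // Simplified disk location: data-file number plus byte offset within that file.
    struct CapLoc { int file; int ofs; };

    // Same test as NamespaceDetails::inCapExtent above: the location must live in
    // the cap extent's file, at an offset within [extent start, extent start + length].
    bool inCapExtentSketch( const CapLoc& dl, const CapLoc& capExtent, int extentLen ) {
        if ( dl.file != capExtent.file )
            return false;
        if ( dl.ofs < capExtent.ofs )
            return false;
        return dl.ofs <= capExtent.ofs + extentLen;
    }

    int main() {
        CapLoc cap = { 0, 4096 };
        CapLoc inside = { 0, 5000 };
        CapLoc otherFile = { 1, 5000 };
        CapLoc before = { 0, 100 };
        assert( inCapExtentSketch( inside, cap, 8192 ) );
        assert( !inCapExtentSketch( otherFile, cap, 8192 ) );  // different file
        assert( !inCapExtentSketch( before, cap, 8192 ) );     // before the extent
        return 0;
    }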
Example #4
    DiskLoc ExtentManager::_createExtentInFile( int fileNo, DataFile* f,
                                                int size, int maxFileNoForQuota ) {

        size = ExtentManager::quantizeExtentSize( size );

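        // quota enforcement: if this file index exceeds the quota, warn when we
        // cannot safely throw mid-write, otherwise abort the allocation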
        if ( maxFileNoForQuota > 0 && fileNo - 1 >= maxFileNoForQuota ) {
            if ( cc().hasWrittenThisPass() ) {
                warning() << "quota exceeded, but can't assert" << endl;
            }
            else {
                _quotaExceeded();
            }
        }

        massert( 10358, "bad new extent size", size >= Extent::minSize() && size <= Extent::maxSize() );

        DiskLoc loc = f->allocExtentArea( size );
        loc.assertOk();

        Extent *e = getExtent( loc, false );
        verify( e );

        getDur().writing(e)->init("", size, fileNo, loc.getOfs(), false);

        return loc;
    }
Example #5
    DiskLoc NamespaceDetails::__capAlloc( int len ) {
        DiskLoc prev = cappedLastDelRecLastExtent();
        DiskLoc i = cappedFirstDeletedInCurExtent();
        DiskLoc ret;
        for (; !i.isNull() && inCapExtent( i ); prev = i, i = i.drec()->nextDeleted() ) {
            // We need to keep at least one DR per extent in cappedListOfAllDeletedRecords(),
            // so make sure there's space to create a DR at the end.
            if ( i.drec()->lengthWithHeaders() >= len + 24 ) {
                ret = i;
                break;
            }
        }

        /* unlink ourself from the deleted list */
        if ( !ret.isNull() ) {
            if ( prev.isNull() )
                cappedListOfAllDeletedRecords().writing() = ret.drec()->nextDeleted();
            else
                prev.drec()->nextDeleted().writing() = ret.drec()->nextDeleted();
            ret.drec()->nextDeleted().writing().setInvalid(); // defensive.
            verify( ret.drec()->extentOfs() < ret.getOfs() );
        }

        return ret;
    }
Example #6
 // bypass standard alloc/insert routines to use the extent we want.
 static DiskLoc insert( DiskLoc ext, int i ) {
     BSONObjBuilder b;
     b.append( "a", i );
     BSONObj o = b.done();
     int len = o.objsize();
     Extent *e = ext.ext();
     int ofs;
     if ( e->lastRecord.isNull() )
         ofs = ext.getOfs() + ( e->extentData - (char *)e );
     else
         ofs = e->lastRecord.getOfs() + e->lastRecord.rec()->lengthWithHeaders;
     DiskLoc dl( ext.a(), ofs );
     Record *r = dl.rec();
     r->lengthWithHeaders = Record::HeaderSize + len;
     r->extentOfs = e->myLoc.getOfs();
     r->nextOfs = DiskLoc::NullOfs;
     r->prevOfs = e->lastRecord.isNull() ? DiskLoc::NullOfs : e->lastRecord.getOfs();
     memcpy( r->data, o.objdata(), len );
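     // link the new record into the extent's record list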
     if ( e->firstRecord.isNull() )
         e->firstRecord = dl;
     else
         e->lastRecord.rec()->nextOfs = ofs;
     e->lastRecord = dl;
     return dl;
 }
Example #7
    Extent* ExtentManager::_createExtentInFile( int fileNo, DataFile* f,
                                                const char* ns, int size, bool newCapped,
                                                bool enforceQuota ) {

        size = ExtentManager::quantizeExtentSize( size );

        if ( enforceQuota ) {
            if ( fileIndexExceedsQuota( ns, fileNo - 1 ) ) {
                if ( cc().hasWrittenThisPass() ) {
                    warning() << "quota exceeded, but can't assert "
                              << " going over quota for: " << ns << " " << fileNo << endl;
                }
                else {
                    _quotaExceeded();
                }
            }
        }

        massert( 10358, "bad new extent size", size >= Extent::minSize() && size <= Extent::maxSize() );

        DiskLoc loc = f->allocExtentArea( size );
        loc.assertOk();

        Extent *e = getExtent( loc, false );
        verify( e );

        DiskLoc emptyLoc = getDur().writing(e)->init(ns, size, fileNo, loc.getOfs(), newCapped);

        addNewExtentToNamespace(ns, e, loc, emptyLoc, newCapped);

        LOG(1) << "ExtentManager: creating new extent for: " << ns << " in file: " << fileNo
               << " size: " << size << endl;

        return e;
    }
Example #8
    void touchNs( const std::string& ns ) { 
        std::vector< touch_location > ranges;
        Client::ReadContext ctx(ns);
        {

            NamespaceDetails *nsd = nsdetails(ns.c_str());
            uassert( 16154, "namespace does not exist", nsd );
            
            for( DiskLoc L = nsd->firstExtent; !L.isNull(); L = L.ext()->xnext )  {
                MongoDataFile* mdf = cc().database()->getFile( L.a() );
                massert( 16238, "can't fetch extent file structure", mdf );
                touch_location tl;
                tl.fd = mdf->getFd();
                tl.offset = L.getOfs();
                tl.ext = L.ext();
                tl.length = tl.ext->length;
                
                ranges.push_back(tl);                
            }

        }
        LockMongoFilesShared lk;
        Lock::TempRelease tr;
        std::string progress_msg = "touch " + ns + " extents";
        ProgressMeterHolder pm( cc().curop()->setMessage( progress_msg.c_str() , ranges.size() ) );
        for ( std::vector< touch_location >::iterator it = ranges.begin(); it != ranges.end(); ++it ) {
            touch_pages( it->fd, it->offset, it->length, it->ext );
            pm.hit();
            killCurrentOp.checkForInterrupt(false);
        }
        pm.finished();
    }
Example #9
    /**
     * analyzeDiskStorage helper which processes a single record.
     */
    void processRecord(const DiskLoc& dl, const DiskLoc& prevDl, const Record* r, int extentOfs,
                       const AnalyzeParams& params, vector<DiskStorageData>& sliceData,
                       BSONArrayBuilder* recordsArrayBuilder) {
        killCurrentOp.checkForInterrupt();

        BSONObj obj = dl.obj();
        int recBytes = r->lengthWithHeaders();
        double characteristicFieldValue = 0;
        bool hasCharacteristicField = extractCharacteristicFieldValue(obj, params,
                                                                      characteristicFieldValue);
        bool isLocatedBeforePrevious = dl.a() < prevDl.a();

        RecPos pos = RecPos::from(dl.getOfs(), recBytes, extentOfs, params);
        bool spansRequestedArea = false;
        for (RecPos::SliceIterator it = pos.iterateSlices(); !it.end(); ++it) {
            spansRequestedArea = true;
            DiskStorageData& slice = sliceData[it->sliceNum];
            slice.numEntries += it->ratioHere;
            slice.recBytes += it->sizeHere;
            slice.bsonBytes += static_cast<long long>(it->ratioHere * obj.objsize());
            if (hasCharacteristicField) {
                slice.characteristicCount += it->ratioHere;
                slice.characteristicSum += it->ratioHere * characteristicFieldValue;
            }
            if (isLocatedBeforePrevious) {
                slice.outOfOrderRecs += it->ratioHere;
            }
        }

        if (recordsArrayBuilder != NULL && spansRequestedArea) {
            DEV {
                int startsAt = dl.getOfs() - extentOfs;
                int endsAt = startsAt + recBytes;
                verify((startsAt < params.startOfs && endsAt > params.startOfs) ||
                       (startsAt < params.endOfs && endsAt >= params.endOfs) ||
                       (startsAt >= params.startOfs && endsAt < params.endOfs));
            }
            BSONObjBuilder recordBuilder(recordsArrayBuilder->subobjStart());
            recordBuilder.append("ofs", dl.getOfs() - extentOfs);
            recordBuilder.append("recBytes", recBytes);
            recordBuilder.append("bsonBytes", obj.objsize());
            recordBuilder.append("_id", obj["_id"]);
            if (hasCharacteristicField) {
                recordBuilder.append("characteristic", characteristicFieldValue);
            }
            recordBuilder.doneFast();
        }
    }
Example #10
    /* combine adjacent deleted records *for the current extent* of the capped collection

       this is O(n^2) but we call it for capped tables where typically n==1 or 2!
       (or 3...there will be a little unused sliver at the end of the extent.)
    */
    void NamespaceDetails::compact() {
        DDD( "NamespaceDetails::compact enter" );
        verify( isCapped() );
        
        vector<DiskLoc> drecs;
        
        // Pull out capExtent's DRs from deletedList
        DiskLoc i = cappedFirstDeletedInCurExtent();
        for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted() ) {
            DDD( "\t" << i );
            drecs.push_back( i );
        }

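        // advance the list head past the extracted DRs; they are re-added merged below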
        getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;
        
        std::sort( drecs.begin(), drecs.end() );
        DDD( "\t drecs.size(): " << drecs.size() );

        vector<DiskLoc>::const_iterator j = drecs.begin();
        verify( j != drecs.end() );
        DiskLoc a = *j;
        while ( 1 ) {
            j++;
            if ( j == drecs.end() ) {
                DDD( "\t compact adddelrec" );
                addDeletedRec(a.drec(), a);
                break;
            }
            DiskLoc b = *j;
            while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders() == b.getOfs() ) {
                // a & b are adjacent.  merge.
                getDur().writingInt( a.drec()->lengthWithHeaders() ) += b.drec()->lengthWithHeaders();
                j++;
                if ( j == drecs.end() ) {
                    DDD( "\t compact adddelrec2" );
                    addDeletedRec(a.drec(), a);
                    return;
                }
                b = *j;
            }
            DDD( "\t compact adddelrec3" );
            addDeletedRec(a.drec(), a);
            a = b;
        }

    }
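The adjacency merge above can be exercised in isolation. A minimal sketch, with DRec as a hypothetical in-memory stand-in for the on-disk DeletedRecord:

    #include <algorithm>
    #include <vector>

    struct DRec { int file; int ofs; int len; };  // hypothetical stand-in

    bool byDiskOrder( const DRec& a, const DRec& b ) {
        return a.file != b.file ? a.file < b.file : a.ofs < b.ofs;
    }

    // Mirrors the inner loop of compact(): after sorting by disk order, record b is
    // merged into record a when both are in the same file and a ends where b begins.
    std::vector<DRec> coalesce( std::vector<DRec> drecs ) {
        std::sort( drecs.begin(), drecs.end(), byDiskOrder );
        std::vector<DRec> merged;
        for ( size_t i = 0; i < drecs.size(); i++ ) {
            if ( !merged.empty() &&
                 merged.back().file == drecs[i].file &&
                 merged.back().ofs + merged.back().len == drecs[i].ofs ) {
                merged.back().len += drecs[i].len;  // adjacent: merge
            }
            else {
                merged.push_back( drecs[i] );
            }
        }
        return merged;
    }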
Example #11
    /* combine adjacent deleted records *for the current extent* of the capped collection
     
       this is O(n^2) but we call it for capped tables where typically n==1 or 2!
       (or 3...there will be a little unused sliver at the end of the extent.)
    */
    void NamespaceDetails::compact() {
        assert(capped);

        list<DiskLoc> drecs;

        // Pull out capExtent's DRs from deletedList
        DiskLoc i = cappedFirstDeletedInCurExtent();
        for (; !i.isNull() && inCapExtent( i ); i = i.drec()->nextDeleted )
            drecs.push_back( i );

        getDur().writingDiskLoc( cappedFirstDeletedInCurExtent() ) = i;

        // This is the O(n^2) part.
        drecs.sort();

        list<DiskLoc>::iterator j = drecs.begin();
        assert( j != drecs.end() );
        DiskLoc a = *j;
        while ( 1 ) {
            j++;
            if ( j == drecs.end() ) {
                DEBUGGING out() << "TEMP: compact adddelrec\n";
                addDeletedRec(a.drec(), a);
                break;
            }
            DiskLoc b = *j;
            while ( a.a() == b.a() && a.getOfs() + a.drec()->lengthWithHeaders == b.getOfs() ) {
                // a & b are adjacent.  merge.
                getDur().writingInt( a.drec()->lengthWithHeaders ) += b.drec()->lengthWithHeaders;
                j++;
                if ( j == drecs.end() ) {
                    DEBUGGING out() << "temp: compact adddelrec2\n";
                    addDeletedRec(a.drec(), a);
                    return;
                }
                b = *j;
            }
            DEBUGGING out() << "temp: compact adddelrec3\n";
            addDeletedRec(a.drec(), a);
            a = b;
        }
    }
Example #12
 void DiskLoc56Bit::operator=(const DiskLoc& loc) {
     ofs = loc.getOfs();
     int la = loc.a();
     invariant( la <= 0xffffff ); // must fit in 3 bytes
     if( la < 0 ) {
         if ( la != -1 ) {
             log() << "btree diskloc isn't negative 1: " << la << std::endl;
             invariant ( la == -1 );
         }
         la = 0;
         ofs = OurNullOfs;
     }
     memcpy(_a, &la, 3); // endian: on a little-endian host this stores the low 3 bytes of la
 }
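The final memcpy keeps only the low three bytes of the file number and, as the "// endian" comment hints, assumes a little-endian host. A standalone sketch of that packing; PackedLoc is hypothetical, not the real DiskLoc56Bit layout:

    #include <cassert>
    #include <cstring>

    struct PackedLoc {                  // hypothetical 7-byte location
        int ofs;                        // 4-byte offset within the file
        unsigned char a[3];             // 3-byte file number

        void set( int fileNo, int offset ) {
            assert( fileNo >= 0 && fileNo <= 0xffffff );  // must fit in 3 bytes
            ofs = offset;
            memcpy( a, &fileNo, 3 );    // little-endian: low 3 bytes carry the value
        }
        int fileNo() const {
            int la = 0;
            memcpy( &la, a, 3 );        // high byte stays zero
            return la;
        }
    };

    int main() {
        PackedLoc p;
        p.set( 0x123456, 8192 );
        assert( p.fileNo() == 0x123456 && p.ofs == 8192 );
        return 0;
    }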
Example #13
 /** add a record to the end of the linked list chain within this extent. 
     require: you must have already declared write intent for the record header.        
 */
 void addRecordToRecListInExtent(Record *r, DiskLoc loc) {
     dassert( loc.rec() == r );
     Extent *e = r->myExtent(loc);
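     // empty extent: record becomes first and last; otherwise append after lastRecord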
     if ( e->lastRecord.isNull() ) {
         Extent::FL *fl = getDur().writing(e->fl());
         fl->firstRecord = fl->lastRecord = loc;
         r->prevOfs() = r->nextOfs() = DiskLoc::NullOfs;
     }
     else {
         Record *oldlast = e->lastRecord.rec();
         r->prevOfs() = e->lastRecord.getOfs();
         r->nextOfs() = DiskLoc::NullOfs;
         getDur().writingInt(oldlast->nextOfs()) = loc.getOfs();
         getDur().writingDiskLoc(e->lastRecord) = loc;
     }
 }
Example #14
    /* special version of insert for transaction logging -- streamlined a bit.
       assumes ns is capped and no indexes
    */
    Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) {
        verify( d );
        RARELY verify( d == nsdetails(ns) );
        DEV verify( d == nsdetails(ns) );

        massert( 16509,
                 str::stream()
                 << "fast_oplog_insert requires a capped collection "
                 << " but " << ns << " is not capped",
                 d->isCapped() );

        //record timing on oplog inserts
        boost::optional<TimerHolder> insertTimer;
        //skip non-oplog collections
        if (NamespaceString::oplog(ns)) {
            insertTimer = boost::in_place(&oplogInsertStats);
            oplogInsertBytesStats.increment(len); //record len of inserted records for oplog
        }

        int lenWHdr = len + Record::HeaderSize;
        DiskLoc loc = d->alloc(ns, lenWHdr);
        verify( !loc.isNull() );

        Record *r = loc.rec();
        verify( r->lengthWithHeaders() >= lenWHdr );

        Extent *e = r->myExtent(loc);
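        // link the record into the extent's record list, as in addRecordToRecListInExtent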
        if ( e->lastRecord.isNull() ) {
            Extent::FL *fl = getDur().writing( e->fl() );
            fl->firstRecord = fl->lastRecord = loc;

            Record::NP *np = getDur().writing(r->np());
            np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
        }
        else {
            Record *oldlast = e->lastRecord.rec();
            Record::NP *np = getDur().writing(r->np());
            np->prevOfs = e->lastRecord.getOfs();
            np->nextOfs = DiskLoc::NullOfs;
            getDur().writingInt( oldlast->nextOfs() ) = loc.getOfs();
            e->lastRecord.writing() = loc;
        }

        d->incrementStats( r->netLength(), 1 );
        return r;
    }
Example #15
 void RecordStoreV1Base::_addRecordToRecListInExtent(OperationContext* txn,
                                                     Record *r,
                                                     DiskLoc loc) {
     dassert( recordFor(loc) == r );
     Extent *e = _getExtent( _getExtentLocForRecord( loc ) );
     if ( e->lastRecord.isNull() ) {
         *txn->recoveryUnit()->writing(&e->firstRecord) = loc;
         *txn->recoveryUnit()->writing(&e->lastRecord) = loc;
         r->prevOfs() = r->nextOfs() = DiskLoc::NullOfs;
     }
     else {
         Record *oldlast = recordFor(e->lastRecord);
         r->prevOfs() = e->lastRecord.getOfs();
         r->nextOfs() = DiskLoc::NullOfs;
         txn->recoveryUnit()->writingInt(oldlast->nextOfs()) = loc.getOfs();
         *txn->recoveryUnit()->writing(&e->lastRecord) = loc;
     }
  }
Example #16
    void assertStateV1RS(const LocAndSize* records,
                         const LocAndSize* drecs,
                         const ExtentManager* em,
                         const DummyRecordStoreV1MetaData* md) {
        invariant(records || drecs); // if both are NULL nothing is being asserted...

        try {
            if (records) {
                long long dataSize = 0;
                long long numRecs = 0;

                int recIdx = 0;

                DiskLoc extLoc = md->firstExtent();
                while (!extLoc.isNull()) { // for each Extent
                    Extent* ext = em->getExtent(extLoc, true);
                    int expectedPrevOfs = DiskLoc::NullOfs;
                    DiskLoc actualLoc = ext->firstRecord;
                    while (!actualLoc.isNull()) { // for each Record in this Extent
                        const Record* actualRec = em->recordForV1(actualLoc);
                        const int actualSize = actualRec->lengthWithHeaders();

                        dataSize += actualSize - Record::HeaderSize;
                        numRecs += 1;

                        ASSERT_EQUALS(actualLoc, records[recIdx].loc);
                        ASSERT_EQUALS(actualSize, records[recIdx].size);

                        ASSERT_EQUALS(actualRec->extentOfs(), extLoc.getOfs());
                        ASSERT_EQUALS(actualRec->prevOfs(), expectedPrevOfs);
                        expectedPrevOfs = actualLoc.getOfs();

                        recIdx++;
                        const int nextOfs = actualRec->nextOfs();
                        actualLoc = (nextOfs == DiskLoc::NullOfs ? DiskLoc()
                                                                 : DiskLoc(actualLoc.a(), nextOfs));
                    }

                    if (ext->xnext.isNull()) {
                        ASSERT_EQUALS(md->lastExtent(), extLoc);
                    }

                    extLoc = ext->xnext;
                }

                // both the expected and actual record lists must be done at this point
                ASSERT_EQUALS(records[recIdx].loc, DiskLoc());

                ASSERT_EQUALS(dataSize, md->dataSize());
                ASSERT_EQUALS(numRecs, md->numRecords());
            }

            if (drecs) {
                int drecIdx = 0;
                for (int bucketIdx = 0; bucketIdx < RecordStoreV1Base::Buckets; bucketIdx++) {
                    DiskLoc actualLoc = md->deletedListEntry(bucketIdx);

                    if (md->isCapped() && bucketIdx == 1) {
                        // In capped collections, the 2nd bucket (index 1) points to the drec before
                        // the first drec in the capExtent. If the capExtent is the first Extent,
                        // it should be Null.

                        if (md->capExtent() == md->firstExtent()) {
                            ASSERT_EQUALS(actualLoc, DiskLoc());
                        }
                        else {
                            ASSERT_NOT_EQUALS(actualLoc.a(), md->capExtent().a());
                            const DeletedRecord* actualDrec =
                                &em->recordForV1(actualLoc)->asDeleted();
                            ASSERT_EQUALS(actualDrec->nextDeleted().a(), md->capExtent().a());
                        }

                        // Don't do the normal checking of bucket 1 in capped collections;
                        // the remaining buckets are still checked to verify they are Null.
                        continue;
                    }

                    while (!actualLoc.isNull()) {
                        const DeletedRecord* actualDrec = &em->recordForV1(actualLoc)->asDeleted();
                        const int actualSize = actualDrec->lengthWithHeaders();

                        ASSERT_EQUALS(actualLoc, drecs[drecIdx].loc);
                        ASSERT_EQUALS(actualSize, drecs[drecIdx].size);

                        // Make sure the drec is correct
                        ASSERT_EQUALS(actualDrec->extentOfs(), 0);

                        // in capped collections all drecs are linked into a single list in bucket 0
                        ASSERT_EQUALS(bucketIdx, md->isCapped()
                                                   ? 0
                                                   : RecordStoreV1Base::bucket(actualSize));

                        drecIdx++;
                        actualLoc = actualDrec->nextDeleted();
                    }
                }
                // both the expected and actual deleted lists must be done at this point
                ASSERT_EQUALS(drecs[drecIdx].loc, DiskLoc());
            }
        }
        catch (...) {
            // If a test fails, provide extra info to make debugging easier
            printRecList(em, md);
            printDRecList(em, md);
            throw;
        }
    }
Example #17
    void initializeV1RS(OperationContext* txn,
                        const LocAndSize* records,
                        const LocAndSize* drecs,
                        DummyExtentManager* em,
                        DummyRecordStoreV1MetaData* md) {
        invariant(records || drecs); // if both are NULL nothing is being created...
        
        // Need to start with a blank slate
        invariant(em->numFiles() == 0);
        invariant(md->firstExtent().isNull());

        // pre-allocate extents (even extents that aren't part of this RS)
        {
            typedef std::map<int, size_t> ExtentSizes;
            ExtentSizes extentSizes;
            accumulateExtentSizeRequirements(records, &extentSizes);
            accumulateExtentSizeRequirements(drecs, &extentSizes);
            invariant(!extentSizes.empty());

            const int maxExtent = extentSizes.rbegin()->first;
            for (int i = 0; i <= maxExtent; i++) {
                const size_t size = extentSizes.count(i) ? extentSizes[i] : 0;
                const DiskLoc loc = em->allocateExtent(txn, md->isCapped(), size, 0);

                // This function and assertState depend on these details of DummyExtentManager
                invariant(loc.a() == i);
                invariant(loc.getOfs() == 0);
            }

            // link together extents that should be part of this RS
            md->setFirstExtent(txn, DiskLoc(extentSizes.begin()->first, 0));
            md->setLastExtent(txn, DiskLoc(extentSizes.rbegin()->first, 0));
            for (ExtentSizes::iterator it = extentSizes.begin();
                    boost::next(it) != extentSizes.end(); /* ++it */ ) {
                const int a = it->first;
                ++it;
                const int b = it->first;
                em->getExtent(DiskLoc(a, 0))->xnext = DiskLoc(b, 0);
                em->getExtent(DiskLoc(b, 0))->xprev = DiskLoc(a, 0);
            }

            // This signals "done allocating new extents".
            if (md->isCapped())
                md->setDeletedListEntry(txn, 1, DiskLoc());
        }

        if (records && !records[0].loc.isNull()) {
            int recIdx = 0;
            DiskLoc extLoc = md->firstExtent();
            while (!extLoc.isNull()) {
                Extent* ext = em->getExtent(extLoc);
                int prevOfs = DiskLoc::NullOfs;
                while (extLoc.a() == records[recIdx].loc.a()) { // for all records in this extent
                    const DiskLoc loc = records[recIdx].loc;
                    const int size = records[recIdx].size;
                    invariant(size >= Record::HeaderSize);

                    md->incrementStats(txn, size - Record::HeaderSize, 1);

                    if (ext->firstRecord.isNull())
                        ext->firstRecord = loc;

                    Record* rec = em->recordForV1(loc);
                    rec->lengthWithHeaders() = size;
                    rec->extentOfs() = 0;

                    rec->prevOfs() = prevOfs;
                    prevOfs = loc.getOfs();

                    const DiskLoc nextLoc = records[recIdx + 1].loc;
                    if (nextLoc.a() == loc.a()) { // if next is in same extent
                        rec->nextOfs() = nextLoc.getOfs();
                    }
                    else {
                        rec->nextOfs() = DiskLoc::NullOfs;
                        ext->lastRecord = loc;
                    }

                    recIdx++;
                }
                extLoc = ext->xnext;
            }
            invariant(records[recIdx].loc.isNull());
        }

        if (drecs && !drecs[0].loc.isNull()) {
            int drecIdx = 0;
            DiskLoc* prevNextPtr = NULL;
            int lastBucket = -1;
            while (!drecs[drecIdx].loc.isNull()) {
                const DiskLoc loc = drecs[drecIdx].loc;
                const int size = drecs[drecIdx].size;
                invariant(size >= Record::HeaderSize);
                const int bucket = RecordStoreV1Base::bucket(size);

                if (md->isCapped()) {
                    // All drecs form a single list in bucket 0
                    if (prevNextPtr == NULL) {
                        md->setDeletedListEntry(txn, 0, loc);
                    }
                    else {
                        *prevNextPtr = loc;
                    }

                    if (loc.a() < md->capExtent().a()
                            && drecs[drecIdx + 1].loc.a() == md->capExtent().a()) {
                        // Bucket 1 is known as cappedLastDelRecLastExtent
                        md->setDeletedListEntry(txn, 1, loc);
                    }
                } 
                else if (bucket != lastBucket) {
                    invariant(bucket > lastBucket); // if this fails, drecs weren't sorted by bucket
                    md->setDeletedListEntry(txn, bucket, loc);
                    lastBucket = bucket;
                }
                else {
                    *prevNextPtr = loc;
                }

                DeletedRecord* drec = &em->recordForV1(loc)->asDeleted();
                drec->lengthWithHeaders() = size;
                drec->extentOfs() = 0;
                drec->nextDeleted() = DiskLoc();
                prevNextPtr = &drec->nextDeleted();

                drecIdx++;
            }
        }

        // Make sure we set everything up as requested.
        assertStateV1RS(records, drecs, em, md);
    }
Example #18
DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents(OperationContext* txn, int lenToAllocRaw) {
    // Slowly drain the deletedListLegacyGrabBag by popping one record off and putting it in the
    // correct deleted list each time we try to allocate a new record. This ensures we won't
    // orphan any data when upgrading from old versions, without needing a long upgrade phase.
    // This is done before we try to allocate the new record so we can take advantage of the new
    // space immediately.
    {
        const DiskLoc head = _details->deletedListLegacyGrabBag();
        if (!head.isNull()) {
            _details->setDeletedListLegacyGrabBag(txn, drec(head)->nextDeleted());
            addDeletedRec(txn, head);
        }
    }

    // align size up to a multiple of 4
    const int lenToAlloc = (lenToAllocRaw + (4 - 1)) & ~(4 - 1);

    freelistAllocs.increment();
    DiskLoc loc;
    DeletedRecord* dr = NULL;
    {
        int myBucket;
        for (myBucket = bucket(lenToAlloc); myBucket < Buckets; myBucket++) {
            // Only look at the first entry in each bucket. This works because we are either
            // quantizing or allocating fixed-size blocks.
            const DiskLoc head = _details->deletedListEntry(myBucket);
            if (head.isNull())
                continue;
            DeletedRecord* const candidate = drec(head);
            if (candidate->lengthWithHeaders() >= lenToAlloc) {
                loc = head;
                dr = candidate;
                break;
            }
        }

        if (!dr)
            return DiskLoc();  // no space

        // Unlink ourself from the deleted list
        _details->setDeletedListEntry(txn, myBucket, dr->nextDeleted());
        *txn->recoveryUnit()->writing(&dr->nextDeleted()) = DiskLoc().setInvalid();  // defensive
    }

    invariant(dr->extentOfs() < loc.getOfs());

    // Split the deleted record if it has at least as much left over space as our smallest
    // allocation size. Otherwise, just take the whole DeletedRecord.
    const int remainingLength = dr->lengthWithHeaders() - lenToAlloc;
    if (remainingLength >= bucketSizes[0]) {
        txn->recoveryUnit()->writingInt(dr->lengthWithHeaders()) = lenToAlloc;
        const DiskLoc newDelLoc = DiskLoc(loc.a(), loc.getOfs() + lenToAlloc);
        DeletedRecord* newDel = txn->recoveryUnit()->writing(drec(newDelLoc));
        newDel->extentOfs() = dr->extentOfs();
        newDel->lengthWithHeaders() = remainingLength;
        newDel->nextDeleted().Null();

        addDeletedRec(txn, newDelLoc);
    }

    return loc;
}
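Checking only the head of each bucket works because allocation sizes are quantized into fixed size classes, so every entry in a bucket can satisfy the same requests. A toy sketch of the scan; the size-class table and deque-based free lists are illustrative, not the engine's actual bucketSizes:

    #include <deque>

    static const int kBucketSizes[] = { 32, 64, 128, 256, 512, 1024 };  // illustrative
    static const int kBuckets = sizeof(kBucketSizes) / sizeof(kBucketSizes[0]);

    // Smallest size class that can hold len bytes; kBuckets means "too big for any".
    int bucketFor( int len ) {
        for ( int b = 0; b < kBuckets; b++ )
            if ( kBucketSizes[b] >= len )
                return b;
        return kBuckets;
    }

    // Mirrors the loop above: walk buckets upward and pop the head of the first
    // non-empty list; returning false means the caller must allocate a new extent.
    bool allocFromBuckets( std::deque<int> freeLists[], int len, int* outOfs ) {
        for ( int b = bucketFor( len ); b < kBuckets; b++ ) {
            if ( freeLists[b].empty() )
                continue;
            *outOfs = freeLists[b].front();
            freeLists[b].pop_front();
            return true;
        }
        return false;  // no space
    }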
Example #19
    DiskLoc SimpleRecordStoreV1::_allocFromExistingExtents( OperationContext* txn,
                                                            int lenToAlloc ) {
        // align size up to a multiple of 4
        lenToAlloc = (lenToAlloc + (4-1)) & ~(4-1);

        freelistAllocs.increment();
        DiskLoc loc;
        {
            DiskLoc *prev = 0;
            DiskLoc *bestprev = 0;
            DiskLoc bestmatch;
            int bestmatchlen = INT_MAX; // sentinel meaning we haven't found a record big enough
            int b = bucket(lenToAlloc);
            DiskLoc cur = _details->deletedListEntry(b);
            
            int extra = 5; // look for a better fit, a little.
            int chain = 0;
            while ( 1 ) {
                { // defensive check
                    int fileNumber = cur.a();
                    int fileOffset = cur.getOfs();
                    if (fileNumber < -1 || fileNumber >= 100000 || fileOffset < 0) {
                        StringBuilder sb;
                        sb << "Deleted record list corrupted in collection " << _ns
                           << ", bucket " << b
                           << ", link number " << chain
                           << ", invalid link is " << cur.toString()
                           << ", throwing Fatal Assertion";
                        log() << sb.str() << endl;
                        fassertFailed(16469);
                    }
                }
                if ( cur.isNull() ) {
                    // move to next bucket.  if we were doing "extra", just break
                    if ( bestmatchlen < INT_MAX )
                        break;

                    if ( chain > 0 ) {
                        // if we looked at things in the right bucket, but they were not suitable
                        freelistBucketExhausted.increment();
                    }

                    b++;
                    if ( b > MaxBucket ) {
                        // out of space. alloc a new extent.
                        freelistIterations.increment( 1 + chain );
                        return DiskLoc();
                    }
                    cur = _details->deletedListEntry(b);
                    prev = 0;
                    continue;
                }
                DeletedRecord *r = drec(cur);
                if ( r->lengthWithHeaders() >= lenToAlloc &&
                     r->lengthWithHeaders() < bestmatchlen ) {
                    bestmatchlen = r->lengthWithHeaders();
                    bestmatch = cur;
                    bestprev = prev;
                    if (r->lengthWithHeaders() == lenToAlloc)
                        // exact match, stop searching
                        break;
                }
                if ( bestmatchlen < INT_MAX && --extra <= 0 )
                    break;
                if ( ++chain > 30 && b <= MaxBucket ) {
                    // too slow, force move to next bucket to grab a big chunk
                    //b++;
                    freelistIterations.increment( chain );
                    chain = 0;
                    cur.Null();
                }
                else {
                    cur = r->nextDeleted();
                    prev = &r->nextDeleted();
                }
            }

            // unlink ourself from the deleted list
            DeletedRecord *bmr = drec(bestmatch);
            if ( bestprev ) {
                *txn->recoveryUnit()->writing(bestprev) = bmr->nextDeleted();
            }
            else {
                // should be the front of a free-list
                int myBucket = bucket(bmr->lengthWithHeaders());
                invariant( _details->deletedListEntry(myBucket) == bestmatch );
                _details->setDeletedListEntry(txn, myBucket, bmr->nextDeleted());
            }
            *txn->recoveryUnit()->writing(&bmr->nextDeleted()) = DiskLoc().setInvalid(); // defensive.
            invariant(bmr->extentOfs() < bestmatch.getOfs());

            freelistIterations.increment( 1 + chain );
            loc = bestmatch;
        }

        if ( loc.isNull() )
            return loc;

        // determine if we should chop up

        DeletedRecord *r = drec(loc);

        /* note we want to grab from the front so our next pointers on disk tend
        to go in a forward direction which is important for performance. */
        int regionlen = r->lengthWithHeaders();
        invariant( r->extentOfs() < loc.getOfs() );

        int left = regionlen - lenToAlloc;
        if ( left < 24 || left < (lenToAlloc / 8) ) {
            // you get the whole thing.
            return loc;
        }

        // don't quantize:
        //   - $ collections (indexes) as we already have those aligned the way we want SERVER-8425
        if ( _normalCollection ) {
            // we quantize here so that it only impacts newly sized records
            // this prevents oddities with older records and space re-use SERVER-8435
            lenToAlloc = std::min( r->lengthWithHeaders(),
                                   quantizeAllocationSpace( lenToAlloc ) );
            left = regionlen - lenToAlloc;

            if ( left < 24 ) {
                // you get the whole thing.
                return loc;
            }
        }

        /* split off some for further use. */
        txn->recoveryUnit()->writingInt(r->lengthWithHeaders()) = lenToAlloc;
        DiskLoc newDelLoc = loc;
        newDelLoc.inc(lenToAlloc);
        DeletedRecord* newDel = drec(newDelLoc);
        DeletedRecord* newDelW = txn->recoveryUnit()->writing(newDel);
        newDelW->extentOfs() = r->extentOfs();
        newDelW->lengthWithHeaders() = left;
        newDelW->nextDeleted().Null();

        addDeletedRec( txn, newDelLoc );
        return loc;
    }
Example #20
    DiskLoc _repairExtent( Database* db , string ns, bool forward , DiskLoc eLoc , Writer& w ){
        LogIndentLevel lil;
        
        if ( eLoc.getOfs() <= 0 ){
            toolError() << "invalid extent ofs: " << eLoc.getOfs() << std::endl;
            return DiskLoc();
        }

        const ExtentManager& extentManager = db->getExtentManager();

        Extent* e = extentManager.getExtent( eLoc, false );
        if ( ! e->isOk() ){
            toolError() << "Extent not ok magic: " << e->magic << " going to try to continue"
                      << std::endl;
        }

        toolInfoLog() << "length:" << e->length << std::endl;
        
        LogIndentLevel lil2;
        
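        // records already visited; used to detect a corrupted, cyclic record chain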
        set<DiskLoc> seen;

        DiskLoc loc = forward ? e->firstRecord : e->lastRecord;
        while ( ! loc.isNull() ){
            
            if ( ! seen.insert( loc ).second ) {
                toolError() << "infinite loop in extent, seen: " << loc << " before" << std::endl;
                break;
            }

            if ( loc.getOfs() <= 0 ){
                toolError() << "offset is 0 for record which should be impossible" << std::endl;
                break;
            }
            if (logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1))) {
                toolInfoLog() << loc << std::endl;
            }
            BSONObj obj;
            try {
                obj = loc.obj();
                verify( obj.valid() );
                if (logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1))) {
                    toolInfoLog() << obj << std::endl;
                }
                w( obj );
            }
            catch ( std::exception& e ) {
                toolError() << "found invalid document @ " << loc << " " << e.what() << std::endl;
                if ( ! obj.isEmpty() ) {
                    try {
                        BSONElement e = obj.firstElement();
                        stringstream ss;
                        ss << "first element: " << e;
                        toolError() << ss.str() << std::endl;
                    }
                    catch ( std::exception& ) {
                        toolError() << "unable to log invalid document @ " << loc << std::endl;
                    }
                }
            }
            loc = forward ?
                extentManager.getNextRecordInExtent( loc )
                : extentManager.getPrevRecordInExtent( loc );

            // break when new loc is outside current extent boundary
            if ( loc.isNull() ) {
                break;
            }
        }
        toolInfoLog() << "wrote " << seen.size() << " documents" << std::endl;
        return forward ? e->xnext : e->xprev;
    }
Example #21
    DiskLoc _repairExtent( Database* db , string ns, bool forward , DiskLoc eLoc , Writer& w ){
        LogIndentLevel lil;
        
        if ( eLoc.getOfs() <= 0 ){
            error() << "invalid extent ofs: " << eLoc.getOfs() << endl;
            return DiskLoc();
        }
        

        MongoDataFile * mdf = db->getFile( eLoc.a() );

        Extent * e = mdf->debug_getExtent( eLoc );
        if ( ! e->isOk() ){
            warning() << "Extent not ok magic: " << e->magic << " going to try to continue" << endl;
        }
        
        log() << "length:" << e->length << endl;
        
        LogIndentLevel lil2;
        
        set<DiskLoc> seen;

        DiskLoc loc = forward ? e->firstRecord : e->lastRecord;
        while ( ! loc.isNull() ){
            
            if ( ! seen.insert( loc ).second ) {
                error() << "infinite loop in extend, seen: " << loc << " before" << endl;
                break;
            }

            if ( loc.getOfs() <= 0 ){
                error() << "offset is 0 for record which should be impossible" << endl;
                break;
            }
            log(1) << loc << endl;
            Record* rec = loc.rec();
            BSONObj obj;
            try {
                obj = loc.obj();
                assert( obj.valid() );
                LOG(1) << obj << endl;
                w( obj );
            }
            catch ( std::exception& e ) {
                log() << "found invalid document @ " << loc << " " << e.what() << endl;
                if ( ! obj.isEmpty() ) {
                    try {
                        BSONElement e = obj.firstElement();
                        stringstream ss;
                        ss << "first element: " << e;
                        log() << ss.str() << endl;
                    }
                    catch ( std::exception& ) {
                    }
                }
            }
            loc = forward ? rec->getNext( loc ) : rec->getPrev( loc );
        }
        return forward ? e->xnext : e->xprev;
        
    }
Example #22
    /** @return number of skipped (invalid) documents */
    unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
                const scoped_array<IndexSpec> &indexSpecs,
                scoped_array<SortPhaseOne>& phase1, int nidx, bool validate, 
                double pf, int pb)
    {
        log() << "compact begin extent #" << n << " for namespace " << ns << endl;
        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = diskloc.ext();
        e->assertOk();
        verify( e->validates() );
        unsigned skipped = 0;

        {
            // the next/prev pointers within the extent might not be in order, so we first page in
            // the whole extent sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            MongoDataFile* mdf = cc().database()->getFile( diskloc.a() );
            HANDLE fd = mdf->getFd();
            int offset = diskloc.getOfs();
            Extent* ext = diskloc.ext();
            size_t length = ext->length;
                
            touch_pages(fd, offset, length, ext);
            int ms = t.millis();
            if( ms > 1000 ) 
                log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl;
        }

        {
            log() << "compact copying records" << endl;
            long long datasize = 0;
            long long nrecords = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    L = recOld->nextInExtent(L);
                    BSONObj objOld = BSONObj::make(recOld);

                    if( !validate || objOld.valid() ) {
                        nrecords++;
                        unsigned sz = objOld.objsize();

                        oldObjSize += sz;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = sz + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;
                        {
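                            // scale by padding factor pf, add pb padding bytes, then quantize;
                            // fall back to the unpadded length if the result is out of range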
                            lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                            lenWPadding += pb;
                            lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                            if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { 
                                lenWPadding = lenWHdr;
                            }
                        }
                        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                        uassert(14024, "compact error out of space during compaction", !loc.isNull());
                        Record *recNew = loc.rec();
                        datasize += recNew->netLength();
                        recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                        addRecordToRecListInExtent(recNew, loc);
                        memcpy(recNew->data(), objOld.objdata(), sz);

                        {
                            // extract keys for all indexes we will be rebuilding
                            for( int x = 0; x < nidx; x++ ) { 
                                phase1[x].addKeys(indexSpecs[x], objOld, loc);
                            }
                        }
                    }
                    else { 
                        if( ++skipped <= 10 )
                            log() << "compact skipping invalid object" << endl;
                    }

                    if( L.isNull() ) { 
                        // we just did the very last record from the old extent.  it's still pointed to 
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt(false);
                    }
                }
            } // if !L.isNull()

            verify( d->firstExtent == diskloc );
            verify( d->lastExtent != diskloc );
            DiskLoc newFirst = e->xnext;
            d->firstExtent.writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            freeExtents( diskloc, diskloc );
            // update datasize/record count for this namespace's extent
            {
                NamespaceDetails::Stats *s = getDur().writing(&d->stats);
                s->datasize += datasize;
                s->nrecords += nrecords;
            }

            getDur().commitIfNeeded();

            { 
                double op = 1.0;
                if( oldObjSize ) 
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                log() << "compact finished extent #" << n << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)"
                    << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
                    << endl;                    
            }
        }

        return skipped;
    }
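The padding arithmetic inside the copy loop can be replayed standalone. A sketch, with quantizeMaskSketch as a hypothetical stand-in for the engine's quantizeMask():

    // Hypothetical quantization: round down to a 16-byte granularity (the real
    // quantizeMask() is not reproduced here).
    unsigned quantizeMaskSketch( unsigned len ) {
        (void)len;
        return ~15u;
    }

    // Mirrors the inner block of compactExtent(): scale the header-inclusive length
    // by padding factor pf, add pb padding bytes, quantize, and fall back to the
    // unpadded length when the result is out of range.
    unsigned paddedLen( unsigned lenWHdr, double pf, int pb, unsigned maxUserSize ) {
        unsigned lenWPadding = static_cast<unsigned>( pf * lenWHdr );
        lenWPadding += pb;
        lenWPadding = lenWPadding & quantizeMaskSketch( lenWPadding );
        if ( lenWPadding < lenWHdr || lenWPadding > maxUserSize / 2 )
            lenWPadding = lenWHdr;
        return lenWPadding;
    }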