Example #1
 DiskLoc FindingStartCursor::prevExtentFirstLoc( const DiskLoc& rec ) const {
     Extent *e = rec.rec()->myExtent( rec );
     if ( _qp.nsd()->capLooped() ) {
         while( true ) {
             // Advance e to preceding extent (looping to lastExtent if necessary).
             if ( e->xprev.isNull() ) {
                 e = _qp.nsd()->lastExtent.ext();
             }
             else {
                 e = e->xprev.ext();
             }
             if ( e->myLoc == _qp.nsd()->capExtent ) {
                 // Reached the extent containing the oldest data in the collection.
                 return DiskLoc();
             }
             if ( !e->firstRecord.isNull() ) {
                 // Return the first record of the first non-empty extent encountered.
                 return e->firstRecord;
             }
         }
     }
     else {
         while( true ) {
             if ( e->xprev.isNull() ) {
                 // Reached the beginning of the collection.
                 return DiskLoc();
             }
             e = e->xprev.ext();
             if ( !e->firstRecord.isNull() ) {
                 // Return the first record of the first non-empty extent encountered.
                 return e->firstRecord;
             }
         }
     }
 }
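
Both branches above implement the same traversal: step backwards through the extent chain, wrapping to lastExtent when the collection is capped and looped, until a non-empty extent turns up. A minimal standalone sketch of that walk, using simplified stand-ins rather than MongoDB's real DiskLoc/Extent types:

    // Simplified stand-ins (assumptions, not MongoDB's actual structures).
    struct Record;
    struct Extent {
        Extent* xprev;        // previous extent, or nullptr at the head
        Record* firstRecord;  // nullptr if the extent is empty
    };

    // Walk backwards from 'e' to the first record of the first non-empty
    // preceding extent. 'last' and 'capExtent' model lastExtent/capExtent
    // for the capped case; pass capLooped=false for a plain collection.
    Record* prevExtentFirstRecord(Extent* e, Extent* last, Extent* capExtent,
                                  bool capLooped) {
        while (true) {
            if (capLooped) {
                e = e->xprev ? e->xprev : last;  // wrap around at the head
                if (e == capExtent)
                    return nullptr;  // reached the oldest data in the collection
            } else {
                if (!e->xprev)
                    return nullptr;  // reached the beginning of the collection
                e = e->xprev;
            }
            if (e->firstRecord)
                return e->firstRecord;  // first non-empty extent encountered
        }
    }
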
Example #2
    void TextStage::filterAndScore(BSONObj key, DiskLoc loc) {
        // Locate score within possibly compound key: {prefix,term,score,suffix}.
        BSONObjIterator keyIt(key);
        for (unsigned i = 0; i < _params.spec.numExtraBefore(); i++) {
            keyIt.next();
        }

        keyIt.next(); // Skip past 'term'.

        BSONElement scoreElement = keyIt.next();
        double documentTermScore = scoreElement.number();
        double& documentAggregateScore = _scores[loc];
        
        // Handle filtering.
        if (documentAggregateScore < 0) {
            // We have already rejected this document.
            return;
        }
        if (documentAggregateScore == 0 && _filter) {
            // We have not seen this document before and need to apply a filter.
            Record* rec_p = loc.rec();
            BSONObj doc = BSONObj::make(rec_p);

            // TODO: Covered index matching logic here.
            if (!_filter->matchesBSON(doc)) {
                documentAggregateScore = -1;
                return;
            }
        }

        // Aggregate relevance score, term keys.
        documentAggregateScore += documentTermScore;
    }
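
The scoring above relies on the map's default value: _scores[loc] starts at 0.0 for a document that has never been seen, a negative value marks a rejected document, and positive values accumulate per-term scores (term scores are positive, so a seen, accepted document never sits at exactly 0). A small self-contained sketch of that sentinel pattern, with DocId and the filter flag as placeholders for the real TextStage types:

    #include <map>

    using DocId = int;  // placeholder for DiskLoc

    std::map<DocId, double> scores;

    // Returns false if the document is (or has just been) rejected.
    bool accumulateScore(DocId doc, double termScore, bool passesFilter) {
        double& agg = scores[doc];  // value-initialized to 0.0 on first access
        if (agg < 0)
            return false;           // rejected on an earlier term
        if (agg == 0 && !passesFilter) {
            agg = -1;               // remember the rejection for later terms
            return false;
        }
        agg += termScore;           // aggregate relevance across terms
        return true;
    }
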
Example #3
    DiskLoc _repairExtent( Database* db , string ns, bool forward , DiskLoc eLoc ){
        LogIndentLevel lil;
        
        if ( eLoc.getOfs() <= 0 ){
            error() << "invalid extent ofs: " << eLoc.getOfs() << endl;
            return DiskLoc();
        }
        

        MongoDataFile * mdf = db->getFile( eLoc.a() );

        Extent * e = mdf->debug_getExtent( eLoc );
        if ( ! e->isOk() ){
            warning() << "extent not ok, magic: " << e->magic << "; will try to continue" << endl;
        }
        
        log() << "length:" << e->length << endl;
        
        LogIndentLevel lil2;
        
        DiskLoc loc = forward ? e->firstRecord : e->lastRecord;
        while ( ! loc.isNull() ){
            if ( loc.getOfs() <= 0 ){
                error() << "record offset is <= 0, which should be impossible" << endl;
                break;
            }
            log() << loc << endl;
            Record* rec = loc.rec();
            log() << loc.obj() << endl;
            loc = forward ? rec->getNext( loc ) : rec->getPrev( loc );
        }
        return forward ? e->xnext : e->xprev;
        
    }
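
The inner loop is a defensive walk: each record location is sanity-checked before it is dereferenced, so a corrupt chain ends the scan instead of crashing it. The same pattern over a hypothetical in-memory chain (the real chain is offset-based on disk):

    #include <cstdio>

    // Hypothetical record node standing in for on-disk records.
    struct Rec {
        int  ofs;   // offset; <= 0 is treated as corruption
        Rec* next;
        Rec* prev;
    };

    // Walk from 'start', bailing out at the first invalid offset.
    int walkChain(Rec* start, bool forward) {
        int seen = 0;
        for (Rec* r = start; r != nullptr; r = forward ? r->next : r->prev) {
            if (r->ofs <= 0) {
                std::fprintf(stderr, "invalid record offset %d, stopping\n", r->ofs);
                break;
            }
            ++seen;
        }
        return seen;
    }
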
Example #4
 // static
 DiskLoc OplogStart::prevExtentFirstLoc(const NamespaceDetails* nsd, const DiskLoc& rec ) {
     Extent *e = rec.rec()->myExtentLoc( rec ).ext();
     if (nsd->capLooped() ) {
         while( true ) {
             // Advance e to preceding extent (looping to lastExtent if necessary).
             if ( e->xprev.isNull() ) {
                 e = nsd->lastExtent().ext();
             }
             else {
                 e = e->xprev.ext();
             }
             if ( e->myLoc == nsd->capExtent() ) {
                 // Reached the extent containing the oldest data in the collection.
                 return DiskLoc();
             }
             if ( !e->firstRecord.isNull() ) {
                 // Return the first record of the first non-empty extent encountered.
                 return e->firstRecord;
             }
         }
     }
     else {
         while( true ) {
             if ( e->xprev.isNull() ) {
                 // Reached the beginning of the collection.
                 return DiskLoc();
             }
             e = e->xprev.ext();
             if ( !e->firstRecord.isNull() ) {
                 // Return the first record of the first non-empty extent encountered.
                 return e->firstRecord;
             }
         }
     }
 }
Example #5
    void DataFileMgr::deleteRecord(NamespaceDetails* d, const StringData& ns, Record *todelete,
                                   const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) {
        dassert( todelete == dl.rec() );

        if ( d->isCapped() && !cappedOK ) {
            out() << "failing remove on a capped ns " << ns << endl;
            uassert( 10089 ,  "can't remove from a capped collection" , 0 );
            return;
        }

        BSONObj obj = BSONObj::make( todelete );

        Collection* collection = cc().database()->getCollection( ns );
        verify( collection );

        BSONObj toDelete;
        collection->deleteDocument( dl, cappedOK, noWarn, doLog ? &toDelete : NULL );

        if ( ! toDelete.isEmpty() ) {
            // TODO: this is crazy, need to fix logOp
            const char* raw = ns.rawData();
            if ( strlen(raw) == ns.size() ) {
                logOp( "d", raw, toDelete );
            }
            else {
                string temp = ns.toString();
                logOp( "d", temp.c_str(), toDelete );
            }
        }
    }
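
The strlen(raw) == ns.size() check exists because a StringData need not be NUL-terminated while logOp wants a C string: the copy is made only when the raw pointer cannot be trusted. The same idea expressed with std::string_view (an assumption for illustration; the original predates it and, like the original, this relies on the underlying buffer being terminated somewhere):

    #include <cstdio>
    #include <cstring>
    #include <string>
    #include <string_view>

    void logOpName(const char* ns) { std::puts(ns); }  // stand-in for logOp

    void logWithPossiblyUnterminatedName(std::string_view ns) {
        const char* raw = ns.data();
        // If a NUL happens to sit right at the end of the view, 'raw' is
        // already usable as a C string.
        if (std::strlen(raw) == ns.size()) {
            logOpName(raw);           // fast path: no copy
        } else {
            std::string temp(ns);     // slow path: make a terminated copy
            logOpName(temp.c_str());
        }
    }
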
Example #6
    Record* ClientCursor::_recordForYield( ClientCursor::RecordNeeds need ) {
        
        if ( ! ok() )
            return 0;

        if ( need == DontNeed ) {
            return 0;
        }
        else if ( need == MaybeCovered ) {
            // TODO
            return 0;
        }
        else if ( need == WillNeed ) {
            // no-op
        }
        else {
            warning() << "don't understand RecordNeeds: " << (int)need << endl;
            return 0;
        }

        DiskLoc l = currLoc();
        if ( l.isNull() )
            return 0;
        
        Record * rec = l.rec();
        if ( rec->likelyInPhysicalMemory() ) 
            return 0;
        
        return rec;
    }
Example #7
 /** @param o the object to insert. can be modified to add _id and thus be an in/out param
  */
 DiskLoc DataFileMgr::insertWithObjMod(const char* ns, BSONObj& o, bool mayInterrupt, bool god) {
     bool addedID = false;
     DiskLoc loc = insert( ns, o.objdata(), o.objsize(), mayInterrupt, god, true, &addedID );
     if( addedID && !loc.isNull() )
         o = BSONObj::make( loc.rec() );
     return loc;
 }
Example #8
    long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback, bool fromMigrate ) {
        BSONObj keya , keyb;
        BSONObj minClean = toKeyFormat( min , keya );
        BSONObj maxClean = toKeyFormat( max , keyb );
        verify( keya == keyb );

        Client::Context ctx(ns);

        shared_ptr<Cursor> c;
        auto_ptr<ClientCursor> cc;
        {
            NamespaceDetails* nsd = nsdetails( ns.c_str() );
            if ( ! nsd )
                return 0;
            
            int ii = nsd->findIndexByKeyPattern( keya );
            verify( ii >= 0 );
            
            IndexDetails& i = nsd->idx( ii );
            
            c.reset( BtreeCursor::make( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
            cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
            cc->setDoingDeletes( true );
        }

        long long num = 0;

        while ( cc->ok() ) {

            if ( yield && ! cc->yieldSometimes( ClientCursor::WillNeed) ) {
                // cursor got finished by someone else, so we're done
                cc.release(); // if the collection/db is dropped, cc may be deleted
                break;
            }

            if ( ! cc->ok() )
                break;

            DiskLoc rloc = cc->currLoc();

            if ( callback )
                callback->goingToDelete( cc->current() );

            cc->advance();
            c->prepareToTouchEarlierIterate();

            logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() , 0 , 0 , fromMigrate );
            theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc);
            num++;

            c->recoverFromTouchingEarlierIterate();

            getDur().commitIfNeeded();


        }

        return num;
    }
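
Note the ordering in the loop body: the cursor advances past the record first, and only then is the record deleted, with prepareToTouchEarlierIterate()/recoverFromTouchingEarlierIterate() bracketing the mutation so the cursor survives it. The same advance-before-erase discipline on a plain STL container (a sketch of the idiom, not the ClientCursor protocol):

    #include <list>

    // Erase all elements matching 'pred' while iterating: move the iterator
    // past the victim *before* erasing, so the live iterator is never
    // invalidated by the erase.
    template <typename T, typename Pred>
    long long removeMatching(std::list<T>& items, Pred pred) {
        long long num = 0;
        for (auto it = items.begin(); it != items.end(); ) {
            auto victim = it++;        // advance first, like cc->advance()
            if (pred(*victim)) {
                items.erase(victim);   // safe: 'it' already points past it
                ++num;
            }
        }
        return num;
    }
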
Example #9
 DiskLoc FindingStartCursor::extentFirstLoc( const DiskLoc &rec ) {
     Extent *e = rec.rec()->myExtent( rec );
     if ( !_qp.nsd()->capLooped() || ( e->myLoc != _qp.nsd()->capExtent ) )
         return e->firstRecord;
     // Likely we are on the fresh side of capExtent, so return first fresh record.
     // If we are on the stale side of capExtent, then the collection is small and it
     // doesn't matter if we start the extent scan with capFirstNewRecord.
     return _qp.nsd()->capFirstNewRecord;
 }
Example #10
 /** add a record to the end of the linked list chain within this extent. 
     require: you must have already declared write intent for the record header.        
 */
 void addRecordToRecListInExtent(Record *r, DiskLoc loc) {
     dassert( loc.rec() == r );
     Extent *e = r->myExtent(loc);
     if ( e->lastRecord.isNull() ) {
         Extent::FL *fl = getDur().writing(e->fl());
         fl->firstRecord = fl->lastRecord = loc;
         r->prevOfs() = r->nextOfs() = DiskLoc::NullOfs;
     }
     else {
         Record *oldlast = e->lastRecord.rec();
         r->prevOfs() = e->lastRecord.getOfs();
         r->nextOfs() = DiskLoc::NullOfs;
         getDur().writingInt(oldlast->nextOfs()) = loc.getOfs();
         getDur().writingDiskLoc(e->lastRecord) = loc;
     }
 }
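
The record list here is a doubly-linked list whose links are 32-bit offsets within the extent rather than pointers, with NullOfs as the terminator. A self-contained model of the append (simplified stand-ins; the durability hooks via getDur() are deliberately omitted):

    #include <cstdint>
    #include <vector>

    const int32_t NullOfs = -1;  // list terminator, like DiskLoc::NullOfs

    struct Rec { int32_t prevOfs = NullOfs, nextOfs = NullOfs; };

    struct Ext {
        int32_t firstRecord = NullOfs;  // offsets index into 'records'
        int32_t lastRecord  = NullOfs;
    };

    // Append the record at offset 'ofs' to the extent's record list.
    void appendRecord(Ext& e, std::vector<Rec>& records, int32_t ofs) {
        Rec& r = records[ofs];
        if (e.lastRecord == NullOfs) {
            e.firstRecord = e.lastRecord = ofs;   // first record in the extent
            r.prevOfs = r.nextOfs = NullOfs;
        } else {
            r.prevOfs = e.lastRecord;             // link back to the old tail
            r.nextOfs = NullOfs;
            records[e.lastRecord].nextOfs = ofs;  // old tail points forward
            e.lastRecord = ofs;                   // extent tracks the new tail
        }
    }
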
Example #11
    long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback ) {
        BSONObj keya , keyb;
        BSONObj minClean = toKeyFormat( min , keya );
        BSONObj maxClean = toKeyFormat( max , keyb );
        assert( keya == keyb );

        Client::Context ctx(ns);
        NamespaceDetails* nsd = nsdetails( ns.c_str() );
        if ( ! nsd )
            return 0;

        int ii = nsd->findIndexByKeyPattern( keya );
        assert( ii >= 0 );

        long long num = 0;

        IndexDetails& i = nsd->idx( ii );

        shared_ptr<Cursor> c( new BtreeCursor( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
        auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
        cc->setDoingDeletes( true );

        while ( c->ok() ) {
            DiskLoc rloc = c->currLoc();
            BSONObj key = c->currKey();

            if ( callback )
                callback->goingToDelete( c->current() );

            c->advance();
            c->noteLocation();

            logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() );
            theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc);
            num++;

            c->checkLocation();

            if ( yield && ! cc->yieldSometimes() ) {
                // cursor got finished by someone else, so we're done
                cc.release(); // if the collection/db is dropped, cc may be deleted
                break;
            }
        }

        return num;
    }
Example #12
    bool CoveredIndexMatcher::matches(const BSONObj &key, const DiskLoc &recLoc , MatchDetails * details ) {
        if ( details )
            details->reset();
        
        if ( !_keyMatcher.matches(key, details ) ){
            return false;
        }
        
        if ( ! _needRecord ){
            return true;
        }

        if ( details )
            details->loadedObject = true;

        return _docMatcher->matches(recLoc.rec() , details );
    }
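
The split between _keyMatcher and _docMatcher is what makes covered queries cheap: the document is fetched from disk only when _needRecord says the index key alone cannot decide the match. A hedged sketch of that two-phase check, with stand-in predicate types:

    #include <functional>
    #include <string>

    struct Matcher {
        std::function<bool(const std::string&)> keyMatcher;  // cheap, key-only
        std::function<bool(const std::string&)> docMatcher;  // needs the document
        bool needRecord;  // false when the index covers every queried field
    };

    // 'loadDoc' is invoked only when the key alone cannot decide the match,
    // mirroring the recLoc.rec() fetch above.
    bool matches(const Matcher& m, const std::string& key,
                 const std::function<std::string()>& loadDoc) {
        if (!m.keyMatcher(key))
            return false;                // rejected without touching the record
        if (!m.needRecord)
            return true;                 // covered: the key was enough
        return m.docMatcher(loadDoc());  // fall back to the full document
    }
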
Example #13
    /* special version of insert for transaction logging -- streamlined a bit.
       assumes ns is capped and has no indexes
    */
    Record* DataFileMgr::fast_oplog_insert(NamespaceDetails *d, const char *ns, int len) {
        verify( d );
        RARELY verify( d == nsdetails(ns) );
        DEV verify( d == nsdetails(ns) );

        massert( 16509,
                 str::stream()
                 << "fast_oplog_insert requires a capped collection "
                 << " but " << ns << " is not capped",
                 d->isCapped() );

        // record timing on oplog inserts
        boost::optional<TimerHolder> insertTimer;
        // skip non-oplog collections
        if (NamespaceString::oplog(ns)) {
            insertTimer = boost::in_place(&oplogInsertStats);
            oplogInsertBytesStats.increment(len); // record length of inserted records for the oplog
        }

        int lenWHdr = len + Record::HeaderSize;
        DiskLoc loc = d->alloc(ns, lenWHdr);
        verify( !loc.isNull() );

        Record *r = loc.rec();
        verify( r->lengthWithHeaders() >= lenWHdr );

        Extent *e = r->myExtent(loc);
        if ( e->lastRecord.isNull() ) {
            Extent::FL *fl = getDur().writing( e->fl() );
            fl->firstRecord = fl->lastRecord = loc;

            Record::NP *np = getDur().writing(r->np());
            np->nextOfs = np->prevOfs = DiskLoc::NullOfs;
        }
        else {
            Record *oldlast = e->lastRecord.rec();
            Record::NP *np = getDur().writing(r->np());
            np->prevOfs = e->lastRecord.getOfs();
            np->nextOfs = DiskLoc::NullOfs;
            getDur().writingInt( oldlast->nextOfs() ) = loc.getOfs();
            e->lastRecord.writing() = loc;
        }

        d->incrementStats( r->netLength(), 1 );
        return r;
    }
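
The boost::optional<TimerHolder> construction means the scoped timer exists only for oplog namespaces, so ordinary collections pay nothing for the instrumentation. A similar conditional RAII timer with std::optional and <chrono> (an assumption for illustration; the original uses boost::in_place and the server's metrics types):

    #include <chrono>
    #include <cstdio>
    #include <optional>

    struct ScopedTimer {
        std::chrono::steady_clock::time_point start =
            std::chrono::steady_clock::now();
        ~ScopedTimer() {
            auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                          std::chrono::steady_clock::now() - start).count();
            std::printf("insert took %lld us\n", static_cast<long long>(us));
        }
    };

    void insertRecord(bool isOplog) {
        std::optional<ScopedTimer> timer;
        if (isOplog)
            timer.emplace();  // timing starts here, ends when the scope exits
        // ... perform the insert ...
    }
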
Example #14
 DiskLoc FindingStartCursor::prevLoc( const DiskLoc &rec ) {
     Extent *e = rec.rec()->myExtent( rec );
     if ( _qp.nsd()->capLooped() ) {
         if ( e->xprev.isNull() )
             e = _qp.nsd()->lastExtent.ext();
         else
             e = e->xprev.ext();
         if ( e->myLoc != _qp.nsd()->capExtent )
             return e->firstRecord;
     }
     else {
         if ( !e->xprev.isNull() ) {
             e = e->xprev.ext();
             return e->firstRecord;
         }
     }
     return DiskLoc(); // reached beginning of collection
 }
Example #15
    void DataFileMgr::deleteRecord(NamespaceDetails* d, const StringData& ns, Record *todelete,
                                   const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) {
        dassert( todelete == dl.rec() );

        if ( d->isCapped() && !cappedOK ) {
            out() << "failing remove on a capped ns " << ns << endl;
            uassert( 10089 ,  "can't remove from a capped collection" , 0 );
            return;
        }

        BSONObj obj = BSONObj::make( todelete );

        BSONObj toDelete;
        if ( doLog ) {
            BSONElement e = obj["_id"];
            if ( e.type() ) {
                toDelete = e.wrap();
            }
        }
        Collection* collection = cc().database()->getCollection( ns );
        verify( collection );

        /* check if any cursors point to us.  if so, advance them. */
        ClientCursor::aboutToDelete(ns, d, dl);

        collection->getIndexCatalog()->unindexRecord( obj, dl, noWarn );

        _deleteRecord(d, ns, todelete, dl);

        collection->infoCache()->notifyOfWriteOp();

        if ( ! toDelete.isEmpty() ) {
            // TODO: this is crazy, need to fix logOp
            const char* raw = ns.rawData();
            if ( strlen(raw) == ns.size() ) {
                logOp( "d", raw, toDelete );
            }
            else {
                string temp = ns.toString();
                logOp( "d", temp.c_str(), toDelete );
            }
        }
    }
Example #16
        IndexCatalog::IndexBuildBlock* halfAddIndex(const std::string& key) {
            string name = key + "_1";
            BSONObj indexInfo = BSON( "v" << 1 <<
                                      "key" << BSON( key << 1 ) <<
                                      "ns" << _ns <<
                                      "name" << name );
            int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
            const char* systemIndexes = "unittests.system.indexes";
            DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                          nsdetails( systemIndexes ),
                                                          lenWHdr,
                                                          false );
            Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                                 lenWHdr ) );
            memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
            addRecordToRecListInExtent( infoRecord, infoLoc );

            return new IndexCatalog::IndexBuildBlock( _ctx.ctx().db()->getCollection( _ns )->getIndexCatalog(), name, infoLoc );
        }
Example #17
 /** @return IndexDetails for a new index on a:1, with the info field populated. */
 IndexDetails& addIndexWithInfo() {
     BSONObj indexInfo = BSON( "v" << 1 <<
                               "key" << BSON( "a" << 1 ) <<
                               "ns" << _ns <<
                               "name" << "a_1" );
     int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
     const char* systemIndexes = "unittests.system.indexes";
     DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                   nsdetails( systemIndexes ),
                                                   lenWHdr,
                                                   false );
     Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                          lenWHdr ) );
     memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
     addRecordToRecListInExtent( infoRecord, infoLoc );
     IndexDetails& id = nsdetails( _ns )->getNextIndexDetails( _ns );
     nsdetails( _ns )->addIndex();
     id.info.writing() = infoLoc;
     return id;
 }
Example #18
        /** @return IndexDetails for a new index on a:1, with the info field populated. */
        IndexDescriptor* addIndexWithInfo() {
            BSONObj indexInfo = BSON( "v" << 1 <<
                                      "key" << BSON( "a" << 1 ) <<
                                      "ns" << _ns <<
                                      "name" << "a_1" );
            int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
            const char* systemIndexes = "unittests.system.indexes";
            DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                          nsdetails( systemIndexes ),
                                                          lenWHdr,
                                                          false );
            Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                                 lenWHdr ) );
            memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
            addRecordToRecListInExtent( infoRecord, infoLoc );

            IndexCatalog::IndexBuildBlock blk( collection()->getIndexCatalog(), "a_1", infoLoc );
            blk.success();

            return collection()->getIndexCatalog()->findIndexByName( "a_1" );
        }
Example #19
    /* eliminate everything from 'end' onward in the capped collection.
       @param inclusive if true, 'end' itself is also deleted (i.e. closed rather than open range)
    */
    void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
        DEV assert( this == nsdetails(ns) );
        assert( cappedLastDelRecLastExtent().isValid() );
        
        bool foundLast = false;
        while( 1 ) {
            if ( foundLast ) {
                break;
            }
            DiskLoc curr = theCapExtent()->lastRecord;
            assert( !curr.isNull() );
            if ( curr == end ) {
                if ( inclusive ) {
                    foundLast = true;
                } else {
                    break;
                }
            }
            
            uassert( 13415, "emptying the collection is not allowed", stats.nrecords > 1 );
            
            if ( !capLooped() ) {
                theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
                compact();
                if ( theCapExtent()->lastRecord.isNull() ) {
                    assert( !theCapExtent()->xprev.isNull() );
                    capExtent = theCapExtent()->xprev;
                    theCapExtent()->assertOk();
                    if ( capExtent == firstExtent ) {
                        cappedLastDelRecLastExtent() = DiskLoc();
                    } else {
                        // slow - there's no prev ptr for deleted rec
                        DiskLoc i = cappedListOfAllDeletedRecords();
                        for( ;
                            !i.drec()->nextDeleted.isNull() &&
                            !inCapExtent( i.drec()->nextDeleted );
                            i = i.drec()->nextDeleted );
                        assert( !i.drec()->nextDeleted.isNull() ); // I believe there is always at least one drec per extent
                        cappedLastDelRecLastExtent() = i;
                    }
                }
                continue;
            }

            theDataFileMgr.deleteRecord(ns, curr.rec(), curr, true);
            compact();
            if ( curr == capFirstNewRecord ) { // 'curr' is invalid now, but locations can still be compared
                capExtent = ( capExtent == firstExtent ) ? lastExtent : theCapExtent()->xprev;
                theCapExtent()->assertOk();
                assert( !theCapExtent()->firstRecord.isNull() );
                capFirstNewRecord = theCapExtent()->firstRecord;
                if ( capExtent == firstExtent ) {
                    cappedLastDelRecLastExtent() = DiskLoc();
                } else {
                    // slow - there's no prev ptr for deleted rec
                    DiskLoc i = cappedListOfAllDeletedRecords();
                    for( ;
                        !i.drec()->nextDeleted.isNull() &&
                        !inCapExtent( i.drec()->nextDeleted );
                        i = i.drec()->nextDeleted );
                    assert( !i.drec()->nextDeleted.isNull() ); // I believe there is always at least one drec per extent
                    cappedLastDelRecLastExtent() = i;
                }
            }
        }
    }
Example #20
    /* note: this is only (as-is) called when the update is:

             - not multi
             - not using indexed mods
             - not an upsert
    */
    static UpdateResult _updateById(bool isOperatorUpdate,
                                    int idIdxNo,
                                    ModSet* mods,
                                    NamespaceDetails* d,
                                    NamespaceDetailsTransient *nsdt,
                                    bool su,
                                    const char* ns,
                                    const BSONObj& updateobj,
                                    BSONObj patternOrig,
                                    bool logop,
                                    OpDebug& debug,
                                    bool fromMigrate = false) {

        DiskLoc loc;
        {
            IndexDetails& i = d->idx(idIdxNo);
            BSONObj key = i.getKeyFromQuery( patternOrig );
            loc = QueryRunner::fastFindSingle(i, key);
            if( loc.isNull() ) {
                // no upsert support in _updateById yet, so we are done.
                return UpdateResult( 0 , 0 , 0 , BSONObj() );
            }
        }
        Record* r = loc.rec();

        if ( cc().allowedToThrowPageFaultException() && ! r->likelyInPhysicalMemory() ) {
            throw PageFaultException( r );
        }

        /* look for $inc etc.  Note: as listed here, all fields to inc must be this type; you can't set
           regular ones at the moment. */
        BSONObj newObj;
        if ( isOperatorUpdate ) {
            const BSONObj& onDisk = loc.obj();
            auto_ptr<ModSetState> mss = mods->prepare( onDisk, false /* not an insertion */ );

            if( mss->canApplyInPlace() ) {
                mss->applyModsInPlace(true);
                debug.fastmod = true;
                DEBUGUPDATE( "\t\t\t updateById doing in place update" );

                newObj = onDisk;
            }
            else {
                newObj = mss->createNewFromMods();
                checkTooLarge(newObj);
                verify(nsdt);
                theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , newObj.objdata(), newObj.objsize(), debug);
            }

            if ( logop ) {
                DEV verify( mods->size() );
                BSONObj pattern = patternOrig;
                BSONObj logObj = mss->getOpLogRewrite();
                DEBUGUPDATE( "\t rewrite update: " << logObj );

                // It is possible that the entire mod set was a no-op over this document.  We
                // would have an empty log record in that case. If we call logOp, with an empty
                // record, that would be replicated as "clear this record", which is not what
                // we want. Therefore, to get a no-op in the replica, we simply don't log.
                if ( logObj.nFields() ) {
                    logOp("u", ns, logObj, &pattern, 0, fromMigrate, &newObj );
                }
            }
            return UpdateResult( 1 , 1 , 1 , BSONObj() );

        } // end $operator update

        // regular update
        BSONElementManipulator::lookForTimestamps( updateobj );
        checkNoMods( updateobj );
        verify(nsdt);
        theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug );
        if ( logop ) {
            logOp("u", ns, updateobj, &patternOrig, 0, fromMigrate, &updateobj );
        }
        return UpdateResult( 1 , 0 , 1 , BSONObj() );
    }
Example #21
    /** Note: if the object shrinks a lot, we don't free up space; we leave the extra at the end of the record.
     */
    const DiskLoc DataFileMgr::updateRecord(
        const char *ns,
        Collection* collection,
        Record *toupdate, const DiskLoc& dl,
        const char *_buf, int _len, OpDebug& debug,  bool god) {

        dassert( toupdate == dl.rec() );

        BSONObj objOld = BSONObj::make(toupdate);
        BSONObj objNew(_buf);
        DEV verify( objNew.objsize() == _len );
        DEV verify( objNew.objdata() == _buf );

        if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {
            /* add back the old _id value if the update removes it.  Note this implementation is slow
               (copies entire object multiple times), but this shouldn't happen often, so going for simple
               code, not speed.
            */
            BSONObjBuilder b;
            BSONElement e;
            verify( objOld.getObjectID(e) );
            b.append(e); // put _id first, for best performance
            b.appendElements(objNew);
            objNew = b.obj();
        }

        NamespaceString nsstring(ns);
        if (nsstring.coll() == "system.users") {
            V2UserDocumentParser parser;
            uassertStatusOK(parser.checkValidUserDocument(objNew));
        }

        uassert( 13596 , str::stream() << "cannot change _id of a document old:" << objOld << " new:" << objNew,
                objNew["_id"] == objOld["_id"]);

        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        OwnedPointerVector<UpdateTicket> updateTickets;
        updateTickets.mutableVector().resize(collection->details()->getTotalIndexCount());
        for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
            auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
            auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed = !(KeyPattern::isIdKeyPattern(descriptor->keyPattern())
                                    || descriptor->unique())
                                  || ignoreUniqueIndex(descriptor->getOnDisk());
            updateTickets.mutableVector()[i] = new UpdateTicket();
            Status ret = iam->validateUpdate(objOld, objNew, dl, options,
                                             updateTickets.mutableVector()[i]);

            if (Status::OK() != ret) {
                uasserted(ASSERT_ID_DUPKEY, "Update validation failed: " + ret.toString());
            }
        }

        if ( toupdate->netLength() < objNew.objsize() ) {
            // doesn't fit.  reallocate -----------------------------------------------------
            moveCounter.increment();
            uassert( 10003,
                     "failing update: objects in a capped ns cannot grow",
                     !(collection && collection->details()->isCapped()));
            collection->details()->paddingTooSmall();
            deleteRecord(ns, toupdate, dl);
            DiskLoc res = insert(ns, objNew.objdata(), objNew.objsize(), false, god);

            if (debug.nmoved == -1) // default of -1 rather than 0
                debug.nmoved = 1;
            else
                debug.nmoved += 1;

            return res;
        }

        collection->infoCache()->notifyOfWriteOp();
        collection->details()->paddingFits();

        debug.keyUpdates = 0;

        for (int i = 0; i < collection->details()->getTotalIndexCount(); ++i) {
            auto_ptr<IndexDescriptor> descriptor(CatalogHack::getDescriptor(collection->details(), i));
            auto_ptr<IndexAccessMethod> iam(CatalogHack::getIndex(descriptor.get()));
            int64_t updatedKeys;
            Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys);
            if (Status::OK() != ret) {
                // This shouldn't happen unless something disastrous occurred.
                massert(16799, "update failed: " + ret.toString(), false);
            }
            debug.keyUpdates += updatedKeys;
        }

        //  update in place
        int sz = objNew.objsize();
        memcpy(getDur().writingPtr(toupdate->data(), sz), objNew.objdata(), sz);
        return dl;
    }
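
The toupdate->netLength() < objNew.objsize() comparison is the core decision: if the new version fits in the record's existing allocation it is overwritten in place, otherwise the record is deleted and reinserted elsewhere, which is what debug.nmoved counts. A reduced model of that decision, with fixed-capacity slots standing in for records and their padding:

    #include <cstring>
    #include <vector>

    struct Slot {
        std::vector<char> buf;  // allocated capacity, including padding
        int used = 0;           // current payload length
    };

    // Returns true if the update happened in place; false means the caller
    // must delete + reinsert (the "doesn't fit, reallocate" path above).
    bool updateInPlace(Slot& slot, const char* data, int len) {
        if (len > static_cast<int>(slot.buf.size()))
            return false;                         // grew: needs a move
        std::memcpy(slot.buf.data(), data, len);  // fits: overwrite in place
        slot.used = len;
        return true;
    }
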
Example #22
    void Collection::_compactExtent(const DiskLoc diskloc, int extentNumber,
                                    MultiIndexBlock& indexesToInsertTo,
                                    const CompactOptions* compactOptions, CompactStats* stats ) {

        log() << "compact begin extent #" << extentNumber
              << " for namespace " << _ns << " " << diskloc;

        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = diskloc.ext();
        e->assertOk();
        verify( e->validates(diskloc) );

        {
            // the next/prev pointers within the extent might not be in order so we first
            // page the whole thing in sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            size_t length = e->length;

            touch_pages( reinterpret_cast<const char*>(e), length );
            int ms = t.millis();
            if( ms > 1000 )
                log() << "compact end paging in " << ms << "ms "
                      << e->length/1000000.0/t.seconds() << "MB/sec" << endl;
        }

        {
            log() << "compact copying records" << endl;
            long long datasize = 0;
            long long nrecords = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    L = getExtentManager()->getNextRecordInExtent(L);
                    BSONObj objOld = BSONObj::make(recOld);

                    if ( compactOptions->validateDocuments && !objOld.valid() ) {
                        // object is corrupt!
                        log() << "compact skipping corrupt document!";
                        stats->corruptDocuments++;
                    }
                    else {
                        unsigned docSize = objOld.objsize();

                        nrecords++;
                        oldObjSize += docSize;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = docSize + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;

                        switch( compactOptions->paddingMode ) {
                        case CompactOptions::NONE:
                            if ( details()->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) )
                                lenWPadding = details()->quantizePowerOf2AllocationSpace(lenWPadding);
                            break;
                        case CompactOptions::PRESERVE:
                            // if we are preserving the padding, the record should not change size
                            lenWPadding = recOld->lengthWithHeaders();
                            break;
                        case CompactOptions::MANUAL:
                            lenWPadding = compactOptions->computeRecordSize(lenWPadding);
                            if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                                lenWPadding = lenWHdr;
                            }
                            break;
                        }

                        CompactDocWriter writer( objOld, lenWPadding );
                        StatusWith<DiskLoc> status = _recordStore->insertRecord( &writer, 0 );
                        uassertStatusOK( status.getStatus() );
                        datasize += _recordStore->recordFor( status.getValue() )->netLength();

                        InsertDeleteOptions options;
                        options.logIfError = false;
                        options.dupsAllowed = true; // in compact we should be doing no checking

                        indexesToInsertTo.insert( objOld, status.getValue(), options );
                    }

                    if( L.isNull() ) {
                        // we just did the very last record from the old extent.  it's still pointed to
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt();
                    }
                }
            } // if !L.isNull()

            verify( details()->firstExtent() == diskloc );
            verify( details()->lastExtent() != diskloc );
            DiskLoc newFirst = e->xnext;
            details()->firstExtent().writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            getExtentManager()->freeExtents( diskloc, diskloc );

            getDur().commitIfNeeded();

            {
                double op = 1.0;
                if( oldObjSize )
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                      << " documents (" << datasize/1000000.0 << "MB)"
                      << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100;
            }
        }

    }
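
The padding modes above all reduce to picking lenWPadding from lenWHdr. A sketch of power-of-2 rounding combined with the lenWPadding sanity clamp used above (kMaxUserSize stubs BSONObjMaxUserSize; the real quantizePowerOf2AllocationSpace has more nuance for large records):

    const unsigned kMaxUserSize = 16 * 1024 * 1024;  // stand-in for BSONObjMaxUserSize

    // Round 'len' up to the next power of two.
    unsigned quantizePowerOf2(unsigned len) {
        unsigned quantized = 1;
        while (quantized < len)
            quantized <<= 1;
        return quantized;
    }

    // Apply padding with the same clamp as the compact code: if the padded
    // size underflows or balloons, fall back to the bare header length.
    unsigned paddedLen(unsigned lenWHdr) {
        unsigned lenWPadding = quantizePowerOf2(lenWHdr);
        if (lenWPadding < lenWHdr || lenWPadding > kMaxUserSize / 2)
            lenWPadding = lenWHdr;
        return lenWPadding;
    }
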
Example #23
    /** @return number of skipped (invalid) documents */
    unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n,
                const scoped_array<IndexSpec> &indexSpecs,
                scoped_array<SortPhaseOne>& phase1, int nidx, bool validate, 
                double pf, int pb)
    {
        log() << "compact extent #" << n << endl;
        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = ext.ext();
        e->assertOk();
        assert( e->validates() );
        unsigned skipped = 0;

        {
            // the next/prev pointers within the extent might not be in order so we first page the whole thing in 
            // sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            MAdvise adv(e, e->length, MAdvise::Sequential);
            const char *p = (const char *) e;
            // touch one byte per page; 'faux' is a file-scope accumulator defined
            // elsewhere, which keeps this read loop from being optimized away
            for( int i = 0; i < e->length; i += 4096 ) { 
                faux += p[i];
            }
            int ms = t.millis();
            if( ms > 1000 ) 
                log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl;
        }

        {
            log() << "compact copying records" << endl;
            unsigned totalSize = 0;
            int nrecs = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    L = recOld->nextInExtent(L);
                    nrecs++;
                    BSONObj objOld(recOld);

                    if( !validate || objOld.valid() ) {
                        unsigned sz = objOld.objsize();

                        oldObjSize += sz;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = sz + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;
                        {
                            lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                            lenWPadding += pb;
                            lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                            if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { 
                                lenWPadding = lenWHdr;
                            }
                        }
                        totalSize += lenWPadding;
                        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                        uassert(14024, "compact error out of space during compaction", !loc.isNull());
                        Record *recNew = loc.rec();
                        recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                        addRecordToRecListInExtent(recNew, loc);
                        memcpy(recNew->data, objOld.objdata(), sz);

                        {
                            // extract keys for all indexes we will be rebuilding
                            for( int x = 0; x < nidx; x++ ) { 
                                phase1[x].addKeys(indexSpecs[x], objOld, loc);
                            }
                        }
                    }
                    else { 
                        if( ++skipped <= 10 )
                            log() << "compact skipping invalid object" << endl;
                    }

                    if( L.isNull() ) { 
                        // we just did the very last record from the old extent.  it's still pointed to 
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt(false);
                    }
                }
            } // if !L.isNull()

            assert( d->firstExtent == ext );
            assert( d->lastExtent != ext );
            DiskLoc newFirst = e->xnext;
            d->firstExtent.writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            freeExtents(ext,ext);
            getDur().commitIfNeeded();

            { 
                double op = 1.0;
                if( oldObjSize ) 
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB"
                    << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
                    << endl;                    
            }
        }

        return skipped;
    }
Example #24
    void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
        DEV verify( this == nsdetails(ns) );
        verify( cappedLastDelRecLastExtent().isValid() );

        // We iteratively remove the newest document until the newest document
        // is 'end', then we remove 'end' if requested.
        bool foundLast = false;
        while( 1 ) {
            if ( foundLast ) {
                // 'end' has been found and removed, so break.
                break;
            }
            getDur().commitIfNeeded();
            // 'curr' will point to the newest document in the collection.
            DiskLoc curr = theCapExtent()->lastRecord;
            verify( !curr.isNull() );
            if ( curr == end ) {
                if ( inclusive ) {
                    // 'end' has been found, so break next iteration.
                    foundLast = true;
                }
                else {
                    // 'end' has been found, so break.
                    break;
                }
            }

            // TODO The algorithm used in this function cannot generate an
            // empty collection, but we could call emptyCappedCollection() in
            // this case instead of asserting.
            uassert( 13415, "emptying the collection is not allowed", _stats.nrecords > 1 );

            // Delete the newest record, and coalesce the new deleted
            // record with existing deleted records.
            theDataFileMgr.deleteRecord(this, ns, curr.rec(), curr, true);
            compact();

            // This is the case where we have not yet had to remove any
            // documents to make room for other documents, and we are allocating
            // documents from free space in fresh extents instead of reusing
            // space from familiar extents.
            if ( !capLooped() ) {

                // We just removed the last record from the 'capExtent', and
                // the 'capExtent' can't be empty, so we set 'capExtent' to
                // capExtent's prev extent.
                if ( theCapExtent()->lastRecord.isNull() ) {
                    verify( !theCapExtent()->xprev.isNull() );
                    // NOTE Because we didn't delete the last document, and
                    // capLooped() is false, capExtent is not the first extent
                    // so xprev will be nonnull.
                    _capExtent.writing() = theCapExtent()->xprev;
                    theCapExtent()->assertOk();

                    // update cappedLastDelRecLastExtent()
                    cappedTruncateLastDelUpdate();
                }
                continue;
            }

            // This is the case where capLooped() is true, and we just deleted
            // from capExtent, and we just deleted capFirstNewRecord, which was
            // the last record on the fresh side of capExtent.
            // NOTE In this comparison, curr and potentially capFirstNewRecord
            // may point to invalid data, but we can still compare the
            // references themselves.
            if ( curr == _capFirstNewRecord ) {

                // Set 'capExtent' to the first nonempty extent prior to the
                // initial capExtent.  There must be such an extent because we
                // have not deleted the last document in the collection.  It is
                // possible that all extents other than the capExtent are empty.
                // In this case we will keep the initial capExtent and specify
                // that all records contained within are on the fresh rather than
                // stale side of the extent.
                DiskLoc newCapExtent = _capExtent;
                do {
                    // Find the previous extent, looping if necessary.
                    newCapExtent = ( newCapExtent == _firstExtent ) ? _lastExtent : newCapExtent.ext()->xprev;
                    newCapExtent.ext()->assertOk();
                }
                while ( newCapExtent.ext()->firstRecord.isNull() );
                _capExtent.writing() = newCapExtent;

                // Place all documents in the new capExtent on the fresh side
                // of the capExtent by setting capFirstNewRecord to the first
                // document in the new capExtent.
                _capFirstNewRecord.writing() = theCapExtent()->firstRecord;

                // update cappedLastDelRecLastExtent()
                cappedTruncateLastDelUpdate();
            }
        }
    }
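
Stripped of the extent bookkeeping, the algorithm is: pop the newest record until 'end' is on top, then pop once more if inclusive, refusing ever to empty the collection. That skeleton on a simple deque (a sketch only; the real code must also maintain capExtent, capFirstNewRecord and the deleted-record lists):

    #include <deque>
    #include <stdexcept>

    using DocId = int;  // placeholder for DiskLoc

    // 'docs' is ordered oldest -> newest, like records in a capped collection.
    void truncateAfter(std::deque<DocId>& docs, DocId end, bool inclusive) {
        while (!docs.empty()) {
            DocId curr = docs.back();        // the newest document
            if (curr == end && !inclusive)
                break;                       // open range: keep 'end' itself
            if (docs.size() <= 1)
                throw std::runtime_error("emptying the collection is not allowed");
            docs.pop_back();                 // delete the newest document
            if (curr == end)
                break;                       // closed range: 'end' just removed
        }
    }
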
Example #25
    /* ns:      namespace, e.g. <database>.<collection>
       pattern: the "where" clause / criteria
       justOne: stop after 1 match
       god:     allow access to system namespaces, and don't yield
    */
    long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
        if( !god ) {
            if ( strstr(ns, ".system.") ) {
                /* note a delete from system.indexes would corrupt the db
                if done here, as there are pointers into those objects in
                NamespaceDetails.
                */
                uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
            }
            if ( strchr( ns , '$' ) ) {
                log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
                uassert( 10100 ,  "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
            }
        }

        {
            NamespaceDetails *d = nsdetails( ns );
            if ( ! d )
                return 0;
            uassert( 10101 ,  "can't remove from a capped collection" , ! d->capped );
        }

        long long nDeleted = 0;

        shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern, BSONObj(), false, 0 );

        if( !creal->ok() )
            return nDeleted;

        shared_ptr< Cursor > cPtr = creal;
        auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
        cc->setDoingDeletes( true );

        CursorId id = cc->cursorid();

        bool justOne = justOneOrig;
        bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic());

        do {
            // TODO: we can generalize this I believe
            bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) || pattern.isEmpty() || isSimpleIdQuery( pattern );
            if ( ! willNeedRecord ) {
                // TODO: this is a total hack right now
                // check if the index fully encompasses the query
                
                if ( pattern.nFields() == 1 && 
                     str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) )
                    willNeedRecord = true;
            }
            
            if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) {
                cc.release(); // has already been deleted elsewhere
                // TODO should we assert or something?
                break;
            }
            if ( !cc->ok() ) {
                break; // if we yielded, could have hit the end
            }

            // this way we can avoid calling updateLocation() every time (expensive)
            // as well as some other nuances handled
            cc->setDoingDeletes( true );

            DiskLoc rloc = cc->currLoc();
            BSONObj key = cc->currKey();

            bool match = creal->currentMatches();
            bool dup = cc->c()->getsetdup(rloc);

            if ( ! cc->advance() )
                justOne = true;

            if ( ! match )
                continue;

            assert( !dup ); // can't be a dup, we deleted it!

            if ( !justOne ) {
                /* NOTE: this is SLOW.  this is not good; noteLocation() was designed to be called across getMore
                    blocks.  here we might call it millions of times, which would be bad.
                    */
                cc->c()->prepareToTouchEarlierIterate();
            }

            if ( logop ) {
                BSONElement e;
                if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
                    BSONObjBuilder b;
                    b.append( e );
                    bool replJustOne = true;
                    logOp( "d", ns, b.done(), 0, &replJustOne );
                }
                else {
                    problem() << "deleted object without id, not logging" << endl;
                }
            }

            if ( rs )
                rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );

            theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
            nDeleted++;
            if ( justOne ) {
                break;
            }
            cc->c()->recoverFromTouchingEarlierIterate();
         
            if( !god ) 
                getDur().commitIfNeeded();

            if( debug && god && nDeleted == 100 ) 
                log() << "warning high number of deletes with god=true which could use significant memory" << endl;
        }
        while ( cc->ok() );

        if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
            // TODO: remove this and the id declaration above if this doesn't trigger
            //       if it does, then i'm very confused (ERH 06/2011)
            error() << "this should be impossible" << endl;
            printStackTrace();
            cc.release();
        }

        return nDeleted;
    }
Example #26
    DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) {
        
        if ( len > theCapExtent()->length ) {
            // the extent check is a way to try and improve performance
            uassert( 16328 , str::stream() << "document is larger than capped size " 
                     << len << " > " << storageSize() , len <= storageSize() );
        }
        
        // signal done allocating new extents.
        if ( !cappedLastDelRecLastExtent().isValid() )
            getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();

        verify( len < 400000000 );
        int passes = 0;
        int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog
        if ( maxPasses < 5000 ) {
            // this is for backwards safety since 5000 was the old value
            maxPasses = 5000;
        }
        DiskLoc loc;

        // delete records until we have room and the max-#-objects limit is achieved.

        /* this fails on a rename -- that is ok but must keep commented out */
        //verify( theCapExtent()->ns == ns );

        theCapExtent()->assertOk();
        DiskLoc firstEmptyExtent;
        while ( 1 ) {
            if ( _stats.nrecords < maxCappedDocs() ) {
                loc = __capAlloc( len );
                if ( !loc.isNull() )
                    break;
            }

            // If on first iteration through extents, don't delete anything.
            if ( !_capFirstNewRecord.isValid() ) {
                advanceCapExtent( ns );

                if ( _capExtent != _firstExtent )
                    _capFirstNewRecord.writing().setInvalid();
                // else signal done with first iteration through extents.
                continue;
            }

            if ( !_capFirstNewRecord.isNull() &&
                    theCapExtent()->firstRecord == _capFirstNewRecord ) {
                // We've deleted all records that were allocated on the previous
                // iteration through this extent.
                advanceCapExtent( ns );
                continue;
            }

            if ( theCapExtent()->firstRecord.isNull() ) {
                if ( firstEmptyExtent.isNull() )
                    firstEmptyExtent = _capExtent;
                advanceCapExtent( ns );
                if ( firstEmptyExtent == _capExtent ) {
                    maybeComplain( ns, len );
                    return DiskLoc();
                }
                continue;
            }

            DiskLoc fr = theCapExtent()->firstRecord;
            theDataFileMgr.deleteRecord(this, ns, fr.rec(), fr, true); // ZZZZZZZZZZZZ
            compact();
            if( ++passes > maxPasses ) {
                StringBuilder sb;
                sb << "passes >= maxPasses in NamespaceDetails::cappedAlloc: ns: " << ns
                   << ", len: " << len
                   << ", maxPasses: " << maxPasses
                   << ", _maxDocsInCapped: " << _maxDocsInCapped
                   << ", nrecords: " << _stats.nrecords
                   << ", datasize: " << _stats.datasize;
                msgasserted(10345, sb.str());
            }
        }

        // Remember first record allocated on this iteration through capExtent.
        if ( _capFirstNewRecord.isValid() && _capFirstNewRecord.isNull() )
            getDur().writingDiskLoc(_capFirstNewRecord) = loc;

        return loc;
    }
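
The maxPasses bound is plain arithmetic: roughly 30 bytes is about the smallest record that can occupy oplog space, so len / 30 + 2 deletions must free enough room, clamped to the historical minimum of 5000. As a worked check:

    #include <algorithm>

    // Upper bound on delete passes before cappedAlloc gives up.
    int maxPassesFor(int len) {
        return std::max(len / 30 + 2, 5000);
    }
    // e.g. a 1 MB document: 1048576 / 30 + 2 = 34954 passes allowed.
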
Example #27
    DiskLoc NamespaceDetails::cappedAlloc(const char *ns, int len) {
        // signal done allocating new extents.
        if ( !cappedLastDelRecLastExtent().isValid() )
            getDur().writingDiskLoc( cappedLastDelRecLastExtent() ) = DiskLoc();

        verify( len < 400000000 );
        int passes = 0;
        int maxPasses = ( len / 30 ) + 2; // 30 is about the smallest entry that could go in the oplog
        if ( maxPasses < 5000 ) {
            // this is for backwards safety since 5000 was the old value
            maxPasses = 5000;
        }
        DiskLoc loc;

        // delete records until we have room and the max-#-objects limit is achieved.

        /* this fails on a rename -- that is ok but must keep commented out */
        //verify( theCapExtent()->ns == ns );

        theCapExtent()->assertOk();
        DiskLoc firstEmptyExtent;
        while ( 1 ) {
            if ( stats.nrecords < max ) {
                loc = __capAlloc( len );
                if ( !loc.isNull() )
                    break;
            }

            // If on first iteration through extents, don't delete anything.
            if ( !capFirstNewRecord.isValid() ) {
                advanceCapExtent( ns );

                if ( capExtent != firstExtent )
                    capFirstNewRecord.writing().setInvalid();
                // else signal done with first iteration through extents.
                continue;
            }

            if ( !capFirstNewRecord.isNull() &&
                    theCapExtent()->firstRecord == capFirstNewRecord ) {
                // We've deleted all records that were allocated on the previous
                // iteration through this extent.
                advanceCapExtent( ns );
                continue;
            }

            if ( theCapExtent()->firstRecord.isNull() ) {
                if ( firstEmptyExtent.isNull() )
                    firstEmptyExtent = capExtent;
                advanceCapExtent( ns );
                if ( firstEmptyExtent == capExtent ) {
                    maybeComplain( ns, len );
                    return DiskLoc();
                }
                continue;
            }

            DiskLoc fr = theCapExtent()->firstRecord;
            theDataFileMgr.deleteRecord(ns, fr.rec(), fr, true); // free the oldest record and retry the allocation
            compact();
            if( ++passes > maxPasses ) {
                log() << "passes ns:" << ns << " len:" << len << " maxPasses: " << maxPasses << '\n';
                log() << "passes max:" << max << " nrecords:" << stats.nrecords << " datasize: " << stats.datasize << endl;
                massert( 10345 ,  "passes >= maxPasses in capped collection alloc", false );
            }
        }

        // Remember first record allocated on this iteration through capExtent.
        if ( capFirstNewRecord.isValid() && capFirstNewRecord.isNull() )
            getDur().writingDiskLoc(capFirstNewRecord) = loc;

        return loc;
    }
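
Note: Beispiel #27 is an older revision of the same allocator; both versions hinge on a tri-state sentinel. An 'invalid' DiskLoc marks the first iteration through the extents (during which nothing is deleted), a null loc marks a later iteration on which nothing has been allocated yet, and a concrete loc records that iteration's first allocation. A sketch of the state machine with an explicit enum (hypothetical names; that advancing the cap extent nulls the loc is inferred from the checks above, not quoted from advanceCapExtent):

    // Tri-state sentinel sketch for capFirstNewRecord.
    enum class CapMark {
        Invalid,   // first iteration through the extents
        Null,      // later iteration, nothing allocated on it yet
        Set        // first allocation of this iteration recorded
    };

    struct CapState {
        CapMark mark = CapMark::Invalid;

        bool mayDelete() const { return mark != CapMark::Invalid; }

        void advanceExtent(bool backAtFirstExtent) {
            mark = CapMark::Null;            // assume the advance nulls the loc
            if (!backAtFirstExtent)
                mark = CapMark::Invalid;     // still on the first pass
            // else: Null now signals the first pass is complete
        }

        void recordFirstAlloc() {            // isValid() && isNull() in the source
            if (mark == CapMark::Null)
                mark = CapMark::Set;
        }
    };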
Beispiel #28
0
    DiskLoc DataFileMgr::insert(const char* ns,
                                const void* obuf,
                                int32_t len,
                                bool mayInterrupt,
                                bool god,
                                bool mayAddIndex,
                                bool* addedID) {

        Database* database = cc().database();

        bool wouldAddIndex = false;
        massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
        uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
        {
            const char *sys = strstr(ns, "system.");
            if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
                return DiskLoc();
        }
        bool addIndex = wouldAddIndex && mayAddIndex;

        Collection* collection = database->getCollection( ns );
        if ( collection == NULL ) {
            collection = database->createCollection( ns, false, NULL );

            int ies = Extent::initialSize(len);
            if( str::contains(ns, '$') &&
                len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 &&
                len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
                // probably an index.  so we pick a value here for the first extent instead of using
                // initialExtentSize() which is more for user collections.
                // TODO: we could look at the # of records in the parent collection to be smarter here.
                ies = (32+4) * 1024;
            }
            collection->increaseStorageSize( ies, false);
            if ( !god )
                ensureIdIndexForNewNs(ns);
        }

        NamespaceDetails* d = collection->details();

        string tabletoidxns;
        Collection* collectionToIndex = 0;
        NamespaceDetails* tableToIndex = 0;

        BSONObj fixedIndexObject;
        if ( addIndex ) {
            verify( obuf );
            BSONObj io((const char *) obuf);

            tabletoidxns = io.getStringField( "ns" );
            uassert(10096, "invalid ns to index", tabletoidxns.find( '.' ) != string::npos);
            massert(10097,
                    str::stream() << "trying to create index on wrong db "
                    << " db: " << database->name() << " collection: " << tabletoidxns,
                    database->ownsNS( tabletoidxns ) );

            collectionToIndex = database->getCollection( tabletoidxns );
            if ( !collectionToIndex ) {
                collectionToIndex = database->createCollection( tabletoidxns, false, NULL );
                verify( collectionToIndex );
                if ( !god )
                    ensureIdIndexForNewNs( tabletoidxns.c_str() );
            }

            tableToIndex = collectionToIndex->details();

            Status status = collectionToIndex->getIndexCatalog()->okToAddIndex( io );
            if ( status.code() == ErrorCodes::IndexAlreadyExists ) {
                // dup index, we ignore
                return DiskLoc();
            }

            uassert( 17199,
                     str::stream() << "cannot build index on " << tabletoidxns
                     << " because of " << status.toString(),
                     status.isOK() );

            if( !prepareToBuildIndex(io,
                                     mayInterrupt,
                                     god,
                                     tabletoidxns ) ) {
                // prepareToBuildIndex either creates the _id index itself, or a false
                // return indicates the build should fail silently (e.g. the index
                // already exists)
                return DiskLoc();
            }

            fixedIndexObject = IndexCatalog::fixIndexSpec( io );

            obuf = fixedIndexObject.objdata();
            len = fixedIndexObject.objsize();
        }

        IDToInsert idToInsert; // only initialized if needed

        if( !god ) {
            /* Check if we have an _id field. If we don't, we'll add it.
               Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
            */
            BSONObj io((const char *) obuf);
            BSONElement idField = io.getField( "_id" );
            uassert( 10099 ,  "_id cannot be an array", idField.type() != Array );
            // we don't add _id for capped collections in local as they don't have an _id index
            if( idField.eoo() &&
                !wouldAddIndex &&
                nsToDatabase( ns ) != "local" &&
                d->haveIdIndex() ) {

                if( addedID )
                    *addedID = true;

                idToInsert.init();
                len += idToInsert.size();
            }

            BSONElementManipulator::lookForTimestamps( io );
        }

        int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize );
        fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) );

        // If the collection is capped, check if the new object will violate a unique index
        // constraint before allocating space.
        if ( d->isCapped() && !god) {
            BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) );
            Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp );
            uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() );
        }

        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);

        if ( loc.isNull() ) {
            log() << "insert: couldn't alloc space for object ns:" << ns
                  << " capped:" << d->isCapped() << endl;
            verify(d->isCapped());
            return DiskLoc();
        }

        Record *r = loc.rec();
        {
            verify( r->lengthWithHeaders() >= lenWHdr );
            r = (Record*) getDur().writingPtr(r, lenWHdr);
            if( idToInsert.needed() ) {
                /* a little effort was made here to avoid a double copy when we add an ID */
                int originalSize = *((int*) obuf);
                ((int&)*r->data()) = originalSize + idToInsert.size();
                memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size());
                memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4);
            }
            else {
                if( obuf ) // obuf can be null from internal callers
                    memcpy(r->data(), obuf, len);
            }
        }

        addRecordToRecListInExtent(r, loc);

        d->incrementStats( r->netLength(), 1 );

        // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
        if ( !god )
            collection->infoCache()->notifyOfWriteOp();

        if ( tableToIndex ) {
            insert_makeIndex(collectionToIndex, loc, mayInterrupt);
        }

        /* add this record to our indexes */
        if ( d->getTotalIndexCount() > 0 ) {
            try {
                BSONObj obj(r->data());
                collection->getIndexCatalog()->indexRecord(obj, loc);
            }
            catch( AssertionException& e ) {
                // should be a dup key error on _id index
                if( tableToIndex || d->isCapped() ) {
                    massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() );
                    string s = e.toString();
                    s += " : on addIndex/capped - collection and its index will not match";
                    setLastError(0, s.c_str());
                    error() << s << endl;
                }
                else {
                    // normal case -- we can roll back
                    _deleteRecord(d, ns, r, loc);
                    throw;
                }
            }
        }

        d->paddingFits();

        return loc;
    }
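
Note: the idToInsert block near the end of insert() splices a generated _id element into the record with a single copy: it patches the 4-byte BSON length prefix, writes the new element immediately after it, then appends the rest of the original object. A standalone sketch of that splice on a raw length-prefixed buffer (illustrative only, not the BSONObj API):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Sketch: insert 'field' (fieldLen bytes) right after the 4-byte length
    // prefix of a BSON-style buffer, patching the prefix, without first
    // rebuilding the object in an intermediate buffer.
    std::vector<char> spliceAfterLengthPrefix(const char* obj,
                                              const char* field, int fieldLen) {
        int32_t originalSize;
        std::memcpy(&originalSize, obj, 4);              // original BSON length
        std::vector<char> out(originalSize + fieldLen);
        const int32_t newSize = originalSize + fieldLen;
        std::memcpy(out.data(), &newSize, 4);            // patched length
        std::memcpy(out.data() + 4, field, fieldLen);    // new element first
        std::memcpy(out.data() + 4 + fieldLen, obj + 4, originalSize - 4);
        return out;
    }

In insert() itself the destination is the freshly allocated Record, so the splice doubles as the copy into place -- that is the double copy the in-code comment says it avoids.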
Beispiel #29
0
    /** @return number of skipped (invalid) documents */
    unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
                const scoped_array<IndexSpec> &indexSpecs,
                scoped_array<SortPhaseOne>& phase1, int nidx, bool validate, 
                double pf, int pb)
    {
        log() << "compact begin extent #" << n << " for namespace " << ns << endl;
        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = diskloc.ext();
        e->assertOk();
        verify( e->validates() );
        unsigned skipped = 0;

        {
            // the next/prev pointers within the extent might not be in order, so we
            // first page the whole extent in sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            MongoDataFile* mdf = cc().database()->getFile( diskloc.a() );
            HANDLE fd = mdf->getFd();
            int offset = diskloc.getOfs();
            Extent* ext = diskloc.ext();
            size_t length = ext->length;
                
            touch_pages(fd, offset, length, ext);
            int ms = t.millis();
            if( ms > 1000 ) 
                log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl;
        }

        {
            log() << "compact copying records" << endl;
            long long datasize = 0;
            long long nrecords = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    L = recOld->nextInExtent(L);
                    BSONObj objOld = BSONObj::make(recOld);

                    if( !validate || objOld.valid() ) {
                        nrecords++;
                        unsigned sz = objOld.objsize();

                        oldObjSize += sz;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = sz + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;
                        {
                            lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                            lenWPadding += pb;
                            lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                            if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { 
                                lenWPadding = lenWHdr;
                            }
                        }
                        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                        uassert(14024, "compact error out of space during compaction", !loc.isNull());
                        Record *recNew = loc.rec();
                        datasize += recNew->netLength();
                        recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                        addRecordToRecListInExtent(recNew, loc);
                        memcpy(recNew->data(), objOld.objdata(), sz);

                        {
                            // extract keys for all indexes we will be rebuilding
                            for( int x = 0; x < nidx; x++ ) { 
                                phase1[x].addKeys(indexSpecs[x], objOld, loc);
                            }
                        }
                    }
                    else { 
                        if( ++skipped <= 10 )
                            log() << "compact skipping invalid object" << endl;
                    }

                    if( L.isNull() ) { 
                        // we just did the very last record from the old extent.  it's still pointed to 
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt(false);
                    }
                }
            } // if !L.isNull()

            verify( d->firstExtent == diskloc );
            verify( d->lastExtent != diskloc );
            DiskLoc newFirst = e->xnext;
            d->firstExtent.writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            freeExtents( diskloc, diskloc );
            // update datasize/record count for this namespace's extent
            {
                NamespaceDetails::Stats *s = getDur().writing(&d->stats);
                s->datasize += datasize;
                s->nrecords += nrecords;
            }

            getDur().commitIfNeeded();

            { 
                double op = 1.0;
                if( oldObjSize ) 
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                log() << "compact finished extent #" << n << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)"
                    << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
                    << endl;                    
            }
        }

        return skipped;
    }
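
Note: the padding rule buried in the copy loop is easy to miss: the record length is scaled by the padding factor pf, offset by pb padding bytes, quantized, and then clamped back to the exact length if the result underflows or exceeds BSONObjMaxUserSize / 2. A minimal sketch (the 256-byte quantization granularity is an assumption; the real quantizeMask chooses its bucket from the length):

    #include <cstdint>

    // Sketch of the compactExtent padding rule; pb is assumed non-negative.
    uint32_t paddedRecordLen(uint32_t lenWHdr, double pf, uint32_t pb,
                             uint32_t maxUserSize) {
        uint32_t lenWPadding = static_cast<uint32_t>(pf * lenWHdr) + pb;
        lenWPadding &= ~uint32_t(255);            // quantize down (assumed step)
        if (lenWPadding < lenWHdr || lenWPadding > maxUserSize / 2)
            lenWPadding = lenWHdr;                // fall back to an exact fit
        return lenWPadding;
    }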
Beispiel #30
0
    DiskLoc _repairExtent( Database* db , string ns, bool forward , DiskLoc eLoc , Writer& w ){
        LogIndentLevel lil;
        
        if ( eLoc.getOfs() <= 0 ){
            error() << "invalid extent ofs: " << eLoc.getOfs() << endl;
            return DiskLoc();
        }
        

        MongoDataFile * mdf = db->getFile( eLoc.a() );

        Extent * e = mdf->debug_getExtent( eLoc );
        if ( ! e->isOk() ){
            warning() << "Extent not ok magic: " << e->magic << " going to try to continue" << endl;
        }
        
        log() << "length:" << e->length << endl;
        
        LogIndentLevel lil2;
        
        set<DiskLoc> seen;

        DiskLoc loc = forward ? e->firstRecord : e->lastRecord;
        while ( ! loc.isNull() ){
            
            if ( ! seen.insert( loc ).second ) {
                error() << "infinite loop in extend, seen: " << loc << " before" << endl;
                break;
            }

            if ( loc.getOfs() <= 0 ){
                error() << "offset is 0 for record which should be impossible" << endl;
                break;
            }
            LOG(1) << loc << endl;
            Record* rec = loc.rec();
            BSONObj obj;
            try {
                obj = loc.obj();
                assert( obj.valid() );
                LOG(1) << obj << endl;
                w( obj );
            }
            catch ( std::exception& e ) {
                log() << "found invalid document @ " << loc << " " << e.what() << endl;
                if ( ! obj.isEmpty() ) {
                    try {
                        BSONElement firstElt = obj.firstElement();
                        stringstream ss;
                        ss << "first element: " << firstElt;
                        log() << ss.str();
                    }
                    catch ( std::exception& ) {
                    }
                }
            }
            loc = forward ? rec->getNext( loc ) : rec->getPrev( loc );
        }
        return forward ? e->xnext : e->xprev;
        
    }
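
Note: this version of _repairExtent hardens the record walk with three defensive details: a seen-set that breaks pointer cycles in corrupt extents, per-record exception handling so one bad document does not abort the salvage, and a Writer callback for every recovered object. The cycle guard generalizes to any possibly corrupt linked structure; a compact sketch:

    #include <cstdio>
    #include <unordered_set>

    struct Node { int value; Node* next; };

    // Sketch of the seen-set guard: walk a possibly corrupt singly linked
    // list, bailing out instead of looping forever when a cycle appears.
    template <typename Visit>
    bool walkGuarded(Node* head, Visit visit) {
        std::unordered_set<Node*> seen;
        for (Node* n = head; n != nullptr; n = n->next) {
            if (!seen.insert(n).second) {
                std::fprintf(stderr, "cycle detected at %p\n", (void*)n);
                return false;                     // corrupt: pointer loop
            }
            visit(n);
        }
        return true;                              // reached a clean end
    }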