Beispiel #1
0
    /**
     * Body of the fsync-and-lock thread.
     *
     * Flushes the journal and the memory-mapped files, marks fsyncCmd as locked, wakes the
     * waiter on fsyncCmd._threadSync, and then blocks until fsyncCmd.pendingUnlock is set.
     * If either flush step throws a std::exception, the message is stored in fsyncCmd.err,
     * the waiter is notified, and the function returns with fsyncCmd.locked == false.
     */
    void FSyncLockThread::doRealWork() {
        // Acquisition order: filesLockedFsync, then the global write lock, then fsyncCmd.m.
        SimpleMutex::scoped_lock lkf(filesLockedFsync);
        Lock::GlobalWrite global(true/*stopGreed*/);
        SimpleMutex::scoped_lock lk(fsyncCmd.m);
        
        verify( ! fsyncCmd.locked ); // impossible to get here if locked is true
        try { 
            getDur().syncDataAndTruncateJournal();
        } 
        catch( std::exception& e ) { 
            // Report the failure through fsyncCmd.err and wake the waiting thread.
            error() << "error doing syncDataAndTruncateJournal: " << e.what() << endl;
            fsyncCmd.err = e.what();
            fsyncCmd._threadSync.notify_one();
            fsyncCmd.locked = false;
            return;
        }
        
        // NOTE(review): presumably downgrades the global write lock to a weaker mode that
        // still blocks writers -- confirm against Lock::GlobalWrite::downgrade().
        global.downgrade();
        
        try {
            MemoryMappedFile::flushAll(true);
        }
        catch( std::exception& e ) { 
            // Same failure protocol as above.
            error() << "error doing flushAll: " << e.what() << endl;
            fsyncCmd.err = e.what();
            fsyncCmd._threadSync.notify_one();
            fsyncCmd.locked = false;
            return;
        }

        verify( ! fsyncCmd.locked );
        fsyncCmd.locked = true;
        
        // Tell the waiter on _threadSync that the lock state is now established.
        fsyncCmd._threadSync.notify_one();

        // Stay here (with the locks above still held by this scope) until an unlock is
        // requested; the wait is on fsyncCmd.m, and the loop re-checks pendingUnlock after
        // every wakeup.
        while ( ! fsyncCmd.pendingUnlock ) {
            fsyncCmd._unlockSync.wait(fsyncCmd.m);
        }
        fsyncCmd.pendingUnlock = false;
        
        fsyncCmd.locked = false;
        fsyncCmd.err = "unlocked";

        // Acknowledge the unlock to whoever is waiting on _unlockSync.
        fsyncCmd._unlockSync.notify_one();
    }
Beispiel #2
0
    void IndexRebuilder::checkDB(const std::string& dbName, bool* firstTime) {
        const std::string systemNS = dbName + ".system.namespaces";
        DBDirectClient cli;
        scoped_ptr<DBClientCursor> cursor(cli.query(systemNS, Query()));

        // This depends on system.namespaces not changing while we iterate
        while (cursor->more()) {
            BSONObj nsDoc = cursor->next();
            const char* ns = nsDoc["name"].valuestrsafe();

            Client::WriteContext ctx(ns);
            NamespaceDetails* nsd = nsdetails(ns);

            if (!nsd || !nsd->indexBuildsInProgress) {
                continue;
            }

            log() << "Found interrupted index build on " << ns << endl;
            if (*firstTime) {
                log() << "Restart the server with --noIndexBuildRetry to skip index rebuilds"
                      << endl;
                *firstTime = false;
            }

            // If the indexBuildRetry flag isn't set, just clear the inProg flag
            if (!cmdLine.indexBuildRetry) {
                // If we crash between unsetting the inProg flag and cleaning up the index, the
                // index space will be lost.
                int inProg = nsd->indexBuildsInProgress;
                getDur().writingInt(nsd->indexBuildsInProgress) = 0;

                for (int i = 0; i < inProg; i++) {
                    nsd->idx(nsd->nIndexes+i).kill_idx();
                }

                continue;
            }

            // We go from right to left building these indexes, so that indexBuildInProgress-- has
            // the correct effect of "popping" an index off the list.
            while (nsd->indexBuildsInProgress > 0) {
                retryIndexBuild(dbName, nsd, nsd->nIndexes+nsd->indexBuildsInProgress-1);
            }
        }
    }
Beispiel #3
0
    /**
     * Inserts each document of objs into ns in order, giving the durability layer a chance
     * to commit after every document.
     *
     * A UserException is rethrown when keepGoing is false or when it occurs on the final
     * document; otherwise it is swallowed and processing continues with the next document.
     *
     * NOTE(review): the value passed to the op counters and stored in ninserted is the loop
     * position, so documents skipped under keepGoing are included in it.
     */
    NOINLINE_DECL void insertMulti(bool keepGoing, const char *ns, vector<BSONObj>& objs, CurOp& op) {
        size_t processed = 0;
        while (processed < objs.size()) {
            try {
                checkAndInsert(ns, objs[processed]);
                getDur().commitIfNeeded();
            }
            catch (const UserException&) {
                const bool wasLastDoc = (processed + 1 == objs.size());
                if (wasLastDoc || !keepGoing) {
                    globalOpCounters.incInsertInWriteLock(processed);
                    throw;
                }
                // keepGoing: drop the error and move on to the next document
            }
            ++processed;
        }

        globalOpCounters.incInsertInWriteLock(processed);
        op.debug().ninserted = processed;
    }
Beispiel #4
0
    /**
     * Perform a single insert into a collection.  Requires the insert be preprocessed and the
     * collection already has been created.
     *
     * Might fault or error, otherwise populates the result.
     */
    static void singleInsert( const BSONObj& docToInsert,
                              Collection* collection,
                              WriteOpResult* result ) {

        const string& insertNS = collection->ns().ns();

        Lock::assertWriteLocked( insertNS );

        StatusWith<DiskLoc> status = collection->insertDocument( docToInsert, true );

        if ( !status.isOK() ) {
            result->setError(toWriteError(status.getStatus()));
        }
        else {
            logOp( "i", insertNS.c_str(), docToInsert );
            getDur().commitIfNeeded();
            result->getStats().n = 1;
        }
    }
    /**
     * Commits the current unit of work.
     *
     * With ROLLBACK_ENABLED, a nested unit of work only flags the outer one as MUST_COMMIT
     * (when it has uncommitted changes) and returns; the outermost one publishes its changes.
     * In every path that does not return early, the durability layer gets a commit
     * opportunity at the end.
     */
    void DurRecoveryUnit::commitUnitOfWork() {
#if ROLLBACK_ENABLED
        invariant(_state != MUST_ROLLBACK);
        invariant(_nestingLevel > 0);

        const bool isOutermost = (_nestingLevel == 1);
        if (!isOutermost) {
            // Nested: defer to the enclosing UnitOfWork, which is now obliged to commit if
            // this level produced any changes.
            if (haveUncommitedChanges()) {
                _state = MUST_COMMIT;
            }
            return;
        }

        publishChanges();
#endif

        // opportunity for a global journal flush
        getDur().commitIfNeeded(_txn);
    }
Beispiel #6
0
    /**
     * Commits the current unit of work.
     *
     * A nested unit of work folds its changes into the enclosing level by moving that
     * level's "start of uncommitted changes" marker to the end of _changes. Only the
     * outermost unit of work publishes changes and offers the durability layer a commit
     * opportunity.
     */
    void DurRecoveryUnit::commitUnitOfWork() {
        invariant(inAUnitOfWork());
        invariant(!_mustRollback);

        if (inOutermostUnitOfWork()) {
            publishChanges();

            // global journal flush opportunity
            getDur().commitIfNeeded(_txn);
            return;
        }

        // Nested level: everything recorded at this level now belongs to the containing
        // UnitOfWork, which must commit for the changes to reach the global damages list.
        if (haveUncommitedChangesAtCurrentLevel()) {
            _startOfUncommittedChangesForLevel.back() = _changes.size();
        }
    }
Beispiel #7
0
        /**
         * Writes an index spec for { <key>: 1 } named "<key>_1" into unittests.system.indexes
         * and returns a heap-allocated IndexBuildBlock for it. The returned pointer comes from
         * a plain `new`, so the caller is responsible for deleting it.
         */
        IndexCatalog::IndexBuildBlock* halfAddIndex(const std::string& key) {
            const string indexName = key + "_1";
            const BSONObj spec = BSON( "v" << 1 <<
                                       "key" << BSON( key << 1 ) <<
                                       "ns" << _ns <<
                                       "name" << indexName );

            // Allocate a record big enough for the spec plus the record header.
            const char* systemIndexes = "unittests.system.indexes";
            int32_t recordLen = spec.objsize() + Record::HeaderSize;
            DiskLoc specLoc = allocateSpaceForANewRecord( systemIndexes,
                                                          nsdetails( systemIndexes ),
                                                          recordLen,
                                                          false );

            // Declare write intent over the record, copy the spec in, and link the record
            // into its extent's record list.
            void* writable = getDur().writingPtr( specLoc.rec(), recordLen );
            Record* specRecord = reinterpret_cast<Record*>( writable );
            memcpy( specRecord->data(), spec.objdata(), spec.objsize() );
            addRecordToRecListInExtent( specRecord, specLoc );

            return new IndexCatalog::IndexBuildBlock(
                    _ctx.ctx().db()->getCollection( _ns )->getIndexCatalog(),
                    indexName,
                    specLoc );
        }
Beispiel #8
0
    /** Builds, at dst, a copy of an existing (partial) BSON object with one extra object
        element named "o" appended at the end.

        @param dst     destination buffer for the combined object, e.g. it ends up as
                       { ts:..., ns:..., os2:..., o:... }
        @param partial already built object holding everything except the o member, e.g.
                       { ts:..., ns:..., os2:... }
        @param o       the bson object to append under fieldname "o"
    */
    void append_O_Obj(char *dst, const BSONObj& partial, const BSONObj& o) {
        const int oFieldHeaderLen = 3;                      // type byte + 'o' + '\0'
        const int partialLen = partial.objsize() - 1;       // drop partial's trailing EOO
        const int oLen = o.objsize();
        const int combinedLen = partialLen + oFieldHeaderLen + oLen + 1/*new EOO*/;

        void *region = getDur().writingPtr(dst, combinedLen);

        // Start from partial's bytes; its leading size field is then grown to account for
        // the appended element.
        memcpy(region, partial.objdata(), partialLen);
        *(static_cast<unsigned*>(region)) += oLen + 1/*fieldtype byte*/ + 2/*"o" fieldname*/;

        char *tail = static_cast<char *>(region) + partialLen;
        tail[0] = (char) Object;
        tail[1] = 'o';  // { o : ... }
        tail[2] = 0;    // terminates the "o" fieldname
        memcpy(tail + oFieldHeaderLen, o.objdata(), oLen);
        tail[oFieldHeaderLen + oLen] = EOO;
    }
Beispiel #9
0
    /* Initializes this extent's header fields and sets up one deleted record describing its
       free space; returns the location of that deleted record.
       Assumes the extent memory is already zeroed -- perhaps insufficient for block 'reuse'. */
    DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset, bool capped) {
        // identity
        magic = extentSignature;
        myLoc.set(_fileNo, _offset);

        // no neighbours yet
        xnext.Null();
        xprev.Null();

        nsDiagnostic = nsname;
        length = _length;

        // no records yet
        firstRecord.Null();
        lastRecord.Null();

        DiskLoc freeLoc;
        int freeLen;
        extent_getEmptyLoc(nsname, myLoc, _length, capped, freeLoc, freeLen);

        DeletedRecord* freeRec = getDur().writing(DataFileMgr::getDeletedRecord(freeLoc));
        freeRec->lengthWithHeaders() = freeLen;
        freeRec->extentOfs() = myLoc.getOfs();
        freeRec->nextDeleted().Null();

        return freeLoc;
    }
Beispiel #10
0
 /**
  * Stores an index spec for { a: 1 } (named "a_1") in unittests.system.indexes and registers
  * a new index on _ns pointing at it.
  * @return the IndexDetails of the new index, with its info field set to the stored spec.
  */
 IndexDetails& addIndexWithInfo() {
     const BSONObj spec = BSON( "v" << 1 <<
                                "key" << BSON( "a" << 1 ) <<
                                "ns" << _ns <<
                                "name" << "a_1" );

     // Room for the spec plus the record header.
     const char* systemIndexes = "unittests.system.indexes";
     int32_t recordLen = spec.objsize() + Record::HeaderSize;
     DiskLoc specLoc = allocateSpaceForANewRecord( systemIndexes,
                                                   nsdetails( systemIndexes ),
                                                   recordLen,
                                                   false );

     // Declare write intent, copy the spec into the record, and link it into its extent.
     void* writable = getDur().writingPtr( specLoc.rec(), recordLen );
     Record* specRecord = reinterpret_cast<Record*>( writable );
     memcpy( specRecord->data(), spec.objdata(), spec.objsize() );
     addRecordToRecListInExtent( specRecord, specLoc );

     // Take the next index slot, register the index, then point the slot at the spec.
     IndexDetails& details = nsdetails( _ns )->getNextIndexDetails( _ns );
     nsdetails( _ns )->addIndex();
     details.info.writing() = specLoc;
     return details;
 }
Beispiel #11
0
    bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
        log() << "DatabaseHolder::closeAll path:" << path << endl;
        verify( Lock::isW() );
        getDur().commitNow(); // bad things happen if we close a DB with outstanding writes

        map<string,Database*>& m = _paths[path];
        _size -= m.size();

        set< string > dbs;
        for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
            wassert( i->second->path() == path );
            dbs.insert( i->first );
        }

        currentClient.get()->getContext()->_clear();

        BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
        int n = 0;
        int nNotClosed = 0;
        for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) {
            string name = *i;
            LOG(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl;
            Client::Context ctx( name , path );
            if( !force && BackgroundOperation::inProgForDb(name) ) {
                log() << "WARNING: can't close database " << name << " because a bg job is in progress - try killOp command" << endl;
                nNotClosed++;
            }
            else {
                Database::closeDatabase( name.c_str() , path );
                bb.append( bb.numStr( n++ ) , name );
            }
        }
        bb.done();
        if( nNotClosed )
            result.append("nNotClosed", nNotClosed);
        else {
            ClientCursor::assertNoCursors();
        }

        return true;
    }
Beispiel #12
0
        /**
         * Stores an index spec for { a: 1 } (named "a_1") in unittests.system.indexes and
         * completes an IndexBuildBlock for it.
         * @return the result of looking up "a_1" in the collection's index catalog.
         */
        IndexDescriptor* addIndexWithInfo() {
            const BSONObj spec = BSON( "v" << 1 <<
                                       "key" << BSON( "a" << 1 ) <<
                                       "ns" << _ns <<
                                       "name" << "a_1" );

            // Room for the spec plus the record header.
            const char* systemIndexes = "unittests.system.indexes";
            int32_t recordLen = spec.objsize() + Record::HeaderSize;
            DiskLoc specLoc = allocateSpaceForANewRecord( systemIndexes,
                                                          nsdetails( systemIndexes ),
                                                          recordLen,
                                                          false );

            // Declare write intent, copy the spec in, and link the record into its extent.
            void* writable = getDur().writingPtr( specLoc.rec(), recordLen );
            Record* specRecord = reinterpret_cast<Record*>( writable );
            memcpy( specRecord->data(), spec.objdata(), spec.objsize() );
            addRecordToRecListInExtent( specRecord, specLoc );

            IndexCatalog::IndexBuildBlock buildBlock( collection()->getIndexCatalog(),
                                                      "a_1",
                                                      specLoc );
            buildBlock.success();

            return collection()->getIndexCatalog()->findIndexByName( "a_1" );
        }
Beispiel #13
0
    void DurRecoveryUnit::commitChanges() {
        if (!inAUnitOfWork())
            return;

        invariant(!_mustRollback);
        invariant(inOutermostUnitOfWork());
        invariant(_startOfUncommittedChangesForLevel.front().changeIndex == 0);
        invariant(_startOfUncommittedChangesForLevel.front().writeIndex == 0);

        if (getDur().isDurable())
            pushChangesToDurSubSystem();

        for (Changes::const_iterator it = _changes.begin(), end = _changes.end(); it != end; ++it) {
            (*it)->commit();
        }

        // We now reset to a "clean" state without any uncommited changes.
        _changes.clear();
        _writes.clear();
        _preimageBuffer.clear();
    }
Beispiel #14
0
    /**
     * Closes every database held in _dbs.
     *
     * Requires txn to hold the global write lock. Names of closed databases are appended to
     * result as the array "dbs". Databases with a background operation in progress are
     * skipped unless force is set, and their count is reported as "nNotClosed".
     *
     * @return always true
     */
    bool DatabaseHolder::closeAll(OperationContext* txn,
                                  BSONObjBuilder& result,
                                  bool force) {
        invariant(txn->lockState()->isW());

        getDur().commitNow(txn); // bad things happen if we close a DB with outstanding writes

        // Snapshot the names first; the loop below works from this copy rather than from
        // the live map.
        set< string > names;
        for ( map<string,Database*>::iterator it = _dbs.begin(); it != _dbs.end(); ++it ) {
            names.insert( it->first );
        }

        BSONObjBuilder closedArr( result.subarrayStart( "dbs" ) );
        int nClosed = 0;
        int nNotClosed = 0;
        for( set< string >::iterator it = names.begin(); it != names.end(); ++it ) {
            const string& name = *it;

            LOG(2) << "DatabaseHolder::closeAll name:" << name;
            Client::Context ctx(txn, name);

            const bool blockedByBgJob = !force && BackgroundOperation::inProgForDb(name);
            if( !blockedByBgJob ) {
                Database::closeDatabase(txn, name.c_str());
                closedArr.append( closedArr.numStr( nClosed++ ) , name );
            }
            else {
                log() << "WARNING: can't close database "
                      << name
                      << " because a bg job is in progress - try killOp command" 
                      << endl;
                nNotClosed++;
            }
        }
        closedArr.done();

        if( nNotClosed != 0 ) {
            result.append("nNotClosed", nNotClosed);
        }

        return true;
    }
        /**
         * Cleans up after a repair that was not marked successful: flushes outstanding data,
         * closes the repair database and removes everything under _path. Any DBException
         * raised during cleanup is logged and terminates the process via fassertFailed.
         */
        ~RepairFileDeleter() {
            // Nothing to undo when the repair succeeded.
            if ( _success )
                 return;

            log() << "cleaning up failed repair "
                  << "db: " << _dbName << " path: " << _pathString;

            try {
                getDur().syncDataAndTruncateJournal();
                MongoFile::flushAll(true); // need both in case journaling is disabled
                {
                    // The context is scoped so that it is gone before the files are removed.
                    Client::Context tempContext( _dbName, _pathString );
                    Database::closeDatabase( _dbName, _pathString );
                }
                // NOTE(review): presumably converts boost::filesystem exceptions into a
                // DBException so the handler below sees them -- confirm macro definition.
                MONGO_ASSERT_ON_EXCEPTION( boost::filesystem::remove_all( _path ) );
            }
            catch ( DBException& e ) {
                // No exception leaves this destructor: a failed cleanup is fatal.
                error() << "RepairFileDeleter failed to cleanup: " << e;
                error() << "aborting";
                fassertFailed( 17402 );
            }
        }
Beispiel #16
0
    /** Write an op to the replica-set oplog that is already fully built.

        Takes the "local" database write lock, copies op's raw bytes into a newly allocated
        oplog record, and then records op's "ts" and "h" fields as the last written
        optime/hash (on theReplSet when it is set, and via OpTime::setLast in all cases).

        @param op complete oplog entry; its "ts" and "h" fields are read here.

        todo : make _logOpRS() call this so we don't repeat ourself?
        */
    void _logOpObjRS(const BSONObj& op) {
        Lock::DBWrite lk("local");

        const OpTime ts = op["ts"]._opTime();
        long long h = op["h"].numberLong();

        {
            const char *logns = rsoplog;
            // Lazily resolve and cache the local database and the oplog's NamespaceDetails
            // the first time through.
            if ( rsOplogDetails == 0 ) {
                Client::Context ctx(logns , dbpath);
                localDB = ctx.db();
                verify( localDB );
                rsOplogDetails = nsdetails(logns);
                massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
            }
            Client::Context ctx(logns , localDB);
            {
                // Allocate an oplog record of exactly op's size and copy the bytes in
                // through a write-intent pointer.
                int len = op.objsize();
                Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
                memcpy(getDur().writingPtr(r->data(), len), op.objdata(), len);
            }
            /* todo: now() has code to handle clock skew.  but if the skew server to server is large it will get unhappy.
                     this code (or code in now() maybe) should be improved.
                     */
            if( theReplSet ) {
                // Optimes are expected to increase strictly; a violation is only logged.
                if( !(theReplSet->lastOpTimeWritten<ts) ) {
                    log() << "replSet error possible failover clock skew issue? " << theReplSet->lastOpTimeWritten.toString() << ' ' << endl;
                }
                theReplSet->lastOpTimeWritten = ts;
                theReplSet->lastH = h;
                ctx.getClient()->setLastOp( ts );

                replset::BackgroundSync::notify();
            }
        }

        OpTime::setLast( ts );
    }
Beispiel #17
0
    /* Re-purposes this extent for namespace nsname: verifies the extent signature, marks
       the extent empty and sets up one deleted record describing its free space.
       Returns the location of that deleted record. Raises assertion 10360 when the
       signature does not match. */
    DiskLoc Extent::_reuse(const char *nsname, bool capped) {
        LOG(3) << "_reuse extent was:" << nsDiagnostic.toString() << " now:" << nsname << endl;

        const bool signatureOk = (magic == extentSignature);
        if (!signatureOk) {
            StringBuilder msg;
            msg << "bad extent signature " << integerToHex(magic)
                << " for namespace '" << nsDiagnostic.toString()
                << "' found in Extent::_reuse";
            msgasserted(10360, msg.str());
        }

        nsDiagnostic = nsname;
        markEmpty();

        DiskLoc freeLoc;
        int freeLen;
        extent_getEmptyLoc(nsname, myLoc, length, capped, freeLoc, freeLen);

        // todo: this duplicates the tail of Extent::init
        DeletedRecord* freeRec = getDur().writing(DataFileMgr::getDeletedRecord(freeLoc));
        freeRec->lengthWithHeaders() = freeLen;
        freeRec->extentOfs() = myLoc.getOfs();
        freeRec->nextDeleted().Null();

        return freeLoc;
    }
Beispiel #18
0
    /* Applies the oplog entry given in param op.
       Entries whose "ns" is empty or starts with '.' are skipped (and logged unless they
       are no-ops) and count as success.
       @param convertUpdateToUpsert forwarded to applyOperation_inlock
       @return bool success (true) or failure (false)
    */
    bool SyncTail::syncApply(const BSONObj &op, bool convertUpdateToUpsert) {
        const char *ns = op.getStringField("ns");
        verify(ns);

        const char nsFirstChar = *ns;
        if ( nsFirstChar == '\0' || nsFirstChar == '.' ) {
            // this is ugly
            // this is often a no-op
            // but can't be 100% sure
            const char opType = *op.getStringField("op");
            if( opType != 'n' ) {
                error() << "replSet skipping bad op in oplog: " << op.toString() << rsLog;
            }
            return true;
        }

        const bool isCommand = ( op["op"].valuestrsafe()[0] == 'c' );

        boost::scoped_ptr<Lock::ScopedLock> lk;
        if( !isCommand ) {
            // ordinary operation: a DB level lock is enough
            lk.reset(new Lock::DBWrite(ns)); 
        }
        else {
            // a command may need a global write lock, so conservatively take one up
            // front. suboptimal. :-(
            lk.reset(new Lock::GlobalWrite());
        }

        Client::Context ctx(ns, dbpath);
        ctx.getClient()->curop()->reset();

        // For non-initial-sync, updates are converted to upserts to suppress errors when
        // replaying oplog entries.
        const bool failed = applyOperation_inlock(op, true, convertUpdateToUpsert);
        opsAppliedStats.increment();
        getDur().commitIfNeeded();

        return !failed;
    }
Beispiel #19
0
/**
 * Perform a single insert into a collection.  Requires the insert be preprocessed and the
 * collection already has been created.
 *
 * Might fault or error, otherwise populates the result.
 */
static void singleInsert( const BatchItemRef& insertItem,
                          const BSONObj& normalInsert,
                          Collection* collection,
                          WriteOpResult* result ) {

    const string& insertNS = insertItem.getRequest()->getNS();

    Lock::assertWriteLocked( insertNS );

    try {

        // XXX - are we 100% sure that all !OK statuses do not write a document?
        StatusWith<DiskLoc> status = collection->insertDocument( normalInsert, true );

        if ( !status.isOK() ) {
            result->error = toWriteError( status.getStatus() );
        }
        else {
            logOp( "i", insertNS.c_str(), normalInsert );
            getDur().commitIfNeeded();
            result->stats.n = 1;
        }
    }
    catch ( const PageFaultException& ex ) {
        // TODO: An actual data structure that's not an exception for this
        result->fault = new PageFaultException( ex );
    }
    catch ( const DBException& ex ) {
        Status status(ex.toStatus());
        if (ErrorCodes::isInterruption(status.code())) {
            throw;
        }
        result->error = toWriteError(status);
    }

}
Beispiel #20
0
    /**
     * Checks that the requested write concern can be honoured by this host.
     *
     * Rejects with BadValue:
     *  - journal sync mode when the durability layer is not durable,
     *  - w > 1 on a config server or a non-replicated host,
     *  - a non-"majority" w mode on a host that is not in replica-set mode.
     *
     * @return Status::OK() when none of the above applies.
     */
    Status validateWriteConcern( const WriteConcernOptions& writeConcern ) {

        const bool isJournalEnabled = getDur().isDurable();
        const bool wantsJournal = ( writeConcern.syncMode == WriteConcernOptions::JOURNAL );

        if ( wantsJournal && !isJournalEnabled ) {
            return Status( ErrorCodes::BadValue,
                           "cannot use 'j' option when a host does not have journaling enabled" );
        }

        const bool isConfigServer = serverGlobalParams.configsvr;
        const repl::ReplicationCoordinator::Mode replMode =
                repl::getGlobalReplicationCoordinator()->getReplicationMode();

        // Config servers can be replicated (have an oplog), but w > 1 is still not allowed
        // on them.
        const bool singleNodeOnly =
                isConfigServer || replMode == repl::ReplicationCoordinator::modeNone;

        if ( singleNodeOnly && writeConcern.wNumNodes > 1 ) {
            string reason( "cannot use 'w' > 1 " );
            reason += ( isConfigServer ? "on a config server host" :
                                         "when a host is not replicated" );
            return Status( ErrorCodes::BadValue, reason );
        }

        const bool inReplSetMode = ( replMode == repl::ReplicationCoordinator::modeReplSet );
        const bool hasCustomMode = !writeConcern.wMode.empty() &&
                                   writeConcern.wMode != "majority";

        if ( !inReplSetMode && hasCustomMode ) {
            return Status( ErrorCodes::BadValue,
                           string( "cannot use non-majority 'w' mode " ) + writeConcern.wMode
                           + " when a host is not a member of a replica set" );
        }

        return Status::OK();
    }
Beispiel #21
0
    /**
     * Builds an index in three phases: collect and externally sort all keys, then hand the
     * sorted keys to buildBottomUpPhases2And3 (templated on the index version), and finally
     * process the set of duplicate locations gathered along the way.
     *
     * @param ns            namespace of the collection being indexed
     * @param d             NamespaceDetails of that collection
     * @param idx           the index being built; its head is nulled before building
     * @param mayInterrupt  forwarded to the key-collection, sort, build and drop-dups steps
     * @param idxNo         index number, used when collecting keys and flagging multikey
     * @return phase1.n as left by the key-collection phase
     */
    uint64_t BtreeBasedBuilder::fastBuildIndex(const char* ns, NamespaceDetails* d,
                                               IndexDetails& idx, bool mayInterrupt,
                                               int idxNo) {
        CurOp * op = cc().curop();

        // Started here so the timings logged below cover the whole build.
        Timer t;

        tlog(1) << "fastBuildIndex " << ns << ' ' << idx.info.obj().toString() << endl;

        // Duplicates are tolerated for non-unique indexes or when uniqueness is ignored;
        // they are dropped when the index asks for it or during a repair.
        bool dupsAllowed = !idx.unique() || ignoreUniqueIndex(idx);
        bool dropDups = idx.dropDups() || inDBRepair;
        BSONObj order = idx.keyPattern();

        getDur().writingDiskLoc(idx.head).Null();

        if ( logLevel > 1 ) printMemInfo( "before index start" );

        /* get and sort all the keys ----- */
        ProgressMeterHolder pm(op->setMessage("index: (1/3) external sort",
                                              "Index: (1/3) External Sort Progress",
                                              d->stats.nrecords,
                                              10));
        SortPhaseOne phase1;
        addKeysToPhaseOne(d, ns, idx, order, &phase1, d->stats.nrecords, pm.get(),
                          mayInterrupt, idxNo );
        pm.finished();

        BSONObjExternalSorter& sorter = *(phase1.sorter);

        if( phase1.multi ) {
            d->setIndexIsMultikey(ns, idxNo);
        }

        if ( logLevel > 1 ) printMemInfo( "before final sort" );
        phase1.sorter->sort( mayInterrupt );
        if ( logLevel > 1 ) printMemInfo( "after final sort" );

        // Logged at level 0 when the sort phase took more than 5 seconds, otherwise level 1.
        LOG(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles()
                                     << " files " << " in " << t.seconds() << " secs" << endl;

        set<DiskLoc> dupsToDrop;

        /* build index --- */
        // Only index versions 0 and 1 are handled; anything else trips verify(false).
        if( idx.version() == 0 )
            buildBottomUpPhases2And3<V0>(dupsAllowed,
                                         idx,
                                         sorter,
                                         dropDups,
                                         dupsToDrop,
                                         op,
                                         &phase1,
                                         pm,
                                         t,
                                         mayInterrupt);
        else if( idx.version() == 1 ) 
            buildBottomUpPhases2And3<V1>(dupsAllowed,
                                         idx,
                                         sorter,
                                         dropDups,
                                         dupsToDrop,
                                         op,
                                         &phase1,
                                         pm,
                                         t,
                                         mayInterrupt);
        else
            verify(false);

        if( dropDups ) 
            log() << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;

        // Called unconditionally, with whatever locations were collected in dupsToDrop.
        BtreeBasedBuilder::doDropDups(ns, d, dupsToDrop, mayInterrupt);

        return phase1.n;
    }
Beispiel #22
0
    /**
     * Compacts one extent: copies every (valid) record of the extent into newly allocated
     * space, feeds the keys of each copied document to the per-index sorters, and finally
     * unlinks the old extent from the collection and frees it.
     *
     * @param ns          namespace being compacted
     * @param d           NamespaceDetails of that namespace; ext must be its first extent
     *                    and must not be its last extent (asserted)
     * @param ext         location of the extent to compact
     * @param n           ordinal of the extent, used for logging only
     * @param indexSpecs  one spec per index being rebuilt
     * @param phase1      one key sorter per index being rebuilt
     * @param nidx        number of entries in indexSpecs / phase1
     * @param validate    when true, documents failing BSONObj::valid() are skipped
     * @param pf          padding factor applied to each record length (header included)
     * @param pb          padding bytes added to each record length after the factor
     * @return number of skipped (invalid) documents
     */
    unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n,
                const scoped_array<IndexSpec> &indexSpecs,
                scoped_array<SortPhaseOne>& phase1, int nidx, bool validate, 
                double pf, int pb)
    {
        log() << "compact extent #" << n << endl;
        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = ext.ext();
        e->assertOk();
        assert( e->validates() );
        unsigned skipped = 0;

        {
            // the next/prev pointers within the extent might not be in order so we first page the whole thing in 
            // sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            MAdvise adv(e, e->length, MAdvise::Sequential);
            const char *p = (const char *) e;
            for( int i = 0; i < e->length; i += 4096 ) { 
                // touch one byte per 4096-byte step; accumulating into faux keeps the read
                // from being a dead load
                faux += p[i];
            }
            int ms = t.millis();
            if( ms > 1000 ) {
                // ms is in milliseconds, so convert to seconds before reporting MB/sec
                const double megabytes = e->length/1000000.0;
                const double seconds = ms/1000.0;
                log() << "compact end paging in " << ms << "ms " << megabytes/seconds << "MB/sec" << endl;
            }
        }

        {
            log() << "compact copying records" << endl;
            unsigned totalSize = 0;
            int nrecs = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    // advance first: L now refers to the record after recOld
                    L = recOld->nextInExtent(L);
                    nrecs++;
                    BSONObj objOld(recOld);

                    if( !validate || objOld.valid() ) {
                        unsigned sz = objOld.objsize();

                        oldObjSize += sz;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = sz + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;
                        {
                            // apply padding factor and bytes, quantize, and fall back to the
                            // unpadded length when the result is too small or too large
                            lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                            lenWPadding += pb;
                            lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                            if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { 
                                lenWPadding = lenWHdr;
                            }
                        }
                        totalSize += lenWPadding;
                        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                        uassert(14024, "compact error out of space during compaction", !loc.isNull());
                        Record *recNew = loc.rec();
                        recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                        addRecordToRecListInExtent(recNew, loc);
                        memcpy(recNew->data, objOld.objdata(), sz);

                        {
                            // extract keys for all indexes we will be rebuilding
                            for( int x = 0; x < nidx; x++ ) { 
                                phase1[x].addKeys(indexSpecs[x], objOld, loc);
                            }
                        }
                    }
                    else { 
                        // only the first ten invalid documents are logged
                        if( ++skipped <= 10 )
                            log() << "compact skipping invalid object" << endl;
                    }

                    if( L.isNull() ) { 
                        // we just did the very last record from the old extent.  it's still pointed to 
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        // make the extent start at the next unprocessed record before
                        // committing / honouring the interrupt
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt(false);
                    }
                }
            } // if !L.isNull()

            // unlink the now fully-copied extent from the front of the collection's extent
            // list and free it
            assert( d->firstExtent == ext );
            assert( d->lastExtent != ext );
            DiskLoc newFirst = e->xnext;
            d->firstExtent.writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            freeExtents(ext,ext);
            getDur().commitIfNeeded();

            { 
                double op = 1.0;
                if( oldObjSize ) 
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                // second value is op truncated to two decimals; divide by 100.0 so the
                // fractional part is not lost to integer division
                log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB"
                    << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100.0
                    << endl;                    
            }
        }

        return skipped;
    }
Beispiel #23
0
    bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) { 
        //int les = d->lastExtentSize;

        // this is a big job, so might as well make things tidy before we start just to be nice.
        getDur().commitNow();

        list<DiskLoc> extents;
        for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext ) 
            extents.push_back(L);
        log() << "compact " << extents.size() << " extents" << endl;

        ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) );

        // same data, but might perform a little different after compact?
        NamespaceDetailsTransient::get(ns).clearQueryCache();

        int nidx = d->nIndexes;
        scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] );
        scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] );
        {
            NamespaceDetails::IndexIterator ii = d->ii(); 
            int x = 0;
            while( ii.more() ) { 
                BSONObjBuilder b;
                IndexDetails& idx = ii.next();
                BSONObj::iterator i(idx.info.obj());
                while( i.more() ) { 
                    BSONElement e = i.next();
                    if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) {
                        b.append(e);
                    }
                }
                BSONObj o = b.obj().getOwned();
                phase1[x].sorter.reset( new BSONObjExternalSorter( idx.idxInterface(), o.getObjectField("key") ) );
                phase1[x].sorter->hintNumObjects( d->stats.nrecords );
                indexSpecs[x++].reset(o);
            }
        }

        log() << "compact orphan deleted lists" << endl;
        for( int i = 0; i < Buckets; i++ ) { 
            d->deletedList[i].writing().Null();
        }

        // before dropping indexes, at least make sure we can allocate one extent!
        uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull());

        // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here
        log() << "compact dropping indexes" << endl;
        BSONObjBuilder b;
        if( !dropIndexes(d, ns, "*", errmsg, b, true) ) { 
            errmsg = "compact drop indexes failed";
            log() << errmsg << endl;
            return false;
        }

        getDur().commitNow();

        long long skipped = 0;
        int n = 0;
        for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) { 
            skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate, pf, pb);
            pm.hit();
        }

        if( skipped ) {
            result.append("invalidObjects", skipped);
        }

        assert( d->firstExtent.ext()->xprev.isNull() );

        // indexes will do their own progress meter?
        pm.finished();

        // build indexes
        NamespaceString s(ns);
        string si = s.db + ".system.indexes";
        for( int i = 0; i < nidx; i++ ) {
            killCurrentOp.checkForInterrupt(false);
            BSONObj info = indexSpecs[i].info;
            log() << "compact create index " << info["key"].Obj().toString() << endl;
            try {
                precalced = &phase1[i];
                theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize());
            }
            catch(...) { 
                precalced = 0;
                throw;
            }
            precalced = 0;
        }

        return true;
    }
Beispiel #24
0
    void Collection::_compactExtent(const DiskLoc diskloc, int extentNumber,
                                    MultiIndexBlock& indexesToInsertTo,
                                    const CompactOptions* compactOptions, CompactStats* stats ) {

        log() << "compact begin extent #" << extentNumber
              << " for namespace " << _ns << " " << diskloc;

        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = diskloc.ext();
        e->assertOk();
        verify( e->validates(diskloc) );

        {
            // the next/prev pointers within the extent might not be in order so we first
            // page the whole thing in sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            size_t length = e->length;

            touch_pages( reinterpret_cast<const char*>(e), length );
            int ms = t.millis();
            if( ms > 1000 )
                log() << "compact end paging in " << ms << "ms "
                      << e->length/1000000.0/t.seconds() << "MB/sec" << endl;
        }

        {
            log() << "compact copying records" << endl;
            long long datasize = 0;
            long long nrecords = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    L = getExtentManager()->getNextRecordInExtent(L);
                    BSONObj objOld = BSONObj::make(recOld);

                    if ( compactOptions->validateDocuments && !objOld.valid() ) {
                        // object is corrupt!
                        log() << "compact skipping corrupt document!";
                        stats->corruptDocuments++;
                    }
                    else {
                        unsigned docSize = objOld.objsize();

                        nrecords++;
                        oldObjSize += docSize;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = docSize + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;

                        switch( compactOptions->paddingMode ) {
                        case CompactOptions::NONE:
                            if ( details()->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) )
                                lenWPadding = details()->quantizePowerOf2AllocationSpace(lenWPadding);
                            break;
                        case CompactOptions::PRESERVE:
                            // if we are preserving the padding, the record should not change size
                            lenWPadding = recOld->lengthWithHeaders();
                            break;
                        case CompactOptions::MANUAL:
                            lenWPadding = compactOptions->computeRecordSize(lenWPadding);
                            if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                                lenWPadding = lenWHdr;
                            }
                            break;
                        }

                        CompactDocWriter writer( objOld, lenWPadding );
                        StatusWith<DiskLoc> status = _recordStore->insertRecord( &writer, 0 );
                        uassertStatusOK( status.getStatus() );
                        datasize += _recordStore->recordFor( status.getValue() )->netLength();

                        InsertDeleteOptions options;
                        options.logIfError = false;
                        options.dupsAllowed = true; // in compact we should be doing no checking

                        indexesToInsertTo.insert( objOld, status.getValue(), options );
                    }

                    if( L.isNull() ) {
                        // we just did the very last record from the old extent.  it's still pointed to
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt();
                    }
                }
            } // if !L.isNull()

            verify( details()->firstExtent() == diskloc );
            verify( details()->lastExtent() != diskloc );
            DiskLoc newFirst = e->xnext;
            details()->firstExtent().writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            getExtentManager()->freeExtents( diskloc, diskloc );

            getDur().commitIfNeeded();

            {
                double op = 1.0;
                if( oldObjSize )
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                      << " documents (" << datasize/1000000.0 << "MB)"
                      << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100;
            }
        }

    }
Beispiel #25
0
    /**
     * Compacts this collection extent by extent and rebuilds all of its indexes.
     *
     * Capped collections and collections with index builds in progress are rejected with
     * BadValue.  An index whose key pattern does not pass validateKeyPattern() aborts the
     * operation with CannotCreateIndex before anything has been modified.
     *
     * @param compactOptions  forwarded to _compactExtent() for every extent
     * @return the statistics gathered by _compactExtent(), or the first failing status
     */
    StatusWith<CompactStats> Collection::compact( const CompactOptions* compactOptions ) {

        if ( isCapped() ) {
            return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                             "cannot compact capped collection" );
        }

        if ( _indexCatalog.numIndexesInProgress() ) {
            return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                             "cannot compact when indexes in progress" );
        }

        NamespaceDetails* d = details();

        // this is a big job, so tidy up first if a commit is due
        getDur().commitIfNeeded();

        // remember the extent chain as it looks before anything is moved
        list<DiskLoc> extents;
        {
            DiskLoc cur = d->firstExtent();
            while ( !cur.isNull() ) {
                extents.push_back( cur );
                cur = cur.ext()->xnext;
            }
        }
        log() << "compact " << extents.size() << " extents" << endl;

        // same data, but it might perform a little differently once compacted
        _infoCache.reset();

        // collect the adjusted spec of every index, refusing to continue on a bad key pattern
        vector<BSONObj> indexSpecs;
        for ( IndexCatalog::IndexIterator ii( _indexCatalog.getIndexIterator( false ) );
              ii.more(); ) {
            IndexDescriptor* descriptor = ii.next();

            const BSONObj spec = _compactAdjustIndexSpec(descriptor->infoObj());
            const BSONObj key = spec.getObjectField("key");
            const Status keyStatus = validateKeyPattern(key);
            if (keyStatus.isOK()) {
                indexSpecs.push_back(spec);
                continue;
            }

            return StatusWith<CompactStats>(
                ErrorCodes::CannotCreateIndex,
                str::stream() << "Cannot rebuild index " << spec << ": "
                              << keyStatus.reason()
                              << " For more info see"
                              << " http://dochub.mongodb.org/core/index-validation");
        }

        log() << "compact orphan deleted lists" << endl;
        d->orphanDeletedList();

        // start over from scratch with our extent sizing and growth
        d->setLastExtentSize( 0 );

        // before dropping indexes, at least make sure we can allocate one extent!
        // this allocates an extent and adds it to the free list;
        // if it cannot, it throws
        increaseStorageSize( _details->lastExtentSize(), true );

        // dropping the indexes also invalidates all clientcursors for the namespace,
        // which is important and wanted here
        log() << "compact dropping indexes" << endl;
        const Status dropStatus = _indexCatalog.dropAllIndexes( true );
        if ( !dropStatus.isOK() ) {
            return StatusWith<CompactStats>( dropStatus );
        }

        getDur().commitIfNeeded();
        killCurrentOp.checkForInterrupt();

        CompactStats stats;

        MultiIndexBlock multiIndexBlock( this );
        const Status initStatus = multiIndexBlock.init( indexSpecs );
        if ( !initStatus.isOK() ) {
            return StatusWith<CompactStats>( initStatus );
        }

        // zero the data size and record count of this namespace;
        // they are tallied up again as each extent is compacted
        d->setStats( 0, 0 );

        ProgressMeterHolder pm(cc().curop()->setMessage("compact extent",
                                                        "Extent Compacting Progress",
                                                        extents.size()));

        int extentNumber = 0;
        for ( list<DiskLoc>::iterator it = extents.begin(); it != extents.end();
              ++it, ++extentNumber ) {
            _compactExtent(*it, extentNumber, multiIndexBlock, compactOptions, &stats );
            pm.hit();
        }

        verify( d->firstExtent().ext()->xprev.isNull() );

        // the index commit below reports its own progress (if at all)
        pm.finished();

        log() << "starting index commits";

        const Status commitStatus = multiIndexBlock.commit();
        if ( !commitStatus.isOK() ) {
            return StatusWith<CompactStats>( commitStatus );
        }

        return StatusWith<CompactStats>( stats );
    }
Beispiel #26
0
        /**
         * Batch callback of the cloner: consumes every document left in the current batch
         * of 'i' and inserts it into to_collection.
         *
         * Documents that fail BSONObj::valid() are reported through out() and skipped.
         * When isindex is set, documents are passed through fixindex() and appended to
         * storedForLater instead of being inserted.  A UserException thrown while
         * inserting is logged and the loop moves on to the next document.
         *
         * NOTE(review): the members used here (context, n, lastLog, saveLast, isindex,
         * logForRepl, ...) are declared outside this chunk.
         */
        void operator()( DBClientCursorBatchIterator &i ) {
            // NOTE(review): presumably 'true' requests the write lock - confirm against mongolock
            mongolock l( true );
            if ( context ) {
                context->relocked();
            }

            while( i.moreInCurrentBatch() ) {
                // housekeeping whenever n % 128 == 127.  n only advances for valid
                // documents, so this branch runs again for each corrupt document that
                // follows while n stays at such a value
                if ( n % 128 == 127 /*yield some*/ ) {
                    time_t now = time(0);
                    if( now - lastLog >= 60 ) { 
                        // report progress (not on the first pass, when lastLog is still 0)
                        if( lastLog )
                            log() << "clone " << to_collection << ' ' << n << endl;
                        lastLog = now;
                    }
                    mayInterrupt( _mayBeInterrupted );
                    // scoped object, destroyed at the end of this if block
                    // NOTE(review): presumably releases the lock temporarily when _mayYield is set
                    dbtempreleaseif t( _mayYield );
                }

                BSONObj tmp = i.nextSafe();

                /* assure object is valid.  note this will slow us down a little. */
                if ( !tmp.valid() ) {
                    stringstream ss;
                    ss << "Cloner: skipping corrupt object from " << from_collection;
                    BSONElement e = tmp.firstElement();
                    try {
                        // include the first element in the message only if it validates
                        e.validate();
                        ss << " firstElement: " << e;
                    }
                    catch( ... ) {
                        ss << " firstElement corrupt";
                    }
                    out() << ss.str() << endl;
                    continue;
                }

                ++n;

                BSONObj js = tmp;
                if ( isindex ) {
                    // index specs are not inserted here; they are collected in storedForLater
                    verify( strstr(from_collection, "system.indexes") );
                    js = fixindex(tmp);
                    storedForLater->push_back( js.getOwned() );
                    continue;
                }

                try {
                    theDataFileMgr.insertWithObjMod(to_collection, js);
                    if ( logForRepl )
                        logOp("i", to_collection, js);

                    getDur().commitIfNeeded();
                }
                catch( UserException& e ) {
                    // a failed insert is only logged; cloning continues with the next document
                    log() << "warning: exception cloning object in " << from_collection << ' ' << e.what() << " obj:" << js.toString() << '\n';
                }

                // occasional progress report, at most once per 60 seconds
                RARELY if ( time( 0 ) - saveLast > 60 ) {
                    log() << n << " objects cloned so far from collection " << from_collection << endl;
                    saveLast = time( 0 );
                }
            }
        }
Beispiel #27
0
    /* Deletes the documents of 'ns' that match 'pattern' and returns how many were deleted.

       ns:          namespace, e.g. <database>.<collection>
       pattern:     the "where" clause / criteria
       justOneOrig: stop after 1 match
       logop:       call logOp("d", ...) with the _id of every deleted document
       god:         allow access to system namespaces, and don't yield
       rs:          if non-null, goingToDelete() is called with each document before it is deleted

       Returns 0 when the namespace does not exist.  uasserts 12050 / 10100 (unless god) for
       system namespaces and names containing '$', and 10101 for capped collections.
    */
    long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
        if( !god ) {
            if ( strstr(ns, ".system.") ) {
                /* note a delete from system.indexes would corrupt the db
                if done here, as there are pointers into those objects in
                NamespaceDetails.
                */
                uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
            }
            if ( strchr( ns , '$' ) ) {
                // log first, then fail: the uassert condition is false whenever we get here
                log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
                uassert( 10100 ,  "cannot delete from collection with reserved $ in name", strchr(ns, '$') == 0 );
            }
        }

        {
            NamespaceDetails *d = nsdetails( ns );
            if ( ! d )
                return 0;
            uassert( 10101 ,  "can't remove from a capped collection" , ! d->capped );
        }

        long long nDeleted = 0;

        shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern, BSONObj(), false, 0 );

        if( !creal->ok() )
            return nDeleted;

        shared_ptr< Cursor > cPtr = creal;
        auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
        cc->setDoingDeletes( true );

        // remembered for the sanity check after the loop
        CursorId id = cc->cursorid();

        bool justOne = justOneOrig;
        // no yielding in god mode, nor when the matcher reports the query as atomic
        bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic());

        do {
            // TODO: we can generalize this I believe
            //
            bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) || pattern.isEmpty() || isSimpleIdQuery( pattern );
            if ( ! willNeedRecord ) {
                // TODO: this is a total hack right now
                // check if the index full encompasses query
                
                if ( pattern.nFields() == 1 && 
                     str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) )
                    willNeedRecord = true;
            }
            
            if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) {
                cc.release(); // has already been deleted elsewhere
                // TODO should we assert or something?
                break;
            }
            if ( !cc->ok() ) {
                break; // if we yielded, could have hit the end
            }

            // this way we can avoid calling updateLocation() every time (expensive)
            // as well as some other nuances handled
            cc->setDoingDeletes( true );

            // capture the current position before the cursor is advanced past it
            DiskLoc rloc = cc->currLoc();
            BSONObj key = cc->currKey();

            bool match = creal->currentMatches();
            bool dup = cc->c()->getsetdup(rloc);

            // when the cursor cannot advance, treat this iteration as the last one
            if ( ! cc->advance() )
                justOne = true;

            if ( ! match )
                continue;

            assert( !dup ); // can't be a dup, we deleted it!

            if ( !justOne ) {
                /* NOTE: this is SLOW.  this is not good, noteLocation() was designed to be called across getMore
                    blocks.  here we might call millions of times which would be bad.
                    */
                cc->c()->prepareToTouchEarlierIterate();
            }

            if ( logop ) {
                // only the _id of the document goes into the logged operation
                BSONElement e;
                if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
                    BSONObjBuilder b;
                    b.append( e );
                    bool replJustOne = true;
                    logOp( "d", ns, b.done(), 0, &replJustOne );
                }
                else {
                    problem() << "deleted object without id, not logging" << endl;
                }
            }

            if ( rs )
                rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );

            theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
            nDeleted++;
            if ( justOne ) {
                break;
            }
            cc->c()->recoverFromTouchingEarlierIterate();
         
            if( !god ) 
                getDur().commitIfNeeded();

            if( debug && god && nDeleted == 100 ) 
                log() << "warning high number of deletes with god=true which could use significant memory" << endl;
        }
        while ( cc->ok() );

        // sanity check: we still hold the cursor but it can no longer be found by its id
        if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
            // TODO: remove this and the id declaration above if this doesn't trigger
            //       if it does, then i'm very confused (ERH 06/2011)
            error() << "this should be impossible" << endl;
            printStackTrace();
            cc.release();
        }

        return nDeleted;
    }
Beispiel #28
0
    long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max , bool yield , bool maxInclusive , RemoveCallback * callback, bool fromMigrate ) {
        BSONObj keya , keyb;
        BSONObj minClean = toKeyFormat( min , keya );
        BSONObj maxClean = toKeyFormat( max , keyb );
        verify( keya == keyb );

        Client::Context ctx(ns);

        shared_ptr<Cursor> c;
        auto_ptr<ClientCursor> cc;
        {
            NamespaceDetails* nsd = nsdetails( ns.c_str() );
            if ( ! nsd )
                return 0;
            
            int ii = nsd->findIndexByKeyPattern( keya );
            verify( ii >= 0 );
            
            IndexDetails& i = nsd->idx( ii );
            
            c.reset( BtreeCursor::make( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
            cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
            cc->setDoingDeletes( true );
        }

        long long num = 0;

        while ( cc->ok() ) {

            if ( yield && ! cc->yieldSometimes( ClientCursor::WillNeed) ) {
                // cursor got finished by someone else, so we're done
                cc.release(); // if the collection/db is dropped, cc may be deleted
                break;
            }

            if ( ! cc->ok() )
                break;

            DiskLoc rloc = cc->currLoc();

            if ( callback )
                callback->goingToDelete( cc->current() );

            cc->advance();
            // SERVER-5198 Additional advancement is unnecessary for a single btree cursor, and see
            // SERVER-5725.
            c->prepareToTouchEarlierIterate();

            logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() , 0 , 0 , fromMigrate );
            theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc);
            num++;

            c->recoverFromTouchingEarlierIterate();

            getDur().commitIfNeeded();


        }

        return num;
    }
    /**
     * Creates the capped collection <db>.<shortTo> and copies the documents of
     * <db>.<shortFrom> into it with a forward collection scan.
     *
     * Documents at the start of the scan are skipped while the source's data size exceeds
     * what is estimated to fit into the new collection.
     *
     * @param db                 database holding both collections
     * @param shortFrom          source collection name without the database prefix
     * @param shortTo            destination collection name without the database prefix
     * @param size               "size" option of the new capped collection
     * @param temp               when true the new collection is created with temp: true
     * @param logForReplication  forwarded to userCreateNS(); also makes every insert call logOp()
     * @return NamespaceNotFound if the source is missing, NamespaceExists if the destination
     *         already exists, the status of userCreateNS() if creation fails, InternalError
     *         if the scan dies or errors, OK otherwise
     */
    Status cloneCollectionAsCapped( Database* db,
                                    const string& shortFrom,
                                    const string& shortTo,
                                    double size,
                                    bool temp,
                                    bool logForReplication ) {

        string fromNs = db->name() + "." + shortFrom;
        string toNs = db->name() + "." + shortTo;

        Collection* fromCollection = db->getCollection( fromNs );
        if ( !fromCollection ) {
            return Status( ErrorCodes::NamespaceNotFound,
                           str::stream() << "source collection " << fromNs <<  " does not exist" );
        }

        if ( db->getCollection( toNs ) ) {
            return Status( ErrorCodes::NamespaceExists, "to collection already exists" );
        }

        // create the destination; the context only lives for the duration of this block
        {
            Client::Context ctx( toNs );

            BSONObjBuilder options;
            options.appendBool( "capped", true );
            options.append( "size", size );
            if ( temp ) {
                options.appendBool( "temp", true );
            }

            Status created = userCreateNS( ctx.db(), toNs, options.done(), logForReplication );
            if ( !created.isOK() ) {
                return created;
            }
        }

        Collection* toCollection = db->getCollection( toNs );
        invariant( toCollection ); // we created above

        // amount of data to ignore because it would not fit anyway.
        // datasize and extentSize can't be compared exactly, so 'size' gets some padding
        long long excessSize =
            static_cast<long long>( fromCollection->dataSize() -
                                    ( toCollection->storageSize() * 2 ) );

        scoped_ptr<Runner> runner( InternalPlanner::collectionScan(fromNs,
                                                                   fromCollection,
                                                                   InternalPlanner::FORWARD ) );

        for ( ;; ) {
            BSONObj obj;
            Runner::RunnerState state = runner->getNext(&obj, NULL);

            if ( state == Runner::RUNNER_EOF ) {
                return Status::OK();
            }

            if ( state == Runner::RUNNER_DEAD ) {
                db->dropCollection( toNs );
                return Status( ErrorCodes::InternalError, "runner turned dead while iterating" );
            }

            if ( state == Runner::RUNNER_ERROR ) {
                return Status( ErrorCodes::InternalError, "runner error while iterating" );
            }

            if ( state != Runner::RUNNER_ADVANCED ) {
                // any other state is ignored and the scan is simply asked again
                continue;
            }

            if ( excessSize > 0 ) {
                // still over budget: drop this document from the copy
                excessSize -= ( 4 * obj.objsize() ); // 4x is for padding, power of 2, etc...
                continue;
            }

            toCollection->insertDocument( obj, true );
            if ( logForReplication ) {
                logOp( "i", toNs.c_str(), obj );
            }
            getDur().commitIfNeeded();
        }

        invariant( false ); // unreachable
    }
Beispiel #30
0
    /**
     * Removes the newest documents of this capped collection, one at a time, until 'end'
     * is the newest document; when 'inclusive' is set, 'end' itself is removed as well.
     *
     * uasserts 13415 rather than removing the collection's only remaining record.
     *
     * @param ns         namespace of this collection
     * @param end        location of the document to truncate after
     * @param inclusive  whether 'end' is removed too
     */
    void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
        DEV verify( this == nsdetails(ns) );
        verify( cappedLastDelRecLastExtent().isValid() );

        // set once 'end' has been reached with inclusive == true; that document is still
        // removed in the same pass and the loop stops afterwards
        bool foundLast = false;
        while( !foundLast ) {
            getDur().commitIfNeeded();

            // 'curr' is the newest document in the collection
            DiskLoc curr = theCapExtent()->lastRecord;
            verify( !curr.isNull() );
            if ( curr == end ) {
                if ( !inclusive ) {
                    // 'end' stays, nothing more to remove
                    break;
                }
                foundLast = true;
            }

            // TODO this algorithm cannot produce an empty collection; calling
            // emptyCappedCollection() would be an alternative to asserting here.
            uassert( 13415, "emptying the collection is not allowed", _stats.nrecords > 1 );

            // delete the newest record and coalesce the freed space with the existing
            // deleted records
            theDataFileMgr.deleteRecord(this, ns, curr.rec(), curr, true);
            compact();

            if ( !capLooped() ) {
                // We have not yet had to remove documents to make room for others: space is
                // still being handed out from fresh extents rather than reused.

                if ( theCapExtent()->lastRecord.isNull() ) {
                    // That was the last record of capExtent and capExtent may not be empty,
                    // so step back to the previous extent.
                    verify( !theCapExtent()->xprev.isNull() );
                    // NOTE since the last document was not deleted and capLooped() is false,
                    // capExtent is not the first extent, hence xprev is nonnull.
                    _capExtent.writing() = theCapExtent()->xprev;
                    theCapExtent()->assertOk();

                    // update cappedLastDelRecLastExtent()
                    cappedTruncateLastDelUpdate();
                }
            }
            else if ( curr == _capFirstNewRecord ) {
                // capLooped() is true and the record just deleted was capFirstNewRecord,
                // i.e. the last record on the fresh side of capExtent.
                // NOTE curr, and possibly capFirstNewRecord, may refer to invalid data by
                // now; only the references themselves are compared.

                // Walk back to the first nonempty extent before the initial capExtent.  One
                // must exist because the last document of the collection was not deleted.
                // If every other extent is empty the walk ends on the initial capExtent
                // again, whose records then all count as being on the fresh side.
                DiskLoc newCapExtent = _capExtent;
                do {
                    // previous extent, wrapping from the first extent round to the last
                    if ( newCapExtent == _firstExtent ) {
                        newCapExtent = _lastExtent;
                    }
                    else {
                        newCapExtent = newCapExtent.ext()->xprev;
                    }
                    newCapExtent.ext()->assertOk();
                }
                while ( newCapExtent.ext()->firstRecord.isNull() );
                _capExtent.writing() = newCapExtent;

                // Put every document of the new capExtent on its fresh side by pointing
                // capFirstNewRecord at the extent's first document.
                _capFirstNewRecord.writing() = theCapExtent()->firstRecord;

                // update cappedLastDelRecLastExtent()
                cappedTruncateLastDelUpdate();
            }
        }
    }