StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( OperationContext* txn,
                                                         const DocWriter* doc,
                                                         bool enforceQuota ) {
        int docSize = doc->documentSize();
        if ( docSize < 4 ) {
            return StatusWith<DiskLoc>( ErrorCodes::InvalidLength,
                                        "record has to be >= 4 bytes" );
        }
        int lenWHdr = docSize + Record::HeaderSize;
        if ( doc->addPadding() )
            lenWHdr = getRecordAllocationSize( lenWHdr );

        StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota );
        if ( !loc.isOK() )
            return loc;

        Record *r = recordFor( loc.getValue() );
        fassert( 17319, r->lengthWithHeaders() >= lenWHdr );

        r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) );
        doc->writeDocument( r->data() );

        _addRecordToRecListInExtent(txn, r, loc.getValue());

        _details->incrementStats( txn, r->netLength(), 1 );

        _paddingFits( txn );

        return loc;
    }
Exemple #2
0
 // bypass standard alloc/insert routines to use the extent we want.
 static DiskLoc insert( const DiskLoc& ext, int i ) {
     BSONObjBuilder b;
     b.append( "a", i );
     BSONObj o = b.done();
     int len = o.objsize();
     Extent *e = ext.ext();
     e = getDur().writing(e);
     int ofs;
     if ( e->lastRecord.isNull() )
         ofs = ext.getOfs() + ( e->_extentData - (char *)e );
     else
         ofs = e->lastRecord.getOfs() + e->lastRecord.rec()->lengthWithHeaders();
     DiskLoc dl( ext.a(), ofs );
     Record *r = dl.rec();
     r = (Record*) getDur().writingPtr(r, Record::HeaderSize + len);
     r->lengthWithHeaders() = Record::HeaderSize + len;
     r->extentOfs() = e->myLoc.getOfs();
     r->nextOfs() = DiskLoc::NullOfs;
     r->prevOfs() = e->lastRecord.isNull() ? DiskLoc::NullOfs : e->lastRecord.getOfs();
     memcpy( r->data(), o.objdata(), len );
     if ( e->firstRecord.isNull() )
         e->firstRecord = dl;
     else
         getDur().writingInt(e->lastRecord.rec()->nextOfs()) = ofs;
     e->lastRecord = dl;
     return dl;
 }
RelationSchema::RelationSchema(const Record& record)
{
   // De-serialize relation meta data
   istringstream in(string(record.data(), record.size()), ios::binary);
   util::readBinary(sid, in);
   util::readBinary(name, in);

   // De-serialize its attributes
   size_t len;
   util::readBinary(len, in);
   attributes.resize(len);
   for(auto& iter : attributes) {
      util::readBinary(iter.name, in);
      util::readBinary(iter.type, in);
      util::readBinary(iter.notNull, in);
      util::readBinary(iter.offset, in);
   }

   // De-serialize its indexes
   util::readBinary(len, in);
   indexes.resize(len);
   for(auto& iter : indexes) {
      util::readBinary(iter.sid, in);
      util::readBinary(iter.indexedColumns, in);
      util::readBinary(iter.type, in);
      util::readBinary(iter.unique, in);
   }

   assert(in.good());
}
Exemple #4
0
    /** write an op to the oplog that is already built.
        todo : make _logOpRS() call this so we don't repeat ourself?
        */
    void _logOpObjRS(const BSONObj& op) {
        Lock::DBWrite lk("local");

        const OpTime ts = op["ts"]._opTime();
        long long h = op["h"].numberLong();

        {
            const char *logns = rsoplog;
            if ( rsOplogDetails == 0 ) {
                Client::Context ctx( logns , dbpath, false);
                localDB = ctx.db();
                verify( localDB );
                rsOplogDetails = nsdetails(logns);
                massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
            }
            Client::Context ctx( logns , localDB, false );
            {
                int len = op.objsize();
                Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
                memcpy(getDur().writingPtr(r->data(), len), op.objdata(), len);
            }
            /* todo: now() has code to handle clock skew.  but if the skew server to server is large it will get unhappy.
                     this code (or code in now() maybe) should be improved.
                     */
            if( theReplSet ) {
                if( !(theReplSet->lastOpTimeWritten<ts) ) {
                    log() << "replSet error possible failover clock skew issue? " << theReplSet->lastOpTimeWritten.toString() << ' ' << endl;
                }
                theReplSet->lastOpTimeWritten = ts;
                theReplSet->lastH = h;
                ctx.getClient()->setLastOp( ts );
            }
        }
    }
    StatusWith<DiskLoc> RecordStoreV1Base::_insertRecord( OperationContext* txn,
                                                          const char* data,
                                                          int len,
                                                          bool enforceQuota ) {

        int lenWHdr = getRecordAllocationSize( len + Record::HeaderSize );
        fassert( 17208, lenWHdr >= ( len + Record::HeaderSize ) );

        StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota );
        if ( !loc.isOK() )
            return loc;

        Record *r = recordFor( loc.getValue() );
        fassert( 17210, r->lengthWithHeaders() >= lenWHdr );

        // copy the data
        r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) );
        memcpy( r->data(), data, len );

        _addRecordToRecListInExtent(txn, r, loc.getValue());

        _details->incrementStats( txn, r->netLength(), 1 );

        return loc;
    }
    void RecordStoreV1Base::deleteRecord( TransactionExperiment* txn, const DiskLoc& dl ) {

        Record* todelete = recordFor( dl );

        /* remove ourself from the record next/prev chain */
        {
            if ( todelete->prevOfs() != DiskLoc::NullOfs ) {
                DiskLoc prev = getPrevRecordInExtent( dl );
                Record* prevRecord = recordFor( prev );
                txn->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs();
            }

            if ( todelete->nextOfs() != DiskLoc::NullOfs ) {
                DiskLoc next = getNextRecord( dl );
                Record* nextRecord = recordFor( next );
                txn->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs();
            }
        }

        /* remove ourself from extent pointers */
        {
            Extent *e = txn->writing( _getExtent( _getExtentLocForRecord( dl ) ) );
            if ( e->firstRecord == dl ) {
                if ( todelete->nextOfs() == DiskLoc::NullOfs )
                    e->firstRecord.Null();
                else
                    e->firstRecord.set(dl.a(), todelete->nextOfs() );
            }
            if ( e->lastRecord == dl ) {
                if ( todelete->prevOfs() == DiskLoc::NullOfs )
                    e->lastRecord.Null();
                else
                    e->lastRecord.set(dl.a(), todelete->prevOfs() );
            }
        }

        /* add to the free list */
        {
            _details->incrementStats( txn, -1 * todelete->netLength(), -1 );

            if ( _isSystemIndexes ) {
                /* temp: if in system.indexes, don't reuse, and zero out: we want to be
                   careful until validated more, as IndexDetails has pointers
                   to this disk location.  so an incorrectly done remove would cause
                   a lot of problems.
                */
                memset( txn->writingPtr(todelete, todelete->lengthWithHeaders() ),
                        0, todelete->lengthWithHeaders() );
            }
            else {
                DEV {
                    unsigned long long *p = reinterpret_cast<unsigned long long *>( todelete->data() );
                    *txn->writing(p) = 0;
                }
                addDeletedRec(txn, dl);
            }
        }

    }
vector<harriet::Value> RelationSchema::recordToTuple(const Record& record) const
{
   vector<harriet::Value> result;
   result.reserve(attributes.size());
   for(auto& attribute : attributes)
      result.emplace_back(harriet::Value::createFromRecord(attribute.type, record.data()+attribute.offset));
   return result;
}
Exemple #8
0
    void profile( const Client& c , CurOp& currentOp ) {
        verify( Lock::somethingWriteLocked() );

        Database *db = c.database();
        DEV verify( db );
        const char *ns = db->profileName.c_str();
        
        // build object
        profileBufBuilder.reset();
        BSONObjBuilder b(profileBufBuilder);

        const bool isQueryObjTooBig = !currentOp.debug().append(currentOp, b,
                MAX_PROFILE_DOC_SIZE_BYTES);

        b.appendDate("ts", jsTime());
        b.append("client", c.clientAddress());

        if (c.getAuthenticationInfo()) {
            b.append("user", c.getAuthenticationInfo()->getUser(nsToDatabase(ns)));
        }

        BSONObj p = b.done();

        if (static_cast<size_t>(p.objsize()) > MAX_PROFILE_DOC_SIZE_BYTES || isQueryObjTooBig) {
            string small = p.toString(/*isArray*/false, /*full*/false);

            warning() << "can't add full line to system.profile: " << small << endl;

            // rebuild with limited info
            BSONObjBuilder b(profileBufBuilder);
            b.appendDate("ts", jsTime());
            b.append("client", c.clientAddress() );
            if ( c.getAuthenticationInfo() )
                b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );

            b.append("err", "profile line too large (max is 100KB)");

            // should be much smaller but if not don't break anything
            if (small.size() < MAX_PROFILE_DOC_SIZE_BYTES){
                b.append("abbreviated", small);
            }

            p = b.done();
        }

        // write: not replicated
        // get or create the profiling collection
        NamespaceDetails *details = getOrCreateProfileCollection(db);
        if (details) {
            int len = p.objsize();
            Record *r = theDataFileMgr.fast_oplog_insert(details, ns, len);
            memcpy(getDur().writingPtr(r->data(), len), p.objdata(), len);
        }
    }
Exemple #9
0
    StatusWith<DiskLoc> Collection::insertDocument( const BSONObj& docToInsert, bool enforceQuota ) {
        int lenWHdr = _details->getRecordAllocationSize( docToInsert.objsize() + Record::HeaderSize );
        fassert( 17208, lenWHdr >= ( docToInsert.objsize() + Record::HeaderSize ) );

        if ( _details->isCapped() ) {
            // TOOD: old god not done
            Status ret = _indexCatalog.checkNoIndexConflicts( docToInsert );
            uassert(17209, "duplicate key insert for unique index of capped collection", ret.isOK() );
        }

        // TODO: for now, capped logic lives inside NamespaceDetails, which is hidden
        //       under the RecordStore, this feels broken since that should be a
        //       collection access method probably
        StatusWith<DiskLoc> loc = _recordStore.allocRecord( lenWHdr,
                                                            enforceQuota ? largestFileNumberInQuota() : 0 );
        if ( !loc.isOK() )
            return loc;

        Record *r = loc.getValue().rec();
        fassert( 17210, r->lengthWithHeaders() >= lenWHdr );

        // copy the data
        r = reinterpret_cast<Record*>( getDur().writingPtr(r, lenWHdr) );
        memcpy( r->data(), docToInsert.objdata(), docToInsert.objsize() );

        addRecordToRecListInExtent(r, loc.getValue()); // XXX move down into record store

        _details->incrementStats( r->netLength(), 1 );

        // TOOD: old god not done
        _infoCache.notifyOfWriteOp();

        try {
            _indexCatalog.indexRecord( docToInsert, loc.getValue() );
        }
        catch( AssertionException& e ) {
            if ( _details->isCapped() ) {
                return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                            str::stream() << "unexpected index insertion failure on"
                                            << " capped collection" << e.toString()
                                            << " - collection and its index will not match" );
            }

            // normal case -- we can roll back
            deleteDocument( loc.getValue(), false, true, NULL );
            throw;
        }

        // TODO: this is what the old code did, but is it correct?
        _details->paddingFits();

        return loc;

    }
    StatusWith<DiskLoc> RecordStoreV1Base::updateRecord( OperationContext* txn,
                                                         const DiskLoc& oldLocation,
                                                         const char* data,
                                                         int dataSize,
                                                         bool enforceQuota,
                                                         UpdateMoveNotifier* notifier ) {
        Record* oldRecord = recordFor( oldLocation );
        if ( oldRecord->netLength() >= dataSize ) {
            // we fit
            _paddingFits( txn );
            memcpy( txn->recoveryUnit()->writingPtr( oldRecord->data(), dataSize ), data, dataSize );
            return StatusWith<DiskLoc>( oldLocation );
        }

        if ( isCapped() )
            return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                        "failing update: objects in a capped ns cannot grow",
                                        10003 );

        // we have to move

        _paddingTooSmall( txn );

        StatusWith<DiskLoc> newLocation = _insertRecord( txn, data, dataSize, enforceQuota );
        if ( !newLocation.isOK() )
            return newLocation;

        // insert worked, so we delete old record
        if ( notifier ) {
            Status moveStatus = notifier->recordStoreGoingToMove( txn,
                                                                  oldLocation,
                                                                  oldRecord->data(),
                                                                  oldRecord->netLength() );
            if ( !moveStatus.isOK() )
                return StatusWith<DiskLoc>( moveStatus );
        }

        deleteRecord( txn, oldLocation );

        return newLocation;
    }
Exemple #11
0
    static void _profile(const Client& c, CurOp& currentOp, BufBuilder& profileBufBuilder) {
        Database *db = c.database();
        DEV verify( db );
        const char *ns = db->profileName.c_str();
        
        // build object
        BSONObjBuilder b(profileBufBuilder);
        b.appendDate("ts", jsTime());
        currentOp.debug().append( currentOp , b );

        b.append("client", c.clientAddress() );

        if ( c.getAuthenticationInfo() )
            b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );

        BSONObj p = b.done();

        if (p.objsize() > 100*1024){
            string small = p.toString(/*isArray*/false, /*full*/false);

            warning() << "can't add full line to system.profile: " << small;

            // rebuild with limited info
            BSONObjBuilder b(profileBufBuilder);
            b.appendDate("ts", jsTime());
            b.append("client", c.clientAddress() );
            if ( c.getAuthenticationInfo() )
                b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );

            b.append("err", "profile line too large (max is 100KB)");
            if (small.size() < 100*1024){ // should be much smaller but if not don't break anything
                b.append("abbreviated", small);
            }

            p = b.done();
        }

        // write: not replicated
        // get or create the profiling collection
        NamespaceDetails *details = getOrCreateProfileCollection(db);
        if (details) {
            int len = p.objsize();
            Record *r = theDataFileMgr.fast_oplog_insert(details, ns, len);
            memcpy(getDur().writingPtr(r->data(), len), p.objdata(), len);
        }
    }
Exemple #12
0
        IndexCatalog::IndexBuildBlock* halfAddIndex(const std::string& key) {
            string name = key + "_1";
            BSONObj indexInfo = BSON( "v" << 1 <<
                                      "key" << BSON( key << 1 ) <<
                                      "ns" << _ns <<
                                      "name" << name );
            int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
            const char* systemIndexes = "unittests.system.indexes";
            DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                          nsdetails( systemIndexes ),
                                                          lenWHdr,
                                                          false );
            Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                                 lenWHdr ) );
            memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
            addRecordToRecListInExtent( infoRecord, infoLoc );

            return new IndexCatalog::IndexBuildBlock( _ctx.ctx().db()->getCollection( _ns )->getIndexCatalog(), name, infoLoc );
        }
    Status RecordStoreV1Base::updateWithDamages( OperationContext* txn,
                                                 const DiskLoc& loc,
                                                 const char* damageSource,
                                                 const mutablebson::DamageVector& damages ) {
        _paddingFits( txn );

        Record* rec = recordFor( loc );
        char* root = rec->data();

        // All updates were in place. Apply them via durability and writing pointer.
        mutablebson::DamageVector::const_iterator where = damages.begin();
        const mutablebson::DamageVector::const_iterator end = damages.end();
        for( ; where != end; ++where ) {
            const char* sourcePtr = damageSource + where->sourceOffset;
            void* targetPtr = txn->recoveryUnit()->writingPtr(root + where->targetOffset, where->size);
            std::memcpy(targetPtr, sourcePtr, where->size);
        }

        return Status::OK();
    }
Exemple #14
0
 /** @return IndexDetails for a new index on a:1, with the info field populated. */
 IndexDetails& addIndexWithInfo() {
     BSONObj indexInfo = BSON( "v" << 1 <<
                               "key" << BSON( "a" << 1 ) <<
                               "ns" << _ns <<
                               "name" << "a_1" );
     int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
     const char* systemIndexes = "unittests.system.indexes";
     DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                   nsdetails( systemIndexes ),
                                                   lenWHdr,
                                                   false );
     Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                          lenWHdr ) );
     memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
     addRecordToRecListInExtent( infoRecord, infoLoc );
     IndexDetails& id = nsdetails( _ns )->getNextIndexDetails( _ns );
     nsdetails( _ns )->addIndex();
     id.info.writing() = infoLoc;
     return id;
 }
Exemple #15
0
        /** @return IndexDetails for a new index on a:1, with the info field populated. */
        IndexDescriptor* addIndexWithInfo() {
            BSONObj indexInfo = BSON( "v" << 1 <<
                                      "key" << BSON( "a" << 1 ) <<
                                      "ns" << _ns <<
                                      "name" << "a_1" );
            int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
            const char* systemIndexes = "unittests.system.indexes";
            DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                          nsdetails( systemIndexes ),
                                                          lenWHdr,
                                                          false );
            Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                                 lenWHdr ) );
            memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
            addRecordToRecListInExtent( infoRecord, infoLoc );

            IndexCatalog::IndexBuildBlock blk( collection()->getIndexCatalog(), "a_1", infoLoc );
            blk.success();

            return collection()->getIndexCatalog()->findIndexByName( "a_1" );
        }
    StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( TransactionExperiment* txn,
                                                         const DocWriter* doc,
                                                         int quotaMax ) {
        int lenWHdr = doc->documentSize() + Record::HeaderSize;
        if ( doc->addPadding() )
            lenWHdr = getRecordAllocationSize( lenWHdr );

        StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, quotaMax );
        if ( !loc.isOK() )
            return loc;

        Record *r = recordFor( loc.getValue() );
        fassert( 17319, r->lengthWithHeaders() >= lenWHdr );

        r = reinterpret_cast<Record*>( txn->writingPtr(r, lenWHdr) );
        doc->writeDocument( r->data() );

        _addRecordToRecListInExtent(txn, r, loc.getValue());

        _details->incrementStats( txn, r->netLength(), 1 );

        return loc;
    }
void RelationSchema::loadTuple(const Record& record, harriet::Value& target, uint32_t position) const
{
   target = harriet::Value::createFromRecord(attributes[position].type, record.data()+attributes[position].offset);
}
Exemple #18
0
    /* we write to local.oplog.$main:
         { ts : ..., op: ..., ns: ..., o: ... }
       ts: an OpTime timestamp
       op:
        "i" insert
        "u" update
        "d" delete
        "c" db cmd
        "db" declares presence of a database (ns is set to the db name + '.')
        "n" no op
       logNS: where to log it.  0/null means "local.oplog.$main".
       bb:
         if not null, specifies a boolean to pass along to the other side as b: param.
         used for "justOne" or "upsert" flags on 'd', 'u'
       first: true
         when set, indicates this is the first thing we have logged for this database.
         thus, the slave does not need to copy down all the data when it sees this.

       note this is used for single collection logging even when --replSet is enabled.
    */
    static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, bool fromMigrate ) {
        Lock::DBWrite lk("local");
        static BufBuilder bufbuilder(8*1024); // todo there is likely a mutex on this constructor

        if ( strncmp(ns, "local.", 6) == 0 ) {
            if ( strncmp(ns, "local.slaves", 12) == 0 ) {
                resetSlaveCache();
            }
            return;
        }

        mutex::scoped_lock lk2(OpTime::m);

        const OpTime ts = OpTime::now(lk2);
        Client::Context context("",0,false);

        /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
           instead we do a single copy to the destination position in the memory mapped file.
        */

        bufbuilder.reset();
        BSONObjBuilder b(bufbuilder);
        b.appendTimestamp("ts", ts.asDate());
        b.append("op", opstr);
        b.append("ns", ns);
        if (fromMigrate) 
            b.appendBool("fromMigrate", true);
        if ( bb )
            b.appendBool("b", *bb);
        if ( o2 )
            b.append("o2", *o2);
        BSONObj partial = b.done(); // partial is everything except the o:... part.

        int po_sz = partial.objsize();
        int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;

        Record *r;
        if( logNS == 0 ) {
            logNS = "local.oplog.$main";
            if ( localOplogMainDetails == 0 ) {
                Client::Context ctx( logNS , dbpath, false);
                localDB = ctx.db();
                verify( localDB );
                localOplogMainDetails = nsdetails(logNS);
                verify( localOplogMainDetails );
            }
            Client::Context ctx( logNS , localDB, false );
            r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len);
        }
        else {
            Client::Context ctx( logNS, dbpath, false );
            verify( nsdetails( logNS ) );
            // first we allocate the space, then we fill it below.
            r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
        }

        append_O_Obj(r->data(), partial, obj);

        context.getClient()->setLastOp( ts );

        if ( logLevel >= 6 ) {
            BSONObj temp(r);
            log( 6 ) << "logging op:" << temp << endl;
        }
    }
Exemple #19
0
    static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj, BSONObj *o2, bool *bb, bool fromMigrate ) {
        Lock::DBWrite lk1("local");

        if ( strncmp(ns, "local.", 6) == 0 ) {
            if ( strncmp(ns, "local.slaves", 12) == 0 )
                resetSlaveCache();
            return;
        }

        mutex::scoped_lock lk2(OpTime::m);

        const OpTime ts = OpTime::now(lk2);
        long long hashNew;
        if( theReplSet ) {
            massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary());
            hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId();
        }
        else {
            // must be initiation
            verify( *ns == 0 );
            hashNew = 0;
        }

        /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
           instead we do a single copy to the destination position in the memory mapped file.
        */

        logopbufbuilder.reset();
        BSONObjBuilder b(logopbufbuilder);
        b.appendTimestamp("ts", ts.asDate());
        b.append("h", hashNew);
        b.append("op", opstr);
        b.append("ns", ns);
        if (fromMigrate) 
            b.appendBool("fromMigrate", true);
        if ( bb )
            b.appendBool("b", *bb);
        if ( o2 )
            b.append("o2", *o2);
        BSONObj partial = b.done();
        int posz = partial.objsize();
        int len = posz + obj.objsize() + 1 + 2 /*o:*/;

        Record *r;
        DEV verify( logNS == 0 );
        {
            const char *logns = rsoplog;
            if ( rsOplogDetails == 0 ) {
                Client::Context ctx( logns , dbpath, false);
                localDB = ctx.db();
                verify( localDB );
                rsOplogDetails = nsdetails(logns);
                massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
            }
            Client::Context ctx( logns , localDB, false );
            r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
            /* todo: now() has code to handle clock skew.  but if the skew server to server is large it will get unhappy.
                     this code (or code in now() maybe) should be improved.
                     */
            if( theReplSet ) {
                if( !(theReplSet->lastOpTimeWritten<ts) ) {
                    log() << "replSet ERROR possible failover clock skew issue? " << theReplSet->lastOpTimeWritten << ' ' << ts << rsLog;
                    log() << "replSet " << theReplSet->isPrimary() << rsLog;
                }
                theReplSet->lastOpTimeWritten = ts;
                theReplSet->lastH = hashNew;
                ctx.getClient()->setLastOp( ts );
            }
        }

        append_O_Obj(r->data(), partial, obj);

        if ( logLevel >= 6 ) {
            BSONObj temp(r);
            log( 6 ) << "logOp:" << temp << endl;
        }
    }
Exemple #20
0
    /** @return number of skipped (invalid) documents */
    unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
                const scoped_array<IndexSpec> &indexSpecs,
                scoped_array<SortPhaseOne>& phase1, int nidx, bool validate, 
                double pf, int pb)
    {
        log() << "compact begin extent #" << n << " for namespace " << ns << endl;
        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = diskloc.ext();
        e->assertOk();
        verify( e->validates() );
        unsigned skipped = 0;

        {
            // the next/prev pointers within the extent might not be in order so we first page the whole thing in 
            // sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            MongoDataFile* mdf = cc().database()->getFile( diskloc.a() );
            HANDLE fd = mdf->getFd();
            int offset = diskloc.getOfs();
            Extent* ext = diskloc.ext();
            size_t length = ext->length;
                
            touch_pages(fd, offset, length, ext);
            int ms = t.millis();
            if( ms > 1000 ) 
                log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl;
        }

        {
            log() << "compact copying records" << endl;
            long long datasize = 0;
            long long nrecords = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    L = recOld->nextInExtent(L);
                    BSONObj objOld = BSONObj::make(recOld);

                    if( !validate || objOld.valid() ) {
                        nrecords++;
                        unsigned sz = objOld.objsize();

                        oldObjSize += sz;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = sz + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;
                        {
                            lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                            lenWPadding += pb;
                            lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                            if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { 
                                lenWPadding = lenWHdr;
                            }
                        }
                        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                        uassert(14024, "compact error out of space during compaction", !loc.isNull());
                        Record *recNew = loc.rec();
                        datasize += recNew->netLength();
                        recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                        addRecordToRecListInExtent(recNew, loc);
                        memcpy(recNew->data(), objOld.objdata(), sz);

                        {
                            // extract keys for all indexes we will be rebuilding
                            for( int x = 0; x < nidx; x++ ) { 
                                phase1[x].addKeys(indexSpecs[x], objOld, loc);
                            }
                        }
                    }
                    else { 
                        if( ++skipped <= 10 )
                            log() << "compact skipping invalid object" << endl;
                    }

                    if( L.isNull() ) { 
                        // we just did the very last record from the old extent.  it's still pointed to 
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt(false);
                    }
                }
            } // if !L.isNull()

            verify( d->firstExtent == diskloc );
            verify( d->lastExtent != diskloc );
            DiskLoc newFirst = e->xnext;
            d->firstExtent.writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            freeExtents( diskloc, diskloc );
            // update datasize/record count for this namespace's extent
            {
                NamespaceDetails::Stats *s = getDur().writing(&d->stats);
                s->datasize += datasize;
                s->nrecords += nrecords;
            }

            getDur().commitIfNeeded();

            { 
                double op = 1.0;
                if( oldObjSize ) 
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                log() << "compact finished extent #" << n << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)"
                    << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
                    << endl;                    
            }
        }

        return skipped;
    }
Exemple #21
0
    DiskLoc DataFileMgr::insert(const char* ns,
                                const void* obuf,
                                int32_t len,
                                bool mayInterrupt,
                                bool god,
                                bool mayAddIndex,
                                bool* addedID) {

        Database* database = cc().database();

        bool wouldAddIndex = false;
        massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
        uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
        {
            const char *sys = strstr(ns, "system.");
            if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
                return DiskLoc();
        }
        bool addIndex = wouldAddIndex && mayAddIndex;

        Collection* collection = database->getCollection( ns );
        if ( collection == NULL ) {
            collection = database->createCollection( ns, false, NULL );

            int ies = Extent::initialSize(len);
            if( str::contains(ns, '$') &&
                len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 &&
                len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
                // probably an index.  so we pick a value here for the first extent instead of using
                // initialExtentSize() which is more for user collections.
                // TODO: we could look at the # of records in the parent collection to be smarter here.
                ies = (32+4) * 1024;
            }
            collection->increaseStorageSize( ies, false);
            if ( !god )
                ensureIdIndexForNewNs(ns);
        }

        NamespaceDetails* d = collection->details();

        string tabletoidxns;
        Collection* collectionToIndex = 0;
        NamespaceDetails* tableToIndex = 0;

        BSONObj fixedIndexObject;
        if ( addIndex ) {
            verify( obuf );
            BSONObj io((const char *) obuf);

            tabletoidxns = io.getStringField( "ns" );
            uassert(10096, "invalid ns to index", tabletoidxns.find( '.' ) != string::npos);
            massert(10097,
                    str::stream() << "trying to create index on wrong db "
                    << " db: " << database->name() << " collection: " << tabletoidxns,
                    database->ownsNS( tabletoidxns ) );

            collectionToIndex = database->getCollection( tabletoidxns );
            if ( !collectionToIndex ) {
                collectionToIndex = database->createCollection( tabletoidxns, false, NULL );
                verify( collectionToIndex );
                if ( !god )
                    ensureIdIndexForNewNs( tabletoidxns.c_str() );
            }

            tableToIndex = collectionToIndex->details();

            Status status = collectionToIndex->getIndexCatalog()->okToAddIndex( io );
            if ( status.code() == ErrorCodes::IndexAlreadyExists ) {
                // dup index, we ignore
                return DiskLoc();
            }

            uassert( 17199,
                     str::stream() << "cannot build index on " << tabletoidxns
                     << " because of " << status.toString(),
                     status.isOK() );

            if( !prepareToBuildIndex(io,
                                     mayInterrupt,
                                     god,
                                     tabletoidxns ) ) {
                // prepare creates _id itself, or this indicates to fail the build silently (such 
                // as if index already exists)
                return DiskLoc();
            }

            fixedIndexObject = IndexCatalog::fixIndexSpec( io );

            obuf = fixedIndexObject.objdata();
            len = fixedIndexObject.objsize();
        }

        IDToInsert idToInsert; // only initialized if needed

        if( !god ) {
            /* Check if we have an _id field. If we don't, we'll add it.
               Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
            */
            BSONObj io((const char *) obuf);
            BSONElement idField = io.getField( "_id" );
            uassert( 10099 ,  "_id cannot be an array", idField.type() != Array );
            // we don't add _id for capped collections in local as they don't have an _id index
            if( idField.eoo() &&
                !wouldAddIndex &&
                nsToDatabase( ns ) != "local" &&
                d->haveIdIndex() ) {

                if( addedID )
                    *addedID = true;

                idToInsert.init();
                len += idToInsert.size();
            }

            BSONElementManipulator::lookForTimestamps( io );
        }

        int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize );
        fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) );

        // If the collection is capped, check if the new object will violate a unique index
        // constraint before allocating space.
        if ( d->isCapped() && !god) {
            BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) );
            Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp );
            uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() );
        }

        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);

        if ( loc.isNull() ) {
            log() << "insert: couldn't alloc space for object ns:" << ns
                  << " capped:" << d->isCapped() << endl;
            verify(d->isCapped());
            return DiskLoc();
        }

        Record *r = loc.rec();
        {
            verify( r->lengthWithHeaders() >= lenWHdr );
            r = (Record*) getDur().writingPtr(r, lenWHdr);
            if( idToInsert.needed() ) {
                /* a little effort was made here to avoid a double copy when we add an ID */
                int originalSize = *((int*) obuf);
                ((int&)*r->data()) = originalSize + idToInsert.size();
                memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size());
                memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4);
            }
            else {
                if( obuf ) // obuf can be null from internal callers
                    memcpy(r->data(), obuf, len);
            }
        }

        addRecordToRecListInExtent(r, loc);

        d->incrementStats( r->netLength(), 1 );

        // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
        if ( !god )
            collection->infoCache()->notifyOfWriteOp();

        if ( tableToIndex ) {
            insert_makeIndex(collectionToIndex, loc, mayInterrupt);
        }

        /* add this record to our indexes */
        if ( d->getTotalIndexCount() > 0 ) {
            try {
                BSONObj obj(r->data());
                collection->getIndexCatalog()->indexRecord(obj, loc);
            }
            catch( AssertionException& e ) {
                // should be a dup key error on _id index
                if( tableToIndex || d->isCapped() ) {
                    massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() );
                    string s = e.toString();
                    s += " : on addIndex/capped - collection and its index will not match";
                    setLastError(0, s.c_str());
                    error() << s << endl;
                }
                else {
                    // normal case -- we can roll back
                    _deleteRecord(d, ns, r, loc);
                    throw;
                }
            }
        }

        d->paddingFits();

        return loc;
    }
Exemple #22
0
    /** @return number of skipped (invalid) documents */
    unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
                           int nidx, bool validate, double pf, int pb, bool useDefaultPadding,
                           bool preservePadding) {

        log() << "compact begin extent #" << n << " for namespace " << ns << endl;
        unsigned oldObjSize = 0; // we'll report what the old padding was
        unsigned oldObjSizeWithPadding = 0;

        Extent *e = diskloc.ext();
        e->assertOk();
        verify( e->validates(diskloc) );
        unsigned skipped = 0;

        Database* db = cc().database();

        {
            // the next/prev pointers within the extent might not be in order so we first
            // page the whole thing in sequentially
            log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
            Timer t;
            Extent* ext = db->getExtentManager().getExtent( diskloc );
            size_t length = ext->length;

            touch_pages( reinterpret_cast<const char*>(ext), length );
            int ms = t.millis();
            if( ms > 1000 )
                log() << "compact end paging in " << ms << "ms "
                      << e->length/1000000.0/ms << "MB/sec" << endl;
        }

        {
            log() << "compact copying records" << endl;
            long long datasize = 0;
            long long nrecords = 0;
            DiskLoc L = e->firstRecord;
            if( !L.isNull() ) {
                while( 1 ) {
                    Record *recOld = L.rec();
                    L = db->getExtentManager().getNextRecordInExtent(L);
                    BSONObj objOld = BSONObj::make(recOld);

                    if( !validate || objOld.valid() ) {
                        nrecords++;
                        unsigned sz = objOld.objsize();

                        oldObjSize += sz;
                        oldObjSizeWithPadding += recOld->netLength();

                        unsigned lenWHdr = sz + Record::HeaderSize;
                        unsigned lenWPadding = lenWHdr;
                        // if we are preserving the padding, the record should not change size
                        if (preservePadding) {
                            lenWPadding = recOld->lengthWithHeaders();
                        }
                        // maintain UsePowerOf2Sizes if no padding values were passed in
                        else if (d->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes)
                                && useDefaultPadding) {
                            lenWPadding = d->quantizePowerOf2AllocationSpace(lenWPadding);
                        }
                        // otherwise use the padding values (pf and pb) that were passed in
                        else {
                            lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                            lenWPadding += pb;
                            lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                        }
                        if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { 
                            lenWPadding = lenWHdr;
                        }
                        DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                        uassert(14024, "compact error out of space during compaction", !loc.isNull());
                        Record *recNew = loc.rec();
                        datasize += recNew->netLength();
                        recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                        addRecordToRecListInExtent(recNew, loc);
                        memcpy(recNew->data(), objOld.objdata(), sz);
                    }
                    else { 
                        if( ++skipped <= 10 )
                            log() << "compact skipping invalid object" << endl;
                    }

                    if( L.isNull() ) { 
                        // we just did the very last record from the old extent.  it's still pointed to 
                        // by the old extent ext, but that will be fixed below after this loop
                        break;
                    }

                    // remove the old records (orphan them) periodically so our commit block doesn't get too large
                    bool stopping = false;
                    RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                    if( stopping || getDur().aCommitIsNeeded() ) {
                        e->firstRecord.writing() = L;
                        Record *r = L.rec();
                        getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                        getDur().commitIfNeeded();
                        killCurrentOp.checkForInterrupt(false);
                    }
                }
            } // if !L.isNull()

            verify( d->firstExtent() == diskloc );
            verify( d->lastExtent() != diskloc );
            DiskLoc newFirst = e->xnext;
            d->firstExtent().writing() = newFirst;
            newFirst.ext()->xprev.writing().Null();
            getDur().writing(e)->markEmpty();
            cc().database()->getExtentManager().freeExtents( diskloc, diskloc );

            // update datasize/record count for this namespace's extent
            d->incrementStats( datasize, nrecords );

            getDur().commitIfNeeded();

            { 
                double op = 1.0;
                if( oldObjSize ) 
                    op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
                log() << "compact finished extent #" << n << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)"
                    << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100
                    << endl;                    
            }
        }

        return skipped;
    }
Exemple #23
0
    StatusWith<DiskLoc> Collection::updateDocument( OperationContext* txn,
                                                    const DiskLoc& oldLocation,
                                                    const BSONObj& objNew,
                                                    bool enforceQuota,
                                                    OpDebug* debug ) {

        Record* oldRecord = _recordStore->recordFor( oldLocation );
        BSONObj objOld( oldRecord->data() );

        if ( objOld.hasElement( "_id" ) ) {
            BSONElement oldId = objOld["_id"];
            BSONElement newId = objNew["_id"];
            if ( oldId != newId )
                return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                            "in Collection::updateDocument _id mismatch",
                                            13596 );
        }

        if ( ns().coll() == "system.users" ) {
            // XXX - andy and spencer think this should go away now
            V2UserDocumentParser parser;
            Status s = parser.checkValidUserDocument(objNew);
            if ( !s.isOK() )
                return StatusWith<DiskLoc>( s );
        }

        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        OwnedPointerMap<IndexDescriptor*,UpdateTicket> updateTickets;
        IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator( true );
        while ( ii.more() ) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique())
                || replset::ignoreUniqueIndex(descriptor);
            UpdateTicket* updateTicket = new UpdateTicket();
            updateTickets.mutableMap()[descriptor] = updateTicket;
            Status ret = iam->validateUpdate(objOld, objNew, oldLocation, options, updateTicket );
            if ( !ret.isOK() ) {
                return StatusWith<DiskLoc>( ret );
            }
        }

        // this can callback into Collection::recordStoreGoingToMove
        StatusWith<DiskLoc> newLocation = _recordStore->updateRecord( txn,
                                                                      oldLocation,
                                                                      objNew.objdata(),
                                                                      objNew.objsize(),
                                                                      enforceQuota ? largestFileNumberInQuota() : 0,
                                                                      this );

        if ( !newLocation.isOK() ) {
            return newLocation;
        }

        _infoCache.notifyOfWriteOp();

        if ( newLocation.getValue() != oldLocation ) {

            if ( debug ) {
                if (debug->nmoved == -1) // default of -1 rather than 0
                    debug->nmoved = 1;
                else
                    debug->nmoved += 1;
            }

            _indexCatalog.indexRecord(txn, objNew, newLocation.getValue());

            return newLocation;
        }

        if ( debug )
            debug->keyUpdates = 0;

        ii = _indexCatalog.getIndexIterator( true );
        while ( ii.more() ) {
            IndexDescriptor* descriptor = ii.next();
            IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

            int64_t updatedKeys;
            Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
            if ( !ret.isOK() )
                return StatusWith<DiskLoc>( ret );
            if ( debug )
                debug->keyUpdates += updatedKeys;
        }

        // Broadcast the mutation so that query results stay correct.
        _cursorCache.invalidateDocument(oldLocation, INVALIDATION_MUTATION);

        return newLocation;
    }
Exemple #24
0
 BSONObj Collection::docFor(const DiskLoc& loc) const {
     Record* rec = _recordStore->recordFor( loc );
     return BSONObj( rec->data() );
 }
Exemple #25
0
int test(const unsigned pageSize) {
	// Bookkeeping
	unordered_map<TID, unsigned> values; // TID -> testData entry
	unordered_map<unsigned, unsigned> usage; // pageID -> bytes used within this page

	// Setting everything
	shared_ptr<BufferManager> bm(new BufferManager(100,
			shared_ptr<FileManager>(new FileManager("test_data"))));
	SPSegment sp(3, bm, true);
	Random64 rnd;

	// Insert some records
	for (unsigned i=0; i<maxInserts; ++i) {
		// Select string/record to insert
		uint64_t r = rnd.next()%testData.size();
		const string s = testData[r];

		// Check that there is space available for 's'
		bool full = true;
		for (unsigned p=0; p<initialSize; ++p) {
			if (loadFactor*pageSize - usage[p] > s.size()) {
#ifndef SILENT
				cerr << "in page " << hex << (p+1) << dec << " expecting "
						<< (loadFactor*pageSize - usage[p]) << endl;
#endif
				full = false;
				break;
			}
		}
		if (full)
			break;

		// Insert record
		TID tid = sp.insert(Record(s.size(), s.c_str()));
		EXPECT_EQ(values.end(), values.find(tid)) << "TIDs should not be overwritten";
		values[tid]=r;
		// extract the pageId from the TID
		// PageIDs start at 1
		unsigned pageId = util::extractPageIDFromTID(tid) - 1;
		EXPECT_LT(pageId, initialSize) << "pageId should be within [0, initialSize)";
		usage[pageId]+=s.size();
	}

	// Lookup & delete some records
	for (unsigned i=0; i<maxDeletes; ++i) {
		// Select operation
		bool del = rnd.next()%10 == 0;

		// Select victim
		TID tid = values.begin()->first;
		unsigned pageId = util::extractPageIDFromTID(tid)-1;
		const std::string& value = testData[(values.begin()->second)%testData.size()];
		unsigned len = value.size();

		// Lookup
		Record rec = sp.lookup(tid);
		EXPECT_EQ(len, rec.len());
		EXPECT_EQ(0, memcmp(rec.data(), value.c_str(), len));

		if (del) { // do delete
			EXPECT_TRUE(sp.remove(tid));
			values.erase(tid);
			usage[pageId]-=len;
		}
	}

	// Insert some records, again
	for (unsigned i=0; i<maxInserts; ++i) {
		// Select string/record to insert
		uint64_t r = rnd.next()%testData.size();
		const string s = testData[r];

		// Check that there is space available for 's'
		bool full = true;
		for (unsigned p=0; p<initialSize; ++p) {
			if (loadFactor*pageSize - usage[p] > s.size()) {
				cerr << "in page " << hex << (p+1) << dec << " expecting "
						<< (loadFactor*pageSize - usage[p]) << endl;
				full = false;
				break;
			}
		}
		if (full)
			break;

		// Insert record
		TID tid = sp.insert(Record(s.size(), s.c_str()));
		EXPECT_EQ(values.end(), values.find(tid)) << "TIDs should not be overwritten";
		values[tid]=r;
		// extract the pageId from the TID
		// PageIDs start at 1
		unsigned pageId = util::extractPageIDFromTID(tid) - 1;
		EXPECT_LT(pageId, initialSize) << "pageId should be within [0, initialSize)";
		usage[pageId]+=s.size();
	}

	/*// Update some values ('usage' counter invalid from here on)
	for (unsigned i=0; i<maxUpdates; ++i) {
		// Select victim
		TID tid = values.begin()->first;

		// Select new string/record
		uint64_t r = rnd.next()%testData.size();
		const string s = testData[r];

		// Replace old with new value
		sp.update(tid, Record(s.size(), s.c_str()));
		values[tid]=r;
	}*/

	// Lookups
	for (auto p : values) {
		TID tid = p.first;
		const std::string& value = testData[p.second];
		unsigned len = value.size();
		Record rec = sp.lookup(tid);
		EXPECT_EQ(len, rec.len());
		EXPECT_EQ(0, memcmp(rec.data(), value.c_str(), len));
	}

	return 0;
}
    void RecordStoreV1Base::deleteRecord( OperationContext* txn, const DiskLoc& dl ) {

        Record* todelete = recordFor( dl );
        invariant( todelete->netLength() >= 4 ); // this is required for defensive code

        /* remove ourself from the record next/prev chain */
        {
            if ( todelete->prevOfs() != DiskLoc::NullOfs ) {
                DiskLoc prev = getPrevRecordInExtent( txn, dl );
                Record* prevRecord = recordFor( prev );
                txn->recoveryUnit()->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs();
            }

            if ( todelete->nextOfs() != DiskLoc::NullOfs ) {
                DiskLoc next = getNextRecord( txn, dl );
                Record* nextRecord = recordFor( next );
                txn->recoveryUnit()->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs();
            }
        }

        /* remove ourself from extent pointers */
        {
            DiskLoc extentLoc = todelete->myExtentLoc(dl);
            Extent *e =  _getExtent( txn, extentLoc );
            if ( e->firstRecord == dl ) {
                txn->recoveryUnit()->writing(&e->firstRecord);
                if ( todelete->nextOfs() == DiskLoc::NullOfs )
                    e->firstRecord.Null();
                else
                    e->firstRecord.set(dl.a(), todelete->nextOfs() );
            }
            if ( e->lastRecord == dl ) {
                txn->recoveryUnit()->writing(&e->lastRecord);
                if ( todelete->prevOfs() == DiskLoc::NullOfs )
                    e->lastRecord.Null();
                else
                    e->lastRecord.set(dl.a(), todelete->prevOfs() );
            }
        }

        /* add to the free list */
        {
            _details->incrementStats( txn, -1 * todelete->netLength(), -1 );

            if ( _isSystemIndexes ) {
                /* temp: if in system.indexes, don't reuse, and zero out: we want to be
                   careful until validated more, as IndexDetails has pointers
                   to this disk location.  so an incorrectly done remove would cause
                   a lot of problems.
                */
                memset( txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders() ),
                        0, todelete->lengthWithHeaders() );
            }
            else {
                // this is defensive so we can detect if we are still using a location
                // that was deleted
                memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4);
                addDeletedRec(txn, dl);
            }
        }

    }
TEST(SlottedPage, Randomized)
{
    const uint32_t kTestScale = 1;
    const uint32_t iterations = 10000;
    util::Random ranny;

    for(uint32_t j=0; j<kTestScale; j++) {
        std::unordered_map<RecordId, std::string> reference;
        SlottedPage* slottedPage = static_cast<SlottedPage*>(malloc(kPageSize));
        slottedPage->initialize();

        // Add some initial data
        for(uint32_t i=0; i<kPageSize/3/32; i++) {
            std::string data = util::randomWord(ranny, 8, 64);
            if(slottedPage->getBytesFreeForRecord() < data.size())
                continue;
            RecordId id = slottedPage->insert(Record(data));
            // std::cout << "insert " << id << " -> " << data << std::endl;
            ASSERT_TRUE(reference.count(id) == 0);
            reference.insert(make_pair(id, data));
        }

        // Work on it
        for(uint32_t i=0; i<iterations; i++) {
            int32_t operation = ranny.rand() % 100;

            // Do insert
            if(operation <= 40) {
                std::string data = util::randomWord(ranny, 8, 64);
                if(slottedPage->getBytesFreeForRecord() < data.size())
                    continue;
                RecordId id = slottedPage->insert(Record(data));
                // std::cout << "insert " << id << " -> " << data << std::endl;
                ASSERT_TRUE(reference.count(id) == 0);
                reference.insert(make_pair(id, data));
            }

            // Do remove
            else if(operation <= 80) {
                if(reference.empty())
                    continue;
                RecordId id = reference.begin()->first;
                // std::cout << "remove " << id << std::endl;
                Record record = slottedPage->lookup(id);
                ASSERT_EQ(slottedPage->isReference(id), kInvalidTupleId);
                ASSERT_EQ(std::string(record.data(), record.size()), reference.begin()->second);
                slottedPage->remove(id);
                reference.erase(reference.begin());
            }

            // Do update
            else if(operation <= 98) {
                if(reference.empty())
                    continue;
                RecordId id = reference.begin()->first;
                Record record = slottedPage->lookup(id);
                ASSERT_EQ(slottedPage->isReference(id), kInvalidTupleId);
                ASSERT_EQ(std::string(record.data(), record.size()), reference.begin()->second);
                std::string data = util::randomWord(ranny, 8, 64);
                if(slottedPage->canUpdateRecord(id, Record(data))) {
                    slottedPage->update(id, Record(data));
                    reference.erase(reference.begin());
                    reference.insert(make_pair(id, data));
                }
            }

            // Do consistency check
            else if(operation<=99 || i==iterations-1 || i==0) {
                ASSERT_TRUE(slottedPage->isValid());
                auto records = slottedPage->getAllRecords(kInvalidPageId); // page id does not matter
                ASSERT_EQ(records.size(), reference.size());
                for(auto& iter : records) {
                    ASSERT_TRUE(reference.count(iter.first.toRecordId()) > 0);
                    ASSERT_EQ(string(iter.second.data(), iter.second.size()), reference.find(iter.first.toRecordId())->second);
                }
                continue;
            }
        }
        free(slottedPage);
    }
}
Exemple #28
0
    StatusWith<DiskLoc> Collection::updateDocument( const DiskLoc& oldLocation,
                                                    const BSONObj& objNew,
                                                    bool enforceQuota,
                                                    OpDebug* debug ) {

        Record* oldRecord = getExtentManager()->recordFor( oldLocation );
        BSONObj objOld = BSONObj::make( oldRecord );

        if ( objOld.hasElement( "_id" ) ) {
            BSONElement oldId = objOld["_id"];
            BSONElement newId = objNew["_id"];
            if ( oldId != newId )
                return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                            "in Collection::updateDocument _id mismatch",
                                            13596 );
        }

        if ( ns().coll() == "system.users" ) {
            // XXX - andy and spencer think this should go away now
            V2UserDocumentParser parser;
            Status s = parser.checkValidUserDocument(objNew);
            if ( !s.isOK() )
                return StatusWith<DiskLoc>( s );
        }

        /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
           below.  that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
        */
        OwnedPointerVector<UpdateTicket> updateTickets;
        updateTickets.mutableVector().resize(_indexCatalog.numIndexesTotal());
        for (int i = 0; i < _indexCatalog.numIndexesTotal(); ++i) {
            IndexDescriptor* descriptor = _indexCatalog.getDescriptor( i );
            IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

            InsertDeleteOptions options;
            options.logIfError = false;
            options.dupsAllowed =
                !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique())
                || ignoreUniqueIndex(descriptor);
            updateTickets.mutableVector()[i] = new UpdateTicket();
            Status ret = iam->validateUpdate(objOld, objNew, oldLocation, options,
                                             updateTickets.mutableVector()[i]);
            if ( !ret.isOK() ) {
                return StatusWith<DiskLoc>( ret );
            }
        }

        if ( oldRecord->netLength() < objNew.objsize() ) {
            // doesn't fit, have to move to new location

            if ( _details->isCapped() )
                return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                            "failing update: objects in a capped ns cannot grow",
                                            10003 );

            moveCounter.increment();
            _details->paddingTooSmall();

            // unindex old record, don't delete
            // this way, if inserting new doc fails, we can re-index this one
            ClientCursor::aboutToDelete(_ns.ns(), _details, oldLocation);
            _indexCatalog.unindexRecord( objOld, oldLocation, true );

            if ( debug ) {
                if (debug->nmoved == -1) // default of -1 rather than 0
                    debug->nmoved = 1;
                else
                    debug->nmoved += 1;
            }

            StatusWith<DiskLoc> loc = insertDocument( objNew, enforceQuota );

            if ( loc.isOK() ) {
                // insert successful, now lets deallocate the old location
                // remember its already unindexed
                _recordStore.deallocRecord( oldLocation, oldRecord );
            }
            else {
                // new doc insert failed, so lets re-index the old document and location
                _indexCatalog.indexRecord( objOld, oldLocation );
            }

            return loc;
        }

        _infoCache.notifyOfWriteOp();
        _details->paddingFits();

        if ( debug )
            debug->keyUpdates = 0;

        for (int i = 0; i < _indexCatalog.numIndexesTotal(); ++i) {
            IndexDescriptor* descriptor = _indexCatalog.getDescriptor( i );
            IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

            int64_t updatedKeys;
            Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys);
            if ( !ret.isOK() )
                return StatusWith<DiskLoc>( ret );
            if ( debug )
                debug->keyUpdates += updatedKeys;
        }

        //  update in place
        int sz = objNew.objsize();
        memcpy(getDur().writingPtr(oldRecord->data(), sz), objNew.objdata(), sz);
        return StatusWith<DiskLoc>( oldLocation );
    }