StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( OperationContext* txn,
                                                     const DocWriter* doc,
                                                     bool enforceQuota ) {
    int docSize = doc->documentSize();
    if ( docSize < 4 ) {
        return StatusWith<DiskLoc>( ErrorCodes::InvalidLength,
                                    "record has to be >= 4 bytes" );
    }
    int lenWHdr = docSize + Record::HeaderSize;
    if ( doc->addPadding() )
        lenWHdr = getRecordAllocationSize( lenWHdr );

    StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota );
    if ( !loc.isOK() )
        return loc;

    Record *r = recordFor( loc.getValue() );
    fassert( 17319, r->lengthWithHeaders() >= lenWHdr );

    r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) );
    doc->writeDocument( r->data() );

    _addRecordToRecListInExtent(txn, r, loc.getValue());

    _details->incrementStats( txn, r->netLength(), 1 );

    _paddingFits( txn );

    return loc;
}
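// A minimal sketch of a DocWriter implementation compatible with insertRecord()
// above. Only documentSize(), writeDocument() and addPadding() are called there,
// so only those members are sketched; the class name BSONObjDocWriter is
// hypothetical and not taken from the surrounding code.
class BSONObjDocWriter : public DocWriter {
public:
    explicit BSONObjDocWriter( const BSONObj& obj ) : _obj( obj ) {}
    virtual size_t documentSize() const { return _obj.objsize(); }
    virtual void writeDocument( char* buf ) const {
        // the record store has already sized the destination via documentSize()
        memcpy( buf, _obj.objdata(), _obj.objsize() );
    }
    virtual bool addPadding() const { return true; } // request the usual allocation quantization
private:
    BSONObj _obj;
};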
// bypass standard alloc/insert routines to use the extent we want.
static DiskLoc insert( const DiskLoc& ext, int i ) {
    BSONObjBuilder b;
    b.append( "a", i );
    BSONObj o = b.done();
    int len = o.objsize();

    Extent *e = ext.ext();
    e = getDur().writing(e);
    int ofs;
    if ( e->lastRecord.isNull() )
        ofs = ext.getOfs() + ( e->_extentData - (char *)e );
    else
        ofs = e->lastRecord.getOfs() + e->lastRecord.rec()->lengthWithHeaders();

    DiskLoc dl( ext.a(), ofs );
    Record *r = dl.rec();
    r = (Record*) getDur().writingPtr(r, Record::HeaderSize + len);
    r->lengthWithHeaders() = Record::HeaderSize + len;
    r->extentOfs() = e->myLoc.getOfs();
    r->nextOfs() = DiskLoc::NullOfs;
    r->prevOfs() = e->lastRecord.isNull() ? DiskLoc::NullOfs : e->lastRecord.getOfs();
    memcpy( r->data(), o.objdata(), len );

    if ( e->firstRecord.isNull() )
        e->firstRecord = dl;
    else
        getDur().writingInt(e->lastRecord.rec()->nextOfs()) = ofs;
    e->lastRecord = dl;

    return dl;
}
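// Hedged companion sketch (not part of the original test fixture): walking the
// record chain that insert() above builds, using the same firstRecord/nextOfs
// fields. 'visit' is a hypothetical callback supplied by the caller.
static void walkExtent( const DiskLoc& ext, void (*visit)( const Record* ) ) {
    Extent *e = ext.ext();
    DiskLoc cur = e->firstRecord;
    while ( !cur.isNull() ) {
        Record *r = cur.rec();
        visit( r );
        // follow the intra-extent chain; NullOfs terminates it
        if ( r->nextOfs() == DiskLoc::NullOfs )
            break;
        cur = DiskLoc( ext.a(), r->nextOfs() );
    }
}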
RelationSchema::RelationSchema(const Record& record) {
    // De-serialize relation meta data
    istringstream in(string(record.data(), record.size()), ios::binary);
    util::readBinary(sid, in);
    util::readBinary(name, in);

    // De-serialize its attributes
    size_t len;
    util::readBinary(len, in);
    attributes.resize(len);
    for(auto& iter : attributes) {
        util::readBinary(iter.name, in);
        util::readBinary(iter.type, in);
        util::readBinary(iter.notNull, in);
        util::readBinary(iter.offset, in);
    }

    // De-serialize its indexes
    util::readBinary(len, in);
    indexes.resize(len);
    for(auto& iter : indexes) {
        util::readBinary(iter.sid, in);
        util::readBinary(iter.indexedColumns, in);
        util::readBinary(iter.type, in);
        util::readBinary(iter.unique, in);
    }
    assert(in.good());
}
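// Hedged sketch of the serialization side assumed by the constructor above:
// util::writeBinary is assumed to mirror util::readBinary, and fields must be
// written in exactly the order they are read back. The method name serialize()
// is hypothetical.
void RelationSchema::serialize(ostream& out) const {
    util::writeBinary(sid, out);
    util::writeBinary(name, out);

    util::writeBinary(attributes.size(), out);
    for(auto& attribute : attributes) {
        util::writeBinary(attribute.name, out);
        util::writeBinary(attribute.type, out);
        util::writeBinary(attribute.notNull, out);
        util::writeBinary(attribute.offset, out);
    }

    util::writeBinary(indexes.size(), out);
    for(auto& index : indexes) {
        util::writeBinary(index.sid, out);
        util::writeBinary(index.indexedColumns, out);
        util::writeBinary(index.type, out);
        util::writeBinary(index.unique, out);
    }
}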
/** write an op to the oplog that is already built.
    todo : make _logOpRS() call this so we don't repeat ourselves? */
void _logOpObjRS(const BSONObj& op) {
    Lock::DBWrite lk("local");

    const OpTime ts = op["ts"]._opTime();
    long long h = op["h"].numberLong();

    {
        const char *logns = rsoplog;
        if ( rsOplogDetails == 0 ) {
            Client::Context ctx( logns , dbpath, false);
            localDB = ctx.db();
            verify( localDB );
            rsOplogDetails = nsdetails(logns);
            massert(13389, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
        }
        Client::Context ctx( logns , localDB, false );
        {
            int len = op.objsize();
            Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
            memcpy(getDur().writingPtr(r->data(), len), op.objdata(), len);
        }
        /* todo: now() has code to handle clock skew. but if the skew server to server is large it will get unhappy.
           this code (or code in now() maybe) should be improved.
        */
        if( theReplSet ) {
            if( !(theReplSet->lastOpTimeWritten<ts) ) {
                log() << "replSet error possible failover clock skew issue? "
                      << theReplSet->lastOpTimeWritten.toString() << ' ' << ts.toString() << endl;
            }
            theReplSet->lastOpTimeWritten = ts;
            theReplSet->lastH = h;
            ctx.getClient()->setLastOp( ts );
        }
    }
}
StatusWith<DiskLoc> RecordStoreV1Base::_insertRecord( OperationContext* txn,
                                                      const char* data,
                                                      int len,
                                                      bool enforceQuota ) {
    int lenWHdr = getRecordAllocationSize( len + Record::HeaderSize );
    fassert( 17208, lenWHdr >= ( len + Record::HeaderSize ) );

    StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, enforceQuota );
    if ( !loc.isOK() )
        return loc;

    Record *r = recordFor( loc.getValue() );
    fassert( 17210, r->lengthWithHeaders() >= lenWHdr );

    // copy the data
    r = reinterpret_cast<Record*>( txn->recoveryUnit()->writingPtr(r, lenWHdr) );
    memcpy( r->data(), data, len );

    _addRecordToRecListInExtent(txn, r, loc.getValue());

    _details->incrementStats( txn, r->netLength(), 1 );

    return loc;
}
void RecordStoreV1Base::deleteRecord( TransactionExperiment* txn, const DiskLoc& dl ) {
    Record* todelete = recordFor( dl );

    /* remove ourself from the record next/prev chain */
    {
        if ( todelete->prevOfs() != DiskLoc::NullOfs ) {
            DiskLoc prev = getPrevRecordInExtent( dl );
            Record* prevRecord = recordFor( prev );
            txn->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs();
        }

        if ( todelete->nextOfs() != DiskLoc::NullOfs ) {
            DiskLoc next = getNextRecord( dl );
            Record* nextRecord = recordFor( next );
            txn->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs();
        }
    }

    /* remove ourself from extent pointers */
    {
        Extent *e = txn->writing( _getExtent( _getExtentLocForRecord( dl ) ) );
        if ( e->firstRecord == dl ) {
            if ( todelete->nextOfs() == DiskLoc::NullOfs )
                e->firstRecord.Null();
            else
                e->firstRecord.set(dl.a(), todelete->nextOfs() );
        }
        if ( e->lastRecord == dl ) {
            if ( todelete->prevOfs() == DiskLoc::NullOfs )
                e->lastRecord.Null();
            else
                e->lastRecord.set(dl.a(), todelete->prevOfs() );
        }
    }

    /* add to the free list */
    {
        _details->incrementStats( txn, -1 * todelete->netLength(), -1 );

        if ( _isSystemIndexes ) {
            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
               careful until validated more, as IndexDetails has pointers to this
               disk location. so an incorrectly done remove would cause a lot of problems.
            */
            memset( txn->writingPtr(todelete, todelete->lengthWithHeaders() ),
                    0, todelete->lengthWithHeaders() );
        }
        else {
            DEV {
                unsigned long long *p = reinterpret_cast<unsigned long long *>( todelete->data() );
                *txn->writing(p) = 0;
            }
            addDeletedRec(txn, dl);
        }
    }
}
vector<harriet::Value> RelationSchema::recordToTuple(const Record& record) const {
    vector<harriet::Value> result;
    result.reserve(attributes.size());
    for(auto& attribute : attributes)
        result.emplace_back(harriet::Value::createFromRecord(attribute.type,
                                                             record.data() + attribute.offset));
    return result;
}
void profile( const Client& c , CurOp& currentOp ) {
    verify( Lock::somethingWriteLocked() );

    Database *db = c.database();
    DEV verify( db );
    const char *ns = db->profileName.c_str();

    // build object
    profileBufBuilder.reset();
    BSONObjBuilder b(profileBufBuilder);

    const bool isQueryObjTooBig = !currentOp.debug().append(currentOp, b,
                                                            MAX_PROFILE_DOC_SIZE_BYTES);

    b.appendDate("ts", jsTime());
    b.append("client", c.clientAddress());

    if (c.getAuthenticationInfo()) {
        b.append("user", c.getAuthenticationInfo()->getUser(nsToDatabase(ns)));
    }

    BSONObj p = b.done();

    if (static_cast<size_t>(p.objsize()) > MAX_PROFILE_DOC_SIZE_BYTES || isQueryObjTooBig) {
        string small = p.toString(/*isArray*/false, /*full*/false);

        warning() << "can't add full line to system.profile: " << small << endl;

        // rebuild with limited info
        BSONObjBuilder b(profileBufBuilder);
        b.appendDate("ts", jsTime());
        b.append("client", c.clientAddress() );
        if ( c.getAuthenticationInfo() )
            b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );

        b.append("err", "profile line too large (max is 100KB)");

        // should be much smaller but if not don't break anything
        if (small.size() < MAX_PROFILE_DOC_SIZE_BYTES){
            b.append("abbreviated", small);
        }

        p = b.done();
    }

    // write: not replicated
    // get or create the profiling collection
    NamespaceDetails *details = getOrCreateProfileCollection(db);
    if (details) {
        int len = p.objsize();
        Record *r = theDataFileMgr.fast_oplog_insert(details, ns, len);
        memcpy(getDur().writingPtr(r->data(), len), p.objdata(), len);
    }
}
StatusWith<DiskLoc> Collection::insertDocument( const BSONObj& docToInsert, bool enforceQuota ) {
    int lenWHdr = _details->getRecordAllocationSize( docToInsert.objsize() + Record::HeaderSize );
    fassert( 17208, lenWHdr >= ( docToInsert.objsize() + Record::HeaderSize ) );

    if ( _details->isCapped() ) {
        // TODO: old god not done
        Status ret = _indexCatalog.checkNoIndexConflicts( docToInsert );
        uassert(17209, "duplicate key insert for unique index of capped collection", ret.isOK() );
    }

    // TODO: for now, capped logic lives inside NamespaceDetails, which is hidden
    //       under the RecordStore, this feels broken since that should be a
    //       collection access method probably

    StatusWith<DiskLoc> loc = _recordStore.allocRecord( lenWHdr,
                                                        enforceQuota ? largestFileNumberInQuota() : 0 );
    if ( !loc.isOK() )
        return loc;

    Record *r = loc.getValue().rec();
    fassert( 17210, r->lengthWithHeaders() >= lenWHdr );

    // copy the data
    r = reinterpret_cast<Record*>( getDur().writingPtr(r, lenWHdr) );
    memcpy( r->data(), docToInsert.objdata(), docToInsert.objsize() );

    addRecordToRecListInExtent(r, loc.getValue()); // XXX move down into record store

    _details->incrementStats( r->netLength(), 1 );

    // TODO: old god not done
    _infoCache.notifyOfWriteOp();

    try {
        _indexCatalog.indexRecord( docToInsert, loc.getValue() );
    }
    catch( AssertionException& e ) {
        if ( _details->isCapped() ) {
            return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                        str::stream() << "unexpected index insertion failure on"
                                                      << " capped collection" << e.toString()
                                                      << " - collection and its index will not match" );
        }

        // normal case -- we can roll back
        deleteDocument( loc.getValue(), false, true, NULL );
        throw;
    }

    // TODO: this is what the old code did, but is it correct?
    _details->paddingFits();

    return loc;
}
StatusWith<DiskLoc> RecordStoreV1Base::updateRecord( OperationContext* txn,
                                                     const DiskLoc& oldLocation,
                                                     const char* data,
                                                     int dataSize,
                                                     bool enforceQuota,
                                                     UpdateMoveNotifier* notifier ) {
    Record* oldRecord = recordFor( oldLocation );
    if ( oldRecord->netLength() >= dataSize ) {
        // we fit
        _paddingFits( txn );
        memcpy( txn->recoveryUnit()->writingPtr( oldRecord->data(), dataSize ), data, dataSize );
        return StatusWith<DiskLoc>( oldLocation );
    }

    if ( isCapped() )
        return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                    "failing update: objects in a capped ns cannot grow",
                                    10003 );

    // we have to move
    _paddingTooSmall( txn );

    StatusWith<DiskLoc> newLocation = _insertRecord( txn, data, dataSize, enforceQuota );
    if ( !newLocation.isOK() )
        return newLocation;

    // insert worked, so we delete old record
    if ( notifier ) {
        Status moveStatus = notifier->recordStoreGoingToMove( txn,
                                                              oldLocation,
                                                              oldRecord->data(),
                                                              oldRecord->netLength() );
        if ( !moveStatus.isOK() )
            return StatusWith<DiskLoc>( moveStatus );
    }

    deleteRecord( txn, oldLocation );

    return newLocation;
}
static void _profile(const Client& c, CurOp& currentOp, BufBuilder& profileBufBuilder) {
    Database *db = c.database();
    DEV verify( db );
    const char *ns = db->profileName.c_str();

    // build object
    BSONObjBuilder b(profileBufBuilder);
    b.appendDate("ts", jsTime());
    currentOp.debug().append( currentOp , b );

    b.append("client", c.clientAddress() );

    if ( c.getAuthenticationInfo() )
        b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );

    BSONObj p = b.done();

    if (p.objsize() > 100*1024){
        string small = p.toString(/*isArray*/false, /*full*/false);

        warning() << "can't add full line to system.profile: " << small;

        // rebuild with limited info
        BSONObjBuilder b(profileBufBuilder);
        b.appendDate("ts", jsTime());
        b.append("client", c.clientAddress() );
        if ( c.getAuthenticationInfo() )
            b.append( "user" , c.getAuthenticationInfo()->getUser( nsToDatabase( ns ) ) );

        b.append("err", "profile line too large (max is 100KB)");
        if (small.size() < 100*1024){ // should be much smaller but if not don't break anything
            b.append("abbreviated", small);
        }

        p = b.done();
    }

    // write: not replicated
    // get or create the profiling collection
    NamespaceDetails *details = getOrCreateProfileCollection(db);
    if (details) {
        int len = p.objsize();
        Record *r = theDataFileMgr.fast_oplog_insert(details, ns, len);
        memcpy(getDur().writingPtr(r->data(), len), p.objdata(), len);
    }
}
IndexCatalog::IndexBuildBlock* halfAddIndex(const std::string& key) {
    string name = key + "_1";
    BSONObj indexInfo = BSON( "v" << 1 <<
                              "key" << BSON( key << 1 ) <<
                              "ns" << _ns <<
                              "name" << name );
    int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
    const char* systemIndexes = "unittests.system.indexes";
    DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                  nsdetails( systemIndexes ),
                                                  lenWHdr,
                                                  false );
    Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                         lenWHdr ) );
    memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
    addRecordToRecListInExtent( infoRecord, infoLoc );

    return new IndexCatalog::IndexBuildBlock( _ctx.ctx().db()->getCollection( _ns )->getIndexCatalog(),
                                              name,
                                              infoLoc );
}
Status RecordStoreV1Base::updateWithDamages( OperationContext* txn,
                                             const DiskLoc& loc,
                                             const char* damageSource,
                                             const mutablebson::DamageVector& damages ) {
    _paddingFits( txn );

    Record* rec = recordFor( loc );
    char* root = rec->data();

    // All updates were in place. Apply them via durability and writing pointer.
    mutablebson::DamageVector::const_iterator where = damages.begin();
    const mutablebson::DamageVector::const_iterator end = damages.end();
    for( ; where != end; ++where ) {
        const char* sourcePtr = damageSource + where->sourceOffset;
        void* targetPtr = txn->recoveryUnit()->writingPtr(root + where->targetOffset, where->size);
        std::memcpy(targetPtr, sourcePtr, where->size);
    }

    return Status::OK();
}
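// Hypothetical caller sketch for updateWithDamages() above. The sourceOffset,
// targetOffset and size fields are the ones dereferenced in the loop above;
// the concrete offsets here are made up for illustration.
mutablebson::DamageVector damages;
mutablebson::DamageEvent event;
event.sourceOffset = 0;   // read from the start of the damage source buffer
event.targetOffset = 16;  // byte position inside the record to patch (example value)
event.size = 4;           // number of bytes to overwrite in place
damages.push_back(event);
// recordStore->updateWithDamages( txn, loc, sourceBuffer, damages );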
/** @return IndexDetails for a new index on a:1, with the info field populated. */
IndexDetails& addIndexWithInfo() {
    BSONObj indexInfo = BSON( "v" << 1 <<
                              "key" << BSON( "a" << 1 ) <<
                              "ns" << _ns <<
                              "name" << "a_1" );
    int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
    const char* systemIndexes = "unittests.system.indexes";
    DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                  nsdetails( systemIndexes ),
                                                  lenWHdr,
                                                  false );
    Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                         lenWHdr ) );
    memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
    addRecordToRecListInExtent( infoRecord, infoLoc );

    IndexDetails& id = nsdetails( _ns )->getNextIndexDetails( _ns );
    nsdetails( _ns )->addIndex();
    id.info.writing() = infoLoc;
    return id;
}
/** @return IndexDescriptor for a new index on a:1, with the info field populated. */
IndexDescriptor* addIndexWithInfo() {
    BSONObj indexInfo = BSON( "v" << 1 <<
                              "key" << BSON( "a" << 1 ) <<
                              "ns" << _ns <<
                              "name" << "a_1" );
    int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
    const char* systemIndexes = "unittests.system.indexes";
    DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                  nsdetails( systemIndexes ),
                                                  lenWHdr,
                                                  false );
    Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                         lenWHdr ) );
    memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
    addRecordToRecListInExtent( infoRecord, infoLoc );

    IndexCatalog::IndexBuildBlock blk( collection()->getIndexCatalog(), "a_1", infoLoc );
    blk.success();
    return collection()->getIndexCatalog()->findIndexByName( "a_1" );
}
StatusWith<DiskLoc> RecordStoreV1Base::insertRecord( TransactionExperiment* txn,
                                                     const DocWriter* doc,
                                                     int quotaMax ) {
    int lenWHdr = doc->documentSize() + Record::HeaderSize;
    if ( doc->addPadding() )
        lenWHdr = getRecordAllocationSize( lenWHdr );

    StatusWith<DiskLoc> loc = allocRecord( txn, lenWHdr, quotaMax );
    if ( !loc.isOK() )
        return loc;

    Record *r = recordFor( loc.getValue() );
    fassert( 17319, r->lengthWithHeaders() >= lenWHdr );

    r = reinterpret_cast<Record*>( txn->writingPtr(r, lenWHdr) );
    doc->writeDocument( r->data() );

    _addRecordToRecListInExtent(txn, r, loc.getValue());

    _details->incrementStats( txn, r->netLength(), 1 );

    return loc;
}
void RelationSchema::loadTuple(const Record& record, harriet::Value& target, uint32_t position) const {
    target = harriet::Value::createFromRecord(attributes[position].type,
                                              record.data() + attributes[position].offset);
}
/* we write to local.oplog.$main:
     { ts : ..., op: ..., ns: ..., o: ... }
   ts: an OpTime timestamp
   op:
    "i" insert
    "u" update
    "d" delete
    "c" db cmd
    "db" declares presence of a database (ns is set to the db name + '.')
    "n" no op
   logNS: where to log it. 0/null means "local.oplog.$main".
   bb:
     if not null, specifies a boolean to pass along to the other side as b: param.
     used for "justOne" or "upsert" flags on 'd', 'u'
   first: true
     when set, indicates this is the first thing we have logged for this database.
     thus, the slave does not need to copy down all the data when it sees this.

   note this is used for single collection logging even when --replSet is enabled.
*/
static void _logOpOld(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj,
                      BSONObj *o2, bool *bb, bool fromMigrate ) {
    Lock::DBWrite lk("local");
    static BufBuilder bufbuilder(8*1024); // todo there is likely a mutex on this constructor

    if ( strncmp(ns, "local.", 6) == 0 ) {
        if ( strncmp(ns, "local.slaves", 12) == 0 ) {
            resetSlaveCache();
        }
        return;
    }

    mutex::scoped_lock lk2(OpTime::m);

    const OpTime ts = OpTime::now(lk2);
    Client::Context context("",0,false);

    /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
       instead we do a single copy to the destination position in the memory mapped file.
    */

    bufbuilder.reset();
    BSONObjBuilder b(bufbuilder);
    b.appendTimestamp("ts", ts.asDate());
    b.append("op", opstr);
    b.append("ns", ns);
    if (fromMigrate)
        b.appendBool("fromMigrate", true);
    if ( bb )
        b.appendBool("b", *bb);
    if ( o2 )
        b.append("o2", *o2);
    BSONObj partial = b.done(); // partial is everything except the o:... part.

    int po_sz = partial.objsize();
    int len = po_sz + obj.objsize() + 1 + 2 /*o:*/;

    Record *r;
    if( logNS == 0 ) {
        logNS = "local.oplog.$main";
        if ( localOplogMainDetails == 0 ) {
            Client::Context ctx( logNS , dbpath, false);
            localDB = ctx.db();
            verify( localDB );
            localOplogMainDetails = nsdetails(logNS);
            verify( localOplogMainDetails );
        }
        Client::Context ctx( logNS , localDB, false );
        r = theDataFileMgr.fast_oplog_insert(localOplogMainDetails, logNS, len);
    }
    else {
        Client::Context ctx( logNS, dbpath, false );
        verify( nsdetails( logNS ) );
        // first we allocate the space, then we fill it below.
        r = theDataFileMgr.fast_oplog_insert( nsdetails( logNS ), logNS, len);
    }

    append_O_Obj(r->data(), partial, obj);

    context.getClient()->setLastOp( ts );

    if ( logLevel >= 6 ) {
        BSONObj temp(r);
        log( 6 ) << "logging op:" << temp << endl;
    }
}
static void _logOpRS(const char *opstr, const char *ns, const char *logNS, const BSONObj& obj,
                     BSONObj *o2, bool *bb, bool fromMigrate ) {
    Lock::DBWrite lk1("local");

    if ( strncmp(ns, "local.", 6) == 0 ) {
        if ( strncmp(ns, "local.slaves", 12) == 0 )
            resetSlaveCache();
        return;
    }

    mutex::scoped_lock lk2(OpTime::m);

    const OpTime ts = OpTime::now(lk2);
    long long hashNew;
    if( theReplSet ) {
        massert(13312, "replSet error : logOp() but not primary?", theReplSet->box.getState().primary());
        hashNew = (theReplSet->lastH * 131 + ts.asLL()) * 17 + theReplSet->selfId();
    }
    else {
        // must be initiation
        verify( *ns == 0 );
        hashNew = 0;
    }

    /* we jump through a bunch of hoops here to avoid copying the obj buffer twice --
       instead we do a single copy to the destination position in the memory mapped file.
    */

    logopbufbuilder.reset();
    BSONObjBuilder b(logopbufbuilder);
    b.appendTimestamp("ts", ts.asDate());
    b.append("h", hashNew);
    b.append("op", opstr);
    b.append("ns", ns);
    if (fromMigrate)
        b.appendBool("fromMigrate", true);
    if ( bb )
        b.appendBool("b", *bb);
    if ( o2 )
        b.append("o2", *o2);
    BSONObj partial = b.done();
    int posz = partial.objsize();
    int len = posz + obj.objsize() + 1 + 2 /*o:*/;

    Record *r;
    DEV verify( logNS == 0 );
    {
        const char *logns = rsoplog;
        if ( rsOplogDetails == 0 ) {
            Client::Context ctx( logns , dbpath, false);
            localDB = ctx.db();
            verify( localDB );
            rsOplogDetails = nsdetails(logns);
            massert(13347, "local.oplog.rs missing. did you drop it? if so restart server", rsOplogDetails);
        }
        Client::Context ctx( logns , localDB, false );
        r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
        /* todo: now() has code to handle clock skew. but if the skew server to server is large it will get unhappy.
           this code (or code in now() maybe) should be improved.
        */
        if( theReplSet ) {
            if( !(theReplSet->lastOpTimeWritten<ts) ) {
                log() << "replSet ERROR possible failover clock skew issue? "
                      << theReplSet->lastOpTimeWritten << ' ' << ts << rsLog;
                log() << "replSet " << theReplSet->isPrimary() << rsLog;
            }
            theReplSet->lastOpTimeWritten = ts;
            theReplSet->lastH = hashNew;
            ctx.getClient()->setLastOp( ts );
        }
    }

    append_O_Obj(r->data(), partial, obj);

    if ( logLevel >= 6 ) {
        BSONObj temp(r);
        log( 6 ) << "logOp:" << temp << endl;
    }
}
/** @return number of skipped (invalid) documents */
unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
                       const scoped_array<IndexSpec> &indexSpecs,
                       scoped_array<SortPhaseOne>& phase1, int nidx, bool validate,
                       double pf, int pb) {
    log() << "compact begin extent #" << n << " for namespace " << ns << endl;
    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent *e = diskloc.ext();
    e->assertOk();
    verify( e->validates() );
    unsigned skipped = 0;

    {
        // the next/prev pointers within the extent might not be in order so we first page
        // the whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        MongoDataFile* mdf = cc().database()->getFile( diskloc.a() );
        HANDLE fd = mdf->getFd();
        int offset = diskloc.getOfs();
        Extent* ext = diskloc.ext();
        size_t length = ext->length;

        touch_pages(fd, offset, length, ext);
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/ms << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0;
        long long nrecords = 0;
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                L = recOld->nextInExtent(L);
                BSONObj objOld = BSONObj::make(recOld);

                if( !validate || objOld.valid() ) {
                    nrecords++;
                    unsigned sz = objOld.objsize();

                    oldObjSize += sz;
                    oldObjSizeWithPadding += recOld->netLength();

                    unsigned lenWHdr = sz + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;
                    {
                        lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                        lenWPadding += pb;
                        lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                        if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                            lenWPadding = lenWHdr;
                        }
                    }
                    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                    uassert(14024, "compact error out of space during compaction", !loc.isNull());
                    Record *recNew = loc.rec();
                    datasize += recNew->netLength();
                    recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                    addRecordToRecListInExtent(recNew, loc);
                    memcpy(recNew->data(), objOld.objdata(), sz);

                    {
                        // extract keys for all indexes we will be rebuilding
                        for( int x = 0; x < nidx; x++ ) {
                            phase1[x].addKeys(indexSpecs[x], objOld, loc);
                        }
                    }
                }
                else {
                    if( ++skipped <= 10 )
                        log() << "compact skipping invalid object" << endl;
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent. it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt(false);
                }
            }
        } // if !L.isNull()

        verify( d->firstExtent == diskloc );
        verify( d->lastExtent != diskloc );
        DiskLoc newFirst = e->xnext;
        d->firstExtent.writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        freeExtents( diskloc, diskloc );

        // update datasize/record count for this namespace's extent
        {
            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
            s->datasize += datasize;
            s->nrecords += nrecords;
        }

        getDur().commitIfNeeded();

        {
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact finished extent #" << n << " containing " << nrecords
                  << " documents (" << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' '
                  << static_cast<unsigned>(op*100.0)/100 << endl;
        }
    }

    return skipped;
}
DiskLoc DataFileMgr::insert(const char* ns,
                            const void* obuf,
                            int32_t len,
                            bool mayInterrupt,
                            bool god,
                            bool mayAddIndex,
                            bool* addedID) {
    Database* database = cc().database();

    bool wouldAddIndex = false;
    massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) );
    uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) );
    {
        const char *sys = strstr(ns, "system.");
        if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
            return DiskLoc();
    }
    bool addIndex = wouldAddIndex && mayAddIndex;

    Collection* collection = database->getCollection( ns );
    if ( collection == NULL ) {
        collection = database->createCollection( ns, false, NULL );

        int ies = Extent::initialSize(len);
        if( str::contains(ns, '$') &&
            len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 &&
            len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) {
            // probably an index. so we pick a value here for the first extent instead of using
            // initialExtentSize() which is more for user collections.
            // TODO: we could look at the # of records in the parent collection to be smarter here.
            ies = (32+4) * 1024;
        }
        collection->increaseStorageSize( ies, false);
        if ( !god )
            ensureIdIndexForNewNs(ns);
    }

    NamespaceDetails* d = collection->details();

    string tabletoidxns;
    Collection* collectionToIndex = 0;
    NamespaceDetails* tableToIndex = 0;

    BSONObj fixedIndexObject;
    if ( addIndex ) {
        verify( obuf );
        BSONObj io((const char *) obuf);

        tabletoidxns = io.getStringField( "ns" );
        uassert(10096, "invalid ns to index", tabletoidxns.find( '.' ) != string::npos);
        massert(10097,
                str::stream() << "trying to create index on wrong db "
                              << " db: " << database->name() << " collection: " << tabletoidxns,
                database->ownsNS( tabletoidxns ) );

        collectionToIndex = database->getCollection( tabletoidxns );
        if ( !collectionToIndex ) {
            collectionToIndex = database->createCollection( tabletoidxns, false, NULL );
            verify( collectionToIndex );
            if ( !god )
                ensureIdIndexForNewNs( tabletoidxns.c_str() );
        }

        tableToIndex = collectionToIndex->details();

        Status status = collectionToIndex->getIndexCatalog()->okToAddIndex( io );
        if ( status.code() == ErrorCodes::IndexAlreadyExists ) {
            // dup index, we ignore
            return DiskLoc();
        }

        uassert( 17199,
                 str::stream() << "cannot build index on " << tabletoidxns
                               << " because of " << status.toString(),
                 status.isOK() );

        if( !prepareToBuildIndex(io, mayInterrupt, god, tabletoidxns ) ) {
            // prepare creates _id itself, or this indicates to fail the build silently (such
            // as if index already exists)
            return DiskLoc();
        }

        fixedIndexObject = IndexCatalog::fixIndexSpec( io );

        obuf = fixedIndexObject.objdata();
        len = fixedIndexObject.objsize();
    }

    IDToInsert idToInsert; // only initialized if needed

    if( !god ) {
        /* Check if we have an _id field. If we don't, we'll add it.
           Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
        */
        BSONObj io((const char *) obuf);
        BSONElement idField = io.getField( "_id" );
        uassert( 10099 ,  "_id cannot be an array", idField.type() != Array );
        // we don't add _id for capped collections in local as they don't have an _id index
        if( idField.eoo() &&
            !wouldAddIndex &&
            nsToDatabase( ns ) != "local" &&
            d->haveIdIndex() ) {

            if( addedID )
                *addedID = true;

            idToInsert.init();
            len += idToInsert.size();
        }

        BSONElementManipulator::lookForTimestamps( io );
    }

    int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize );
    fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) );

    // If the collection is capped, check if the new object will violate a unique index
    // constraint before allocating space.
    if ( d->isCapped() && !god) {
        BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) );
        Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp );
        uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() );
    }

    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);

    if ( loc.isNull() ) {
        log() << "insert: couldn't alloc space for object ns:" << ns
              << " capped:" << d->isCapped() << endl;
        verify(d->isCapped());
        return DiskLoc();
    }

    Record *r = loc.rec();
    {
        verify( r->lengthWithHeaders() >= lenWHdr );
        r = (Record*) getDur().writingPtr(r, lenWHdr);
        if( idToInsert.needed() ) {
            /* a little effort was made here to avoid a double copy when we add an ID */
            int originalSize = *((int*) obuf);
            ((int&)*r->data()) = originalSize + idToInsert.size();
            memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size());
            memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4);
        }
        else {
            if( obuf ) // obuf can be null from internal callers
                memcpy(r->data(), obuf, len);
        }
    }

    addRecordToRecListInExtent(r, loc);

    d->incrementStats( r->netLength(), 1 );

    // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
    if ( !god )
        collection->infoCache()->notifyOfWriteOp();

    if ( tableToIndex ) {
        insert_makeIndex(collectionToIndex, loc, mayInterrupt);
    }

    /* add this record to our indexes */
    if ( d->getTotalIndexCount() > 0 ) {
        try {
            BSONObj obj(r->data());
            collection->getIndexCatalog()->indexRecord(obj, loc);
        }
        catch( AssertionException& e ) {
            // should be a dup key error on _id index
            if( tableToIndex || d->isCapped() ) {
                massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() );
                string s = e.toString();
                s += " : on addIndex/capped - collection and its index will not match";
                setLastError(0, s.c_str());
                error() << s << endl;
            }
            else {
                // normal case -- we can roll back
                _deleteRecord(d, ns, r, loc);
                throw;
            }
        }
    }

    d->paddingFits();

    return loc;
}
/** @return number of skipped (invalid) documents */
unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
                       int nidx, bool validate, double pf, int pb, bool useDefaultPadding,
                       bool preservePadding) {
    log() << "compact begin extent #" << n << " for namespace " << ns << endl;
    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent *e = diskloc.ext();
    e->assertOk();
    verify( e->validates(diskloc) );
    unsigned skipped = 0;

    Database* db = cc().database();

    {
        // the next/prev pointers within the extent might not be in order so we first
        // page the whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        Extent* ext = db->getExtentManager().getExtent( diskloc );
        size_t length = ext->length;

        touch_pages( reinterpret_cast<const char*>(ext), length );
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/ms << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0;
        long long nrecords = 0;
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                L = db->getExtentManager().getNextRecordInExtent(L);
                BSONObj objOld = BSONObj::make(recOld);

                if( !validate || objOld.valid() ) {
                    nrecords++;
                    unsigned sz = objOld.objsize();

                    oldObjSize += sz;
                    oldObjSizeWithPadding += recOld->netLength();

                    unsigned lenWHdr = sz + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;

                    // if we are preserving the padding, the record should not change size
                    if (preservePadding) {
                        lenWPadding = recOld->lengthWithHeaders();
                    }
                    // maintain UsePowerOf2Sizes if no padding values were passed in
                    else if (d->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes)
                             && useDefaultPadding) {
                        lenWPadding = d->quantizePowerOf2AllocationSpace(lenWPadding);
                    }
                    // otherwise use the padding values (pf and pb) that were passed in
                    else {
                        lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                        lenWPadding += pb;
                        lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                    }
                    if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                        lenWPadding = lenWHdr;
                    }
                    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                    uassert(14024, "compact error out of space during compaction", !loc.isNull());
                    Record *recNew = loc.rec();
                    datasize += recNew->netLength();
                    recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                    addRecordToRecListInExtent(recNew, loc);
                    memcpy(recNew->data(), objOld.objdata(), sz);
                }
                else {
                    if( ++skipped <= 10 )
                        log() << "compact skipping invalid object" << endl;
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent. it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt(false);
                }
            }
        } // if !L.isNull()

        verify( d->firstExtent() == diskloc );
        verify( d->lastExtent() != diskloc );
        DiskLoc newFirst = e->xnext;
        d->firstExtent().writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        cc().database()->getExtentManager().freeExtents( diskloc, diskloc );

        // update datasize/record count for this namespace's extent
        d->incrementStats( datasize, nrecords );

        getDur().commitIfNeeded();

        {
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact finished extent #" << n << " containing " << nrecords
                  << " documents (" << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' '
                  << static_cast<unsigned>(op*100.0)/100 << endl;
        }
    }

    return skipped;
}
StatusWith<DiskLoc> Collection::updateDocument( OperationContext* txn,
                                                const DiskLoc& oldLocation,
                                                const BSONObj& objNew,
                                                bool enforceQuota,
                                                OpDebug* debug ) {
    Record* oldRecord = _recordStore->recordFor( oldLocation );
    BSONObj objOld( oldRecord->data() );

    if ( objOld.hasElement( "_id" ) ) {
        BSONElement oldId = objOld["_id"];
        BSONElement newId = objNew["_id"];
        if ( oldId != newId )
            return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                        "in Collection::updateDocument _id mismatch",
                                        13596 );
    }

    if ( ns().coll() == "system.users" ) {
        // XXX - andy and spencer think this should go away now
        V2UserDocumentParser parser;
        Status s = parser.checkValidUserDocument(objNew);
        if ( !s.isOK() )
            return StatusWith<DiskLoc>( s );
    }

    /* duplicate key check. we descend the btree twice - once for this check, and once for the
       actual inserts, further below. that is suboptimal, but it's pretty complicated to do it
       the other way without rollbacks...
    */
    OwnedPointerMap<IndexDescriptor*,UpdateTicket> updateTickets;
    IndexCatalog::IndexIterator ii = _indexCatalog.getIndexIterator( true );
    while ( ii.more() ) {
        IndexDescriptor* descriptor = ii.next();
        IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );
        InsertDeleteOptions options;
        options.logIfError = false;
        options.dupsAllowed =
            !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique())
            || replset::ignoreUniqueIndex(descriptor);
        UpdateTicket* updateTicket = new UpdateTicket();
        updateTickets.mutableMap()[descriptor] = updateTicket;
        Status ret = iam->validateUpdate(objOld, objNew, oldLocation, options, updateTicket );
        if ( !ret.isOK() ) {
            return StatusWith<DiskLoc>( ret );
        }
    }

    // this can callback into Collection::recordStoreGoingToMove
    StatusWith<DiskLoc> newLocation = _recordStore->updateRecord( txn,
                                                                  oldLocation,
                                                                  objNew.objdata(),
                                                                  objNew.objsize(),
                                                                  enforceQuota ? largestFileNumberInQuota() : 0,
                                                                  this );

    if ( !newLocation.isOK() ) {
        return newLocation;
    }

    _infoCache.notifyOfWriteOp();

    if ( newLocation.getValue() != oldLocation ) {

        if ( debug ) {
            if (debug->nmoved == -1) // default of -1 rather than 0
                debug->nmoved = 1;
            else
                debug->nmoved += 1;
        }

        _indexCatalog.indexRecord(txn, objNew, newLocation.getValue());

        return newLocation;
    }

    if ( debug )
        debug->keyUpdates = 0;

    ii = _indexCatalog.getIndexIterator( true );
    while ( ii.more() ) {
        IndexDescriptor* descriptor = ii.next();
        IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

        int64_t updatedKeys;
        Status ret = iam->update(txn, *updateTickets.mutableMap()[descriptor], &updatedKeys);
        if ( !ret.isOK() )
            return StatusWith<DiskLoc>( ret );
        if ( debug )
            debug->keyUpdates += updatedKeys;
    }

    // Broadcast the mutation so that query results stay correct.
    _cursorCache.invalidateDocument(oldLocation, INVALIDATION_MUTATION);

    return newLocation;
}
BSONObj Collection::docFor(const DiskLoc& loc) const {
    Record* rec = _recordStore->recordFor( loc );
    return BSONObj( rec->data() );
}
int test(const unsigned pageSize) {
    // Bookkeeping
    unordered_map<TID, unsigned> values;      // TID -> testData entry
    unordered_map<unsigned, unsigned> usage;  // pageID -> bytes used within this page

    // Set everything up
    shared_ptr<BufferManager> bm(new BufferManager(100, shared_ptr<FileManager>(new FileManager("test_data"))));
    SPSegment sp(3, bm, true);
    Random64 rnd;

    // Insert some records
    for (unsigned i=0; i<maxInserts; ++i) {
        // Select string/record to insert
        uint64_t r = rnd.next()%testData.size();
        const string s = testData[r];

        // Check that there is space available for 's'
        bool full = true;
        for (unsigned p=0; p<initialSize; ++p) {
            if (loadFactor*pageSize - usage[p] > s.size()) {
#ifndef SILENT
                cerr << "in page " << hex << (p+1) << dec << " expecting " << (loadFactor*pageSize - usage[p]) << endl;
#endif
                full = false;
                break;
            }
        }
        if (full)
            break;

        // Insert record
        TID tid = sp.insert(Record(s.size(), s.c_str()));
        EXPECT_EQ(values.end(), values.find(tid)) << "TIDs should not be overwritten";
        values[tid]=r;

        // extract the pageId from the TID
        // PageIDs start at 1
        unsigned pageId = util::extractPageIDFromTID(tid) - 1;
        EXPECT_LT(pageId, initialSize) << "pageId should be within [0, initialSize)";
        usage[pageId]+=s.size();
    }

    // Lookup & delete some records
    for (unsigned i=0; i<maxDeletes; ++i) {
        // Select operation
        bool del = rnd.next()%10 == 0;

        // Select victim
        TID tid = values.begin()->first;
        unsigned pageId = util::extractPageIDFromTID(tid)-1;
        const std::string& value = testData[(values.begin()->second)%testData.size()];
        unsigned len = value.size();

        // Lookup
        Record rec = sp.lookup(tid);
        EXPECT_EQ(len, rec.len());
        EXPECT_EQ(0, memcmp(rec.data(), value.c_str(), len));

        if (del) {
            // do delete
            EXPECT_TRUE(sp.remove(tid));
            values.erase(tid);
            usage[pageId]-=len;
        }
    }

    // Insert some records, again
    for (unsigned i=0; i<maxInserts; ++i) {
        // Select string/record to insert
        uint64_t r = rnd.next()%testData.size();
        const string s = testData[r];

        // Check that there is space available for 's'
        bool full = true;
        for (unsigned p=0; p<initialSize; ++p) {
            if (loadFactor*pageSize - usage[p] > s.size()) {
#ifndef SILENT
                cerr << "in page " << hex << (p+1) << dec << " expecting " << (loadFactor*pageSize - usage[p]) << endl;
#endif
                full = false;
                break;
            }
        }
        if (full)
            break;

        // Insert record
        TID tid = sp.insert(Record(s.size(), s.c_str()));
        EXPECT_EQ(values.end(), values.find(tid)) << "TIDs should not be overwritten";
        values[tid]=r;

        // extract the pageId from the TID
        // PageIDs start at 1
        unsigned pageId = util::extractPageIDFromTID(tid) - 1;
        EXPECT_LT(pageId, initialSize) << "pageId should be within [0, initialSize)";
        usage[pageId]+=s.size();
    }

    /*// Update some values ('usage' counter invalid from here on)
    for (unsigned i=0; i<maxUpdates; ++i) {
        // Select victim
        TID tid = values.begin()->first;

        // Select new string/record
        uint64_t r = rnd.next()%testData.size();
        const string s = testData[r];

        // Replace old with new value
        sp.update(tid, Record(s.size(), s.c_str()));
        values[tid]=r;
    }*/

    // Lookups
    for (auto p : values) {
        TID tid = p.first;
        const std::string& value = testData[p.second];
        unsigned len = value.size();
        Record rec = sp.lookup(tid);
        EXPECT_EQ(len, rec.len());
        EXPECT_EQ(0, memcmp(rec.data(), value.c_str(), len));
    }
    return 0;
}
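// Hedged sketch of the TID helpers the test above relies on. The test only
// assumes that a TID packs a page id (starting at 1) next to a slot id; the
// 16-bit slot split below is an illustrative assumption, not necessarily the
// project's actual encoding.
namespace util {
    inline uint64_t extractPageIDFromTID(TID tid) { return tid >> 16; }    // high bits: page id
    inline uint64_t extractSlotIDFromTID(TID tid) { return tid & 0xffff; } // low bits: slot id
}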
void RecordStoreV1Base::deleteRecord( OperationContext* txn, const DiskLoc& dl ) {
    Record* todelete = recordFor( dl );
    invariant( todelete->netLength() >= 4 ); // this is required for defensive code

    /* remove ourself from the record next/prev chain */
    {
        if ( todelete->prevOfs() != DiskLoc::NullOfs ) {
            DiskLoc prev = getPrevRecordInExtent( txn, dl );
            Record* prevRecord = recordFor( prev );
            txn->recoveryUnit()->writingInt( prevRecord->nextOfs() ) = todelete->nextOfs();
        }

        if ( todelete->nextOfs() != DiskLoc::NullOfs ) {
            DiskLoc next = getNextRecord( txn, dl );
            Record* nextRecord = recordFor( next );
            txn->recoveryUnit()->writingInt( nextRecord->prevOfs() ) = todelete->prevOfs();
        }
    }

    /* remove ourself from extent pointers */
    {
        DiskLoc extentLoc = todelete->myExtentLoc(dl);
        Extent *e = _getExtent( txn, extentLoc );
        if ( e->firstRecord == dl ) {
            txn->recoveryUnit()->writing(&e->firstRecord);
            if ( todelete->nextOfs() == DiskLoc::NullOfs )
                e->firstRecord.Null();
            else
                e->firstRecord.set(dl.a(), todelete->nextOfs() );
        }
        if ( e->lastRecord == dl ) {
            txn->recoveryUnit()->writing(&e->lastRecord);
            if ( todelete->prevOfs() == DiskLoc::NullOfs )
                e->lastRecord.Null();
            else
                e->lastRecord.set(dl.a(), todelete->prevOfs() );
        }
    }

    /* add to the free list */
    {
        _details->incrementStats( txn, -1 * todelete->netLength(), -1 );

        if ( _isSystemIndexes ) {
            /* temp: if in system.indexes, don't reuse, and zero out: we want to be
               careful until validated more, as IndexDetails has pointers to this
               disk location. so an incorrectly done remove would cause a lot of problems.
            */
            memset( txn->recoveryUnit()->writingPtr(todelete, todelete->lengthWithHeaders() ),
                    0, todelete->lengthWithHeaders() );
        }
        else {
            // this is defensive so we can detect if we are still using a location
            // that was deleted
            memset(txn->recoveryUnit()->writingPtr(todelete->data(), 4), 0xee, 4);
            addDeletedRec(txn, dl);
        }
    }
}
TEST(SlottedPage, Randomized) {
    const uint32_t kTestScale = 1;
    const uint32_t iterations = 10000;
    util::Random ranny;

    for(uint32_t j=0; j<kTestScale; j++) {
        std::unordered_map<RecordId, std::string> reference;
        SlottedPage* slottedPage = static_cast<SlottedPage*>(malloc(kPageSize));
        slottedPage->initialize();

        // Add some initial data
        for(uint32_t i=0; i<kPageSize/3/32; i++) {
            std::string data = util::randomWord(ranny, 8, 64);
            if(slottedPage->getBytesFreeForRecord() < data.size())
                continue;
            RecordId id = slottedPage->insert(Record(data));
            // std::cout << "insert " << id << " -> " << data << std::endl;
            ASSERT_TRUE(reference.count(id) == 0);
            reference.insert(make_pair(id, data));
        }

        // Work on it
        for(uint32_t i=0; i<iterations; i++) {
            int32_t operation = ranny.rand() % 100;

            // Do insert
            if(operation <= 40) {
                std::string data = util::randomWord(ranny, 8, 64);
                if(slottedPage->getBytesFreeForRecord() < data.size())
                    continue;
                RecordId id = slottedPage->insert(Record(data));
                // std::cout << "insert " << id << " -> " << data << std::endl;
                ASSERT_TRUE(reference.count(id) == 0);
                reference.insert(make_pair(id, data));
            }
            // Do remove
            else if(operation <= 80) {
                if(reference.empty())
                    continue;
                RecordId id = reference.begin()->first;
                // std::cout << "remove " << id << std::endl;
                Record record = slottedPage->lookup(id);
                ASSERT_EQ(slottedPage->isReference(id), kInvalidTupleId);
                ASSERT_EQ(std::string(record.data(), record.size()), reference.begin()->second);
                slottedPage->remove(id);
                reference.erase(reference.begin());
            }
            // Do update
            else if(operation <= 98) {
                if(reference.empty())
                    continue;
                RecordId id = reference.begin()->first;
                Record record = slottedPage->lookup(id);
                ASSERT_EQ(slottedPage->isReference(id), kInvalidTupleId);
                ASSERT_EQ(std::string(record.data(), record.size()), reference.begin()->second);
                std::string data = util::randomWord(ranny, 8, 64);
                if(slottedPage->canUpdateRecord(id, Record(data))) {
                    slottedPage->update(id, Record(data));
                    reference.erase(reference.begin());
                    reference.insert(make_pair(id, data));
                }
            }
            // Do consistency check
            else if(operation<=99 || i==iterations-1 || i==0) {
                ASSERT_TRUE(slottedPage->isValid());
                auto records = slottedPage->getAllRecords(kInvalidPageId); // page id does not matter
                ASSERT_EQ(records.size(), reference.size());
                for(auto& iter : records) {
                    ASSERT_TRUE(reference.count(iter.first.toRecordId()) > 0);
                    ASSERT_EQ(string(iter.second.data(), iter.second.size()),
                              reference.find(iter.first.toRecordId())->second);
                }
                continue;
            }
        }
        free(slottedPage);
    }
}
StatusWith<DiskLoc> Collection::updateDocument( const DiskLoc& oldLocation,
                                                const BSONObj& objNew,
                                                bool enforceQuota,
                                                OpDebug* debug ) {
    Record* oldRecord = getExtentManager()->recordFor( oldLocation );
    BSONObj objOld = BSONObj::make( oldRecord );

    if ( objOld.hasElement( "_id" ) ) {
        BSONElement oldId = objOld["_id"];
        BSONElement newId = objNew["_id"];
        if ( oldId != newId )
            return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                        "in Collection::updateDocument _id mismatch",
                                        13596 );
    }

    if ( ns().coll() == "system.users" ) {
        // XXX - andy and spencer think this should go away now
        V2UserDocumentParser parser;
        Status s = parser.checkValidUserDocument(objNew);
        if ( !s.isOK() )
            return StatusWith<DiskLoc>( s );
    }

    /* duplicate key check. we descend the btree twice - once for this check, and once for the
       actual inserts, further below. that is suboptimal, but it's pretty complicated to do it
       the other way without rollbacks...
    */
    OwnedPointerVector<UpdateTicket> updateTickets;
    updateTickets.mutableVector().resize(_indexCatalog.numIndexesTotal());
    for (int i = 0; i < _indexCatalog.numIndexesTotal(); ++i) {
        IndexDescriptor* descriptor = _indexCatalog.getDescriptor( i );
        IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );
        InsertDeleteOptions options;
        options.logIfError = false;
        options.dupsAllowed =
            !(KeyPattern::isIdKeyPattern(descriptor->keyPattern()) || descriptor->unique())
            || ignoreUniqueIndex(descriptor);
        updateTickets.mutableVector()[i] = new UpdateTicket();
        Status ret = iam->validateUpdate(objOld, objNew, oldLocation, options,
                                         updateTickets.mutableVector()[i]);
        if ( !ret.isOK() ) {
            return StatusWith<DiskLoc>( ret );
        }
    }

    if ( oldRecord->netLength() < objNew.objsize() ) {
        // doesn't fit, have to move to new location

        if ( _details->isCapped() )
            return StatusWith<DiskLoc>( ErrorCodes::InternalError,
                                        "failing update: objects in a capped ns cannot grow",
                                        10003 );

        moveCounter.increment();
        _details->paddingTooSmall();

        // unindex old record, don't delete
        // this way, if inserting new doc fails, we can re-index this one
        ClientCursor::aboutToDelete(_ns.ns(), _details, oldLocation);
        _indexCatalog.unindexRecord( objOld, oldLocation, true );

        if ( debug ) {
            if (debug->nmoved == -1) // default of -1 rather than 0
                debug->nmoved = 1;
            else
                debug->nmoved += 1;
        }

        StatusWith<DiskLoc> loc = insertDocument( objNew, enforceQuota );

        if ( loc.isOK() ) {
            // insert successful, now lets deallocate the old location
            // remember its already unindexed
            _recordStore.deallocRecord( oldLocation, oldRecord );
        }
        else {
            // new doc insert failed, so lets re-index the old document and location
            _indexCatalog.indexRecord( objOld, oldLocation );
        }

        return loc;
    }

    _infoCache.notifyOfWriteOp();
    _details->paddingFits();

    if ( debug )
        debug->keyUpdates = 0;

    for (int i = 0; i < _indexCatalog.numIndexesTotal(); ++i) {
        IndexDescriptor* descriptor = _indexCatalog.getDescriptor( i );
        IndexAccessMethod* iam = _indexCatalog.getIndex( descriptor );

        int64_t updatedKeys;
        Status ret = iam->update(*updateTickets.vector()[i], &updatedKeys);
        if ( !ret.isOK() )
            return StatusWith<DiskLoc>( ret );
        if ( debug )
            debug->keyUpdates += updatedKeys;
    }

    // update in place
    int sz = objNew.objsize();
    memcpy(getDur().writingPtr(oldRecord->data(), sz), objNew.objdata(), sz);
    return StatusWith<DiskLoc>( oldLocation );
}