StatusWith<DiskLoc> Collection::insertDocument( const BSONObj& docToInsert, bool enforceQuota ) { int lenWHdr = _details->getRecordAllocationSize( docToInsert.objsize() + Record::HeaderSize ); fassert( 17208, lenWHdr >= ( docToInsert.objsize() + Record::HeaderSize ) ); if ( _details->isCapped() ) { // TOOD: old god not done Status ret = _indexCatalog.checkNoIndexConflicts( docToInsert ); uassert(17209, "duplicate key insert for unique index of capped collection", ret.isOK() ); } // TODO: for now, capped logic lives inside NamespaceDetails, which is hidden // under the RecordStore, this feels broken since that should be a // collection access method probably StatusWith<DiskLoc> loc = _recordStore.allocRecord( lenWHdr, enforceQuota ? largestFileNumberInQuota() : 0 ); if ( !loc.isOK() ) return loc; Record *r = loc.getValue().rec(); fassert( 17210, r->lengthWithHeaders() >= lenWHdr ); // copy the data r = reinterpret_cast<Record*>( getDur().writingPtr(r, lenWHdr) ); memcpy( r->data(), docToInsert.objdata(), docToInsert.objsize() ); addRecordToRecListInExtent(r, loc.getValue()); // XXX move down into record store _details->incrementStats( r->netLength(), 1 ); // TOOD: old god not done _infoCache.notifyOfWriteOp(); try { _indexCatalog.indexRecord( docToInsert, loc.getValue() ); } catch( AssertionException& e ) { if ( _details->isCapped() ) { return StatusWith<DiskLoc>( ErrorCodes::InternalError, str::stream() << "unexpected index insertion failure on" << " capped collection" << e.toString() << " - collection and its index will not match" ); } // normal case -- we can roll back deleteDocument( loc.getValue(), false, true, NULL ); throw; } // TODO: this is what the old code did, but is it correct? _details->paddingFits(); return loc; }
IndexCatalog::IndexBuildBlock* halfAddIndex(const std::string& key) { string name = key + "_1"; BSONObj indexInfo = BSON( "v" << 1 << "key" << BSON( key << 1 ) << "ns" << _ns << "name" << name ); int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize; const char* systemIndexes = "unittests.system.indexes"; DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes, nsdetails( systemIndexes ), lenWHdr, false ); Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(), lenWHdr ) ); memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() ); addRecordToRecListInExtent( infoRecord, infoLoc ); return new IndexCatalog::IndexBuildBlock( _ctx.ctx().db()->getCollection( _ns )->getIndexCatalog(), name, infoLoc ); }
/** @return IndexDetails for a new index on a:1, with the info field populated. */ IndexDetails& addIndexWithInfo() { BSONObj indexInfo = BSON( "v" << 1 << "key" << BSON( "a" << 1 ) << "ns" << _ns << "name" << "a_1" ); int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize; const char* systemIndexes = "unittests.system.indexes"; DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes, nsdetails( systemIndexes ), lenWHdr, false ); Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(), lenWHdr ) ); memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() ); addRecordToRecListInExtent( infoRecord, infoLoc ); IndexDetails& id = nsdetails( _ns )->getNextIndexDetails( _ns ); nsdetails( _ns )->addIndex(); id.info.writing() = infoLoc; return id; }
/** @return IndexDetails for a new index on a:1, with the info field populated. */ IndexDescriptor* addIndexWithInfo() { BSONObj indexInfo = BSON( "v" << 1 << "key" << BSON( "a" << 1 ) << "ns" << _ns << "name" << "a_1" ); int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize; const char* systemIndexes = "unittests.system.indexes"; DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes, nsdetails( systemIndexes ), lenWHdr, false ); Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(), lenWHdr ) ); memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() ); addRecordToRecListInExtent( infoRecord, infoLoc ); IndexCatalog::IndexBuildBlock blk( collection()->getIndexCatalog(), "a_1", infoLoc ); blk.success(); return collection()->getIndexCatalog()->findIndexByName( "a_1" ); }
/** @return number of skipped (invalid) documents */ unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n, const scoped_array<IndexSpec> &indexSpecs, scoped_array<SortPhaseOne>& phase1, int nidx, bool validate, double pf, int pb) { log() << "compact extent #" << n << endl; unsigned oldObjSize = 0; // we'll report what the old padding was unsigned oldObjSizeWithPadding = 0; Extent *e = ext.ext(); e->assertOk(); assert( e->validates() ); unsigned skipped = 0; { // the next/prev pointers within the extent might not be in order so we first page the whole thing in // sequentially log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl; Timer t; MAdvise adv(e, e->length, MAdvise::Sequential); const char *p = (const char *) e; for( int i = 0; i < e->length; i += 4096 ) { faux += p[i]; } int ms = t.millis(); if( ms > 1000 ) log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl; } { log() << "compact copying records" << endl; unsigned totalSize = 0; int nrecs = 0; DiskLoc L = e->firstRecord; if( !L.isNull() ) { while( 1 ) { Record *recOld = L.rec(); L = recOld->nextInExtent(L); nrecs++; BSONObj objOld(recOld); if( !validate || objOld.valid() ) { unsigned sz = objOld.objsize(); oldObjSize += sz; oldObjSizeWithPadding += recOld->netLength(); unsigned lenWHdr = sz + Record::HeaderSize; unsigned lenWPadding = lenWHdr; { lenWPadding = static_cast<unsigned>(pf*lenWPadding); lenWPadding += pb; lenWPadding = lenWPadding & quantizeMask(lenWPadding); if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { lenWPadding = lenWHdr; } } totalSize += lenWPadding; DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false); uassert(14024, "compact error out of space during compaction", !loc.isNull()); Record *recNew = loc.rec(); recNew = (Record *) getDur().writingPtr(recNew, lenWHdr); addRecordToRecListInExtent(recNew, loc); memcpy(recNew->data, objOld.objdata(), sz); { // extract keys for all indexes we will be rebuilding for( int x = 0; x < nidx; x++ ) { phase1[x].addKeys(indexSpecs[x], objOld, loc); } } } else { if( ++skipped <= 10 ) log() << "compact skipping invalid object" << endl; } if( L.isNull() ) { // we just did the very last record from the old extent. it's still pointed to // by the old extent ext, but that will be fixed below after this loop break; } // remove the old records (orphan them) periodically so our commit block doesn't get too large bool stopping = false; RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0; if( stopping || getDur().aCommitIsNeeded() ) { e->firstRecord.writing() = L; Record *r = L.rec(); getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs; getDur().commitIfNeeded(); killCurrentOp.checkForInterrupt(false); } } } // if !L.isNull() assert( d->firstExtent == ext ); assert( d->lastExtent != ext ); DiskLoc newFirst = e->xnext; d->firstExtent.writing() = newFirst; newFirst.ext()->xprev.writing().Null(); getDur().writing(e)->markEmpty(); freeExtents(ext,ext); getDur().commitIfNeeded(); { double op = 1.0; if( oldObjSize ) op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize; log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB" << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100 << endl; } } return skipped; }
/** @return number of skipped (invalid) documents */ unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n, const scoped_array<IndexSpec> &indexSpecs, scoped_array<SortPhaseOne>& phase1, int nidx, bool validate, double pf, int pb) { log() << "compact begin extent #" << n << " for namespace " << ns << endl; unsigned oldObjSize = 0; // we'll report what the old padding was unsigned oldObjSizeWithPadding = 0; Extent *e = diskloc.ext(); e->assertOk(); verify( e->validates() ); unsigned skipped = 0; { // the next/prev pointers within the extent might not be in order so we first page the whole thing in // sequentially log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl; Timer t; MongoDataFile* mdf = cc().database()->getFile( diskloc.a() ); HANDLE fd = mdf->getFd(); int offset = diskloc.getOfs(); Extent* ext = diskloc.ext(); size_t length = ext->length; touch_pages(fd, offset, length, ext); int ms = t.millis(); if( ms > 1000 ) log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl; } { log() << "compact copying records" << endl; long long datasize = 0; long long nrecords = 0; DiskLoc L = e->firstRecord; if( !L.isNull() ) { while( 1 ) { Record *recOld = L.rec(); L = recOld->nextInExtent(L); BSONObj objOld = BSONObj::make(recOld); if( !validate || objOld.valid() ) { nrecords++; unsigned sz = objOld.objsize(); oldObjSize += sz; oldObjSizeWithPadding += recOld->netLength(); unsigned lenWHdr = sz + Record::HeaderSize; unsigned lenWPadding = lenWHdr; { lenWPadding = static_cast<unsigned>(pf*lenWPadding); lenWPadding += pb; lenWPadding = lenWPadding & quantizeMask(lenWPadding); if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { lenWPadding = lenWHdr; } } DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false); uassert(14024, "compact error out of space during compaction", !loc.isNull()); Record *recNew = loc.rec(); datasize += recNew->netLength(); recNew = (Record *) getDur().writingPtr(recNew, lenWHdr); addRecordToRecListInExtent(recNew, loc); memcpy(recNew->data(), objOld.objdata(), sz); { // extract keys for all indexes we will be rebuilding for( int x = 0; x < nidx; x++ ) { phase1[x].addKeys(indexSpecs[x], objOld, loc); } } } else { if( ++skipped <= 10 ) log() << "compact skipping invalid object" << endl; } if( L.isNull() ) { // we just did the very last record from the old extent. it's still pointed to // by the old extent ext, but that will be fixed below after this loop break; } // remove the old records (orphan them) periodically so our commit block doesn't get too large bool stopping = false; RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0; if( stopping || getDur().aCommitIsNeeded() ) { e->firstRecord.writing() = L; Record *r = L.rec(); getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs; getDur().commitIfNeeded(); killCurrentOp.checkForInterrupt(false); } } } // if !L.isNull() verify( d->firstExtent == diskloc ); verify( d->lastExtent != diskloc ); DiskLoc newFirst = e->xnext; d->firstExtent.writing() = newFirst; newFirst.ext()->xprev.writing().Null(); getDur().writing(e)->markEmpty(); freeExtents( diskloc, diskloc ); // update datasize/record count for this namespace's extent { NamespaceDetails::Stats *s = getDur().writing(&d->stats); s->datasize += datasize; s->nrecords += nrecords; } getDur().commitIfNeeded(); { double op = 1.0; if( oldObjSize ) op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize; log() << "compact finished extent #" << n << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)" << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100 << endl; } } return skipped; }
DiskLoc DataFileMgr::insert(const char* ns, const void* obuf, int32_t len, bool mayInterrupt, bool god, bool mayAddIndex, bool* addedID) { Database* database = cc().database(); bool wouldAddIndex = false; massert( 10093 , "cannot insert into reserved $ collection", god || NamespaceString::normal( ns ) ); uassert( 10094 , str::stream() << "invalid ns: " << ns , isValidNS( ns ) ); { const char *sys = strstr(ns, "system."); if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) ) return DiskLoc(); } bool addIndex = wouldAddIndex && mayAddIndex; Collection* collection = database->getCollection( ns ); if ( collection == NULL ) { collection = database->createCollection( ns, false, NULL ); int ies = Extent::initialSize(len); if( str::contains(ns, '$') && len + Record::HeaderSize >= BtreeData_V1::BucketSize - 256 && len + Record::HeaderSize <= BtreeData_V1::BucketSize + 256 ) { // probably an index. so we pick a value here for the first extent instead of using // initialExtentSize() which is more for user collections. // TODO: we could look at the # of records in the parent collection to be smarter here. ies = (32+4) * 1024; } collection->increaseStorageSize( ies, false); if ( !god ) ensureIdIndexForNewNs(ns); } NamespaceDetails* d = collection->details(); string tabletoidxns; Collection* collectionToIndex = 0; NamespaceDetails* tableToIndex = 0; BSONObj fixedIndexObject; if ( addIndex ) { verify( obuf ); BSONObj io((const char *) obuf); tabletoidxns = io.getStringField( "ns" ); uassert(10096, "invalid ns to index", tabletoidxns.find( '.' ) != string::npos); massert(10097, str::stream() << "trying to create index on wrong db " << " db: " << database->name() << " collection: " << tabletoidxns, database->ownsNS( tabletoidxns ) ); collectionToIndex = database->getCollection( tabletoidxns ); if ( !collectionToIndex ) { collectionToIndex = database->createCollection( tabletoidxns, false, NULL ); verify( collectionToIndex ); if ( !god ) ensureIdIndexForNewNs( tabletoidxns.c_str() ); } tableToIndex = collectionToIndex->details(); Status status = collectionToIndex->getIndexCatalog()->okToAddIndex( io ); if ( status.code() == ErrorCodes::IndexAlreadyExists ) { // dup index, we ignore return DiskLoc(); } uassert( 17199, str::stream() << "cannot build index on " << tabletoidxns << " because of " << status.toString(), status.isOK() ); if( !prepareToBuildIndex(io, mayInterrupt, god, tabletoidxns ) ) { // prepare creates _id itself, or this indicates to fail the build silently (such // as if index already exists) return DiskLoc(); } fixedIndexObject = IndexCatalog::fixIndexSpec( io ); obuf = fixedIndexObject.objdata(); len = fixedIndexObject.objsize(); } IDToInsert idToInsert; // only initialized if needed if( !god ) { /* Check if we have an _id field. If we don't, we'll add it. Note that btree buckets which we insert aren't BSONObj's, but in that case god==true. */ BSONObj io((const char *) obuf); BSONElement idField = io.getField( "_id" ); uassert( 10099 , "_id cannot be an array", idField.type() != Array ); // we don't add _id for capped collections in local as they don't have an _id index if( idField.eoo() && !wouldAddIndex && nsToDatabase( ns ) != "local" && d->haveIdIndex() ) { if( addedID ) *addedID = true; idToInsert.init(); len += idToInsert.size(); } BSONElementManipulator::lookForTimestamps( io ); } int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize ); fassert( 16440, lenWHdr >= ( len + Record::HeaderSize ) ); // If the collection is capped, check if the new object will violate a unique index // constraint before allocating space. if ( d->isCapped() && !god) { BSONObj temp = BSONObj( reinterpret_cast<const char *>( obuf ) ); Status ret = collection->getIndexCatalog()->checkNoIndexConflicts( temp ); uassert(12582, "duplicate key insert for unique index of capped collection", ret.isOK() ); } DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god); if ( loc.isNull() ) { log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->isCapped() << endl; verify(d->isCapped()); return DiskLoc(); } Record *r = loc.rec(); { verify( r->lengthWithHeaders() >= lenWHdr ); r = (Record*) getDur().writingPtr(r, lenWHdr); if( idToInsert.needed() ) { /* a little effort was made here to avoid a double copy when we add an ID */ int originalSize = *((int*) obuf); ((int&)*r->data()) = originalSize + idToInsert.size(); memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size()); memcpy(r->data()+4+idToInsert.size(), ((char*)obuf)+4, originalSize-4); } else { if( obuf ) // obuf can be null from internal callers memcpy(r->data(), obuf, len); } } addRecordToRecListInExtent(r, loc); d->incrementStats( r->netLength(), 1 ); // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket if ( !god ) collection->infoCache()->notifyOfWriteOp(); if ( tableToIndex ) { insert_makeIndex(collectionToIndex, loc, mayInterrupt); } /* add this record to our indexes */ if ( d->getTotalIndexCount() > 0 ) { try { BSONObj obj(r->data()); collection->getIndexCatalog()->indexRecord(obj, loc); } catch( AssertionException& e ) { // should be a dup key error on _id index if( tableToIndex || d->isCapped() ) { massert( 12583, "unexpected index insertion failure on capped collection", !d->isCapped() ); string s = e.toString(); s += " : on addIndex/capped - collection and its index will not match"; setLastError(0, s.c_str()); error() << s << endl; } else { // normal case -- we can roll back _deleteRecord(d, ns, r, loc); throw; } } } d->paddingFits(); return loc; }
/** @return number of skipped (invalid) documents */ unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n, int nidx, bool validate, double pf, int pb, bool useDefaultPadding, bool preservePadding) { log() << "compact begin extent #" << n << " for namespace " << ns << endl; unsigned oldObjSize = 0; // we'll report what the old padding was unsigned oldObjSizeWithPadding = 0; Extent *e = diskloc.ext(); e->assertOk(); verify( e->validates(diskloc) ); unsigned skipped = 0; Database* db = cc().database(); { // the next/prev pointers within the extent might not be in order so we first // page the whole thing in sequentially log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl; Timer t; Extent* ext = db->getExtentManager().getExtent( diskloc ); size_t length = ext->length; touch_pages( reinterpret_cast<const char*>(ext), length ); int ms = t.millis(); if( ms > 1000 ) log() << "compact end paging in " << ms << "ms " << e->length/1000000.0/ms << "MB/sec" << endl; } { log() << "compact copying records" << endl; long long datasize = 0; long long nrecords = 0; DiskLoc L = e->firstRecord; if( !L.isNull() ) { while( 1 ) { Record *recOld = L.rec(); L = db->getExtentManager().getNextRecordInExtent(L); BSONObj objOld = BSONObj::make(recOld); if( !validate || objOld.valid() ) { nrecords++; unsigned sz = objOld.objsize(); oldObjSize += sz; oldObjSizeWithPadding += recOld->netLength(); unsigned lenWHdr = sz + Record::HeaderSize; unsigned lenWPadding = lenWHdr; // if we are preserving the padding, the record should not change size if (preservePadding) { lenWPadding = recOld->lengthWithHeaders(); } // maintain UsePowerOf2Sizes if no padding values were passed in else if (d->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) && useDefaultPadding) { lenWPadding = d->quantizePowerOf2AllocationSpace(lenWPadding); } // otherwise use the padding values (pf and pb) that were passed in else { lenWPadding = static_cast<unsigned>(pf*lenWPadding); lenWPadding += pb; lenWPadding = lenWPadding & quantizeMask(lenWPadding); } if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) { lenWPadding = lenWHdr; } DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false); uassert(14024, "compact error out of space during compaction", !loc.isNull()); Record *recNew = loc.rec(); datasize += recNew->netLength(); recNew = (Record *) getDur().writingPtr(recNew, lenWHdr); addRecordToRecListInExtent(recNew, loc); memcpy(recNew->data(), objOld.objdata(), sz); } else { if( ++skipped <= 10 ) log() << "compact skipping invalid object" << endl; } if( L.isNull() ) { // we just did the very last record from the old extent. it's still pointed to // by the old extent ext, but that will be fixed below after this loop break; } // remove the old records (orphan them) periodically so our commit block doesn't get too large bool stopping = false; RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0; if( stopping || getDur().aCommitIsNeeded() ) { e->firstRecord.writing() = L; Record *r = L.rec(); getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs; getDur().commitIfNeeded(); killCurrentOp.checkForInterrupt(false); } } } // if !L.isNull() verify( d->firstExtent() == diskloc ); verify( d->lastExtent() != diskloc ); DiskLoc newFirst = e->xnext; d->firstExtent().writing() = newFirst; newFirst.ext()->xprev.writing().Null(); getDur().writing(e)->markEmpty(); cc().database()->getExtentManager().freeExtents( diskloc, diskloc ); // update datasize/record count for this namespace's extent d->incrementStats( datasize, nrecords ); getDur().commitIfNeeded(); { double op = 1.0; if( oldObjSize ) op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize; log() << "compact finished extent #" << n << " containing " << nrecords << " documents (" << datasize/1000000.0 << "MB)" << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100 << endl; } } return skipped; }