// Pages the data of every extent of namespace 'ns' into memory (the "touch"
// operation).  The extent locations are gathered while holding the database
// read lock; the slow page-touching phase then runs with the DB lock
// temporarily released (Lock::TempRelease) while LockMongoFilesShared keeps
// the datafiles mapped so the collected fds/extent pointers remain valid.
void touchNs( const std::string& ns ) {
    std::vector< touch_location > ranges;
    Client::ReadContext ctx(ns);
    {
        NamespaceDetails *nsd = nsdetails(ns.c_str());
        uassert( 16154, "namespace does not exist", nsd );

        // Walk the extent chain and record each extent's file descriptor,
        // file offset, and byte length.
        for( DiskLoc L = nsd->firstExtent; !L.isNull(); L = L.ext()->xnext )  {
            MongoDataFile* mdf = cc().database()->getFile( L.a() );
            massert( 16238, "can't fetch extent file structure", mdf );
            touch_location tl;
            tl.fd = mdf->getFd();
            tl.offset = L.getOfs();
            tl.ext = L.ext();
            tl.length = tl.ext->length;
            ranges.push_back(tl);
        }
    }

    // Keep files mapped, but give up the DB lock while touching pages so
    // other operations can proceed during the (potentially long) I/O.
    LockMongoFilesShared lk;
    Lock::TempRelease tr;

    std::string progress_msg = "touch " + ns + " extents";
    ProgressMeterHolder pm( cc().curop()->setMessage( progress_msg.c_str() , ranges.size() ) );
    for ( std::vector< touch_location >::iterator it = ranges.begin(); it != ranges.end(); ++it ) {
        touch_pages( it->fd, it->offset, it->length, it->ext );
        pm.hit();
        // Allow the operation to be killed between extents.
        killCurrentOp.checkForInterrupt(false);
    }
    pm.finished();
}
void NamespaceDetails::dumpExtents() { cout << "dumpExtents:" << endl; for ( DiskLoc i = _firstExtent; !i.isNull(); i = i.ext()->xnext ) { Extent *e = i.ext(); stringstream ss; e->dump(ss); cout << ss.str() << endl; } }
int nRecords() const { int count = 0; for ( DiskLoc i = nsd()->firstExtent; !i.isNull(); i = i.ext()->xnext ) { int fileNo = i.ext()->firstRecord.a(); if ( fileNo == -1 ) continue; for ( int j = i.ext()->firstRecord.getOfs(); j != DiskLoc::NullOfs; j = DiskLoc( fileNo, j ).rec()->nextOfs ) { ++count; } } ASSERT_EQUALS( count, nsd()->nrecords ); return count; }
// bypass standard alloc/insert routines to use the extent we want. static DiskLoc insert( DiskLoc ext, int i ) { BSONObjBuilder b; b.append( "a", i ); BSONObj o = b.done(); int len = o.objsize(); Extent *e = ext.ext(); int ofs; if ( e->lastRecord.isNull() ) ofs = ext.getOfs() + ( e->extentData - (char *)e ); else ofs = e->lastRecord.getOfs() + e->lastRecord.rec()->lengthWithHeaders; DiskLoc dl( ext.a(), ofs ); Record *r = dl.rec(); r->lengthWithHeaders = Record::HeaderSize + len; r->extentOfs = e->myLoc.getOfs(); r->nextOfs = DiskLoc::NullOfs; r->prevOfs = e->lastRecord.isNull() ? DiskLoc::NullOfs : e->lastRecord.getOfs(); memcpy( r->data, o.objdata(), len ); if ( e->firstRecord.isNull() ) e->firstRecord = dl; else e->lastRecord.rec()->nextOfs = ofs; e->lastRecord = dl; return dl; }
// Removes every document from a capped collection without dropping it.
// Preconditions (massert): the collection is capped, has no indexes, and no
// background index build is running.  The extent chain itself is preserved;
// each extent is wiped and its space re-added to the deleted-record list.
void NamespaceDetails::emptyCappedCollection( const char *ns ) {
    DEV verify( this == nsdetails(ns) );
    massert( 13424, "collection must be capped", capped );
    massert( 13425, "background index build in progress", !indexBuildInProgress );
    massert( 13426, "indexes present", nIndexes == 0 );

    // Clear all references to this namespace.
    ClientCursor::invalidate( ns );
    NamespaceDetailsTransient::clearForPrefix( ns );

    // Get a writeable reference to 'this' and reset all pertinent
    // attributes.
    NamespaceDetails *t = writingWithoutExtra();

    t->cappedLastDelRecLastExtent() = DiskLoc();
    t->cappedListOfAllDeletedRecords() = DiskLoc();

    // preserve firstExtent/lastExtent
    t->capExtent = firstExtent;
    // NOTE(review): nrecords is assigned through the plain 'stats' member
    // rather than the writeable 't->stats' — confirm this is intentional
    // with respect to durability.
    t->stats.datasize = stats.nrecords = 0;
    // lastExtentSize preserve
    // nIndexes preserve 0
    // capped preserve true
    // max preserve
    t->paddingFactor = 1.0;
    t->flags = 0;
    t->capFirstNewRecord = DiskLoc();
    t->capFirstNewRecord.setInvalid();
    t->cappedLastDelRecLastExtent().setInvalid();
    // dataFileVersion preserve
    // indexFileVersion preserve
    t->multiKeyIndexBits = 0;
    t->reservedA = 0;
    t->extraOffset = 0;
    // indexBuildInProgress preserve 0
    memset(t->reserved, 0, sizeof(t->reserved));

    // Reset all existing extents and recreate the deleted list.
    for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) {
        // reuse() wipes the extent; save and restore the chain links so the
        // extent list ordering survives.
        DiskLoc prev = ext.ext()->xprev;
        DiskLoc next = ext.ext()->xnext;
        DiskLoc empty = ext.ext()->reuse( ns, true );
        ext.ext()->xprev.writing() = prev;
        ext.ext()->xnext.writing() = next;
        addDeletedRec( empty.drec(), empty );
    }
}
// Removes every document from a capped collection without dropping it
// (older variant: mutates members directly, no journaled-write wrapper).
// Preconditions (massert): the collection is capped, has no indexes, and no
// background index build is running.  Extents are kept but wiped and their
// space re-added to the deleted-record list.
void NamespaceDetails::emptyCappedCollection( const char *ns ) {
    DEV assert( this == nsdetails(ns) );
    massert( 13424, "collection must be capped", capped );
    massert( 13425, "background index build in progress", !backgroundIndexBuildInProgress );
    massert( 13426, "indexes present", nIndexes == 0 );

    // Drop any cached cursors/metadata referring to this namespace.
    ClientCursor::invalidate( ns );
    NamespaceDetailsTransient::clearForPrefix( ns );

    cappedLastDelRecLastExtent() = DiskLoc();
    cappedListOfAllDeletedRecords() = DiskLoc();

    // preserve firstExtent/lastExtent
    capExtent = firstExtent;
    stats.datasize = stats.nrecords = 0;
    // lastExtentSize preserve
    // nIndexes preserve 0
    // capped preserve true
    // max preserve
    paddingFactor = 1.0;
    flags = 0;
    capFirstNewRecord = DiskLoc();
    capFirstNewRecord.setInvalid();
    cappedLastDelRecLastExtent().setInvalid();
    // dataFileVersion preserve
    // indexFileVersion preserve
    multiKeyIndexBits = 0;
    reservedA = 0;
    extraOffset = 0;
    // backgroundIndexBuildInProgress preserve 0
    memset(reserved, 0, sizeof(reserved));

    // Wipe each extent in place; save/restore the chain links so extent
    // ordering is preserved, and recreate the deleted-record list.
    for( DiskLoc ext = firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) {
        DiskLoc prev = ext.ext()->xprev;
        DiskLoc next = ext.ext()->xnext;
        DiskLoc empty = ext.ext()->reuse( ns );
        ext.ext()->xprev = prev;
        ext.ext()->xnext = next;
        addDeletedRec( empty.drec(), empty );
    }
}
// Core of the compact operation for namespace 'ns': snapshots the index
// specs, drops all indexes, rewrites every extent via compactExtent(), and
// rebuilds the indexes from externally-sorted key data (phase1/precalced).
// 'pf'/'pb' are the padding factor and padding bytes passed through to
// compactExtent; 'validate' requests document validation during the copy.
// Returns false (with errmsg set) if dropping the old indexes fails.
bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate,
              BSONObjBuilder& result, double pf, int pb) {
    // this is a big job, so might as well make things tidy before we start just to be nice.
    getDur().commitIfNeeded();

    // Snapshot the extent list up front; extents are processed one by one below.
    list<DiskLoc> extents;
    for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext )
        extents.push_back(L);
    log() << "compact " << extents.size() << " extents" << endl;

    ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) );

    // same data, but might perform a little different after compact?
    NamespaceDetailsTransient::get(ns).clearQueryCache();

    int nidx = d->nIndexes;
    scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] );
    scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] );
    {
        NamespaceDetails::IndexIterator ii = d->ii();
        // For each existing index...
        for( int idxNo = 0; ii.more(); ++idxNo ) {
            // Build a new index spec based on the old index spec.
            BSONObjBuilder b;
            BSONObj::iterator i(ii.next().info.obj());
            while( i.more() ) {
                BSONElement e = i.next();
                if ( str::equals( e.fieldName(), "v" ) ) {
                    // Drop any preexisting index version spec.  The default index version will
                    // be used instead for the new index.
                    continue;
                }
                if ( str::equals( e.fieldName(), "background" ) ) {
                    // Create the new index in the foreground.
                    continue;
                }
                // Pass the element through to the new index spec.
                b.append(e);
            }
            // Add the new index spec to 'indexSpecs'.
            BSONObj o = b.obj().getOwned();
            indexSpecs[idxNo].reset(o);
            // Create an external sorter.
            phase1[idxNo].sorter.reset
                ( new BSONObjExternalSorter
                  // Use the default index interface, since the new index will be created
                  // with the default index version.
                  ( IndexInterface::defaultVersion(), o.getObjectField("key") ) );
            phase1[idxNo].sorter->hintNumObjects( d->stats.nrecords );
        }
    }

    // Detach (orphan) all deleted-record freelists; compact rebuilds free space.
    log() << "compact orphan deleted lists" << endl;
    for( int i = 0; i < Buckets; i++ ) {
        d->deletedList[i].writing().Null();
    }

    // Start over from scratch with our extent sizing and growth
    d->lastExtentSize=0;

    // before dropping indexes, at least make sure we can allocate one extent!
    uassert(14025, "compact error no space available to allocate",
            !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull());

    // note that the drop indexes call also invalidates all clientcursors for
    // the namespace, which is important and wanted here
    log() << "compact dropping indexes" << endl;
    BSONObjBuilder b;
    if( !dropIndexes(d, ns, "*", errmsg, b, true) ) {
        errmsg = "compact drop indexes failed";
        log() << errmsg << endl;
        return false;
    }

    getDur().commitIfNeeded();

    long long skipped = 0;
    int n = 0;

    // reset data size and record counts to 0 for this namespace
    // as we're about to tally them up again for each new extent
    {
        NamespaceDetails::Stats *s = getDur().writing(&d->stats);
        s->datasize = 0;
        s->nrecords = 0;
    }

    // Rewrite each old extent; compactExtent returns the number of skipped
    // (invalid) objects, which we surface in the command result.
    for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
        skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate, pf, pb);
        pm.hit();
    }

    if( skipped ) {
        result.append("invalidObjects", skipped);
    }

    verify( d->firstExtent.ext()->xprev.isNull() );

    // indexes will do their own progress meter?
    pm.finished();

    // build indexes
    NamespaceString s(ns);
    string si = s.db + ".system.indexes";
    for( int i = 0; i < nidx; i++ ) {
        killCurrentOp.checkForInterrupt(false);
        BSONObj info = indexSpecs[i].info;
        log() << "compact create index " << info["key"].Obj().toString() << endl;
        try {
            // Publish the presorted phase-1 data so the index build consumes
            // it instead of rescanning; always clear it afterwards.
            precalced = &phase1[i];
            theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize());
        }
        catch(...) {
            precalced = 0;
            throw;
        }
        precalced = 0;
    }

    return true;
}
// Compacts this collection: validates and snapshots the index specs, drops
// all indexes, rewrites every extent via _compactExtent() while feeding the
// documents into a MultiIndexBlock, then commits the rebuilt indexes.
// Refuses capped collections and collections with in-progress index builds.
// Returns the accumulated CompactStats on success, or an error Status.
StatusWith<CompactStats> Collection::compact( const CompactOptions* compactOptions ) {
    if ( isCapped() )
        return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                         "cannot compact capped collection" );

    if ( _indexCatalog.numIndexesInProgress() )
        return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                         "cannot compact when indexes in progress" );

    NamespaceDetails* d = details();

    // this is a big job, so might as well make things tidy before we start just to be nice.
    getDur().commitIfNeeded();

    // Snapshot the extent list; each extent is compacted in turn below.
    list<DiskLoc> extents;
    for( DiskLoc L = d->firstExtent(); !L.isNull(); L = L.ext()->xnext )
        extents.push_back(L);
    log() << "compact " << extents.size() << " extents" << endl;

    // same data, but might perform a little different after compact?
    _infoCache.reset();

    // Collect (and sanity-check) the spec of every existing index so they
    // can be rebuilt after the data is rewritten.
    vector<BSONObj> indexSpecs;
    {
        IndexCatalog::IndexIterator ii( _indexCatalog.getIndexIterator( false ) );
        while ( ii.more() ) {
            IndexDescriptor* descriptor = ii.next();

            const BSONObj spec = _compactAdjustIndexSpec(descriptor->infoObj());
            const BSONObj key = spec.getObjectField("key");
            const Status keyStatus = validateKeyPattern(key);
            if (!keyStatus.isOK()) {
                return StatusWith<CompactStats>(
                    ErrorCodes::CannotCreateIndex,
                    str::stream() << "Cannot rebuild index " << spec << ": "
                                  << keyStatus.reason()
                                  << " For more info see"
                                  << " http://dochub.mongodb.org/core/index-validation");
            }
            indexSpecs.push_back(spec);
        }
    }

    log() << "compact orphan deleted lists" << endl;
    d->orphanDeletedList();

    // Start over from scratch with our extent sizing and growth
    d->setLastExtentSize( 0 );

    // before dropping indexes, at least make sure we can allocate one extent!
    // this will allocate an extent and add to free list
    // if it cannot, it will throw an exception
    increaseStorageSize( _details->lastExtentSize(), true );

    // note that the drop indexes call also invalidates all clientcursors for
    // the namespace, which is important and wanted here
    log() << "compact dropping indexes" << endl;
    Status status = _indexCatalog.dropAllIndexes( true );
    if ( !status.isOK() ) {
        return StatusWith<CompactStats>( status );
    }

    getDur().commitIfNeeded();
    killCurrentOp.checkForInterrupt();

    CompactStats stats;

    MultiIndexBlock multiIndexBlock( this );
    status = multiIndexBlock.init( indexSpecs );
    if ( !status.isOK() )
        return StatusWith<CompactStats>( status );

    // reset data size and record counts to 0 for this namespace
    // as we're about to tally them up again for each new extent
    d->setStats( 0, 0 );

    ProgressMeterHolder pm(cc().curop()->setMessage("compact extent",
                                                    "Extent Compacting Progress",
                                                    extents.size()));

    int extentNumber = 0;
    for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
        _compactExtent(*i, extentNumber++, multiIndexBlock, compactOptions, &stats );
        pm.hit();
    }

    verify( d->firstExtent().ext()->xprev.isNull() );

    // indexes will do their own progress meter?
    pm.finished();

    log() << "starting index commits";
    status = multiIndexBlock.commit();
    if ( !status.isOK() )
        return StatusWith<CompactStats>( status );

    return StatusWith<CompactStats>( stats );
}
// Compacts a single extent: pages the extent in sequentially, copies every
// valid record into freshly allocated space (applying the requested padding
// mode) while feeding each document to the in-progress index builds, then
// unlinks the emptied extent from the chain and frees it.  Periodically
// orphans already-copied records so the journal commit block stays bounded.
// Updates 'stats' (e.g. corruptDocuments) as it goes.
void Collection::_compactExtent(const DiskLoc diskloc, int extentNumber,
                                MultiIndexBlock& indexesToInsertTo,
                                const CompactOptions* compactOptions, CompactStats* stats ) {

    log() << "compact begin extent #" << extentNumber
          << " for namespace " << _ns << " " << diskloc;

    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent *e = diskloc.ext();
    e->assertOk();
    verify( e->validates(diskloc) );

    {
        // the next/prev pointers within the extent might not be in order so we first
        // page the whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        size_t length = e->length;

        touch_pages( reinterpret_cast<const char*>(e), length );
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/t.seconds() << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0;
        long long nrecords = 0;
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                // Advance before copying: L must already point at the next
                // record when we orphan/relink below.
                L = getExtentManager()->getNextRecordInExtent(L);
                BSONObj objOld = BSONObj::make(recOld);

                if ( compactOptions->validateDocuments && !objOld.valid() ) {
                    // object is corrupt!
                    log() << "compact skipping corrupt document!";
                    stats->corruptDocuments++;
                }
                else {
                    unsigned docSize = objOld.objsize();

                    nrecords++;
                    oldObjSize += docSize;
                    oldObjSizeWithPadding += recOld->netLength();

                    // Choose the new record's allocation size per the
                    // requested padding mode.
                    unsigned lenWHdr = docSize + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;

                    switch( compactOptions->paddingMode ) {
                    case CompactOptions::NONE:
                        if ( details()->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) )
                            lenWPadding = details()->quantizePowerOf2AllocationSpace(lenWPadding);
                        break;
                    case CompactOptions::PRESERVE:
                        // if we are preserving the padding, the record should not change size
                        lenWPadding = recOld->lengthWithHeaders();
                        break;
                    case CompactOptions::MANUAL:
                        lenWPadding = compactOptions->computeRecordSize(lenWPadding);
                        if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                            lenWPadding = lenWHdr;
                        }
                        break;
                    }

                    CompactDocWriter writer( objOld, lenWPadding );
                    StatusWith<DiskLoc> status = _recordStore->insertRecord( &writer, 0 );
                    uassertStatusOK( status.getStatus() );
                    datasize += _recordStore->recordFor( status.getValue() )->netLength();

                    InsertDeleteOptions options;
                    options.logIfError = false;
                    options.dupsAllowed = true; // in compact we should be doing no checking

                    indexesToInsertTo.insert( objOld, status.getValue(), options );
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent.  it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt();
                }
            }
        } // if !L.isNull()

        // The compacted extent must currently be the first (and not the
        // only) extent; unlink it from the chain and free it.
        verify( details()->firstExtent() == diskloc );
        verify( details()->lastExtent() != diskloc );
        DiskLoc newFirst = e->xnext;
        details()->firstExtent().writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        getExtentManager()->freeExtents( diskloc, diskloc );

        getDur().commitIfNeeded();

        {
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact finished extent #" << extentNumber << " containing " << nrecords
                  << " documents (" << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' '
                  << static_cast<unsigned>(op*100.0)/100;
        }
    }
}
// Validates the on-disk structure of namespace 'ns' (old text-report form):
// checks the extent chain, optionally scans all records ('scandata', on by
// default), audits every deleted-record bucket, and runs full btree
// validation on each index.  Returns a human-readable multi-line report;
// any detected problem appends a warning line and marks the ns corrupt.
string validateNS(const char *ns, NamespaceDetails *d, BSONObj *cmdObj) {
    bool scanData = true;
    if( cmdObj && cmdObj->hasElement("scandata") && !cmdObj->getBoolField("scandata") )
        scanData = false;
    bool valid = true;
    stringstream ss;
    ss << "\nvalidate\n";
    //ss << " details: " << hex << d << " ofs:" << nsindex(ns)->detailsOffset(d) << dec << endl;
    if ( d->capped )
        ss << " capped:" << d->capped << " max:" << d->max << '\n';
    ss << " firstExtent:" << d->firstExtent.toString()
       << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString()<< '\n';
    ss << " lastExtent:" << d->lastExtent.toString()
       << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString() << '\n';

    // Walk and count the extent chain; any assertion marks the ns invalid.
    try {
        d->firstExtent.ext()->assertOk();
        d->lastExtent.ext()->assertOk();
        DiskLoc el = d->firstExtent;
        int ne = 0;
        while( !el.isNull() ) {
            Extent *e = el.ext();
            e->assertOk();
            el = e->xnext;
            ne++;
            killCurrentOp.checkForInterrupt();
        }
        ss << " # extents:" << ne << '\n';
    }
    catch (...) {
        valid=false;
        ss << " extent asserted ";
    }

    ss << " datasize?:" << d->stats.datasize << " nrecords?:" << d->stats.nrecords
       << " lastExtentSize:" << d->lastExtentSize << '\n';
    ss << " padding:" << d->paddingFactor << '\n';

    try {
        try {
            ss << " first extent:\n";
            d->firstExtent.ext()->dump(ss);
            valid = valid && d->firstExtent.ext()->validates();
        }
        catch (...) {
            ss << "\n exception firstextent\n" << endl;
        }

        // Record locations seen during the data scan; used below to detect
        // records that also appear on a deleted list (capped at 1M entries).
        set<DiskLoc> recs;
        if( scanData ) {
            shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
            int n = 0;
            long long len = 0;
            long long nlen = 0;
            int outOfOrder = 0;
            DiskLoc cl_last;
            while ( c->ok() ) {
                n++;
                DiskLoc cl = c->currLoc();
                if ( n < 1000000 )
                    recs.insert(cl);
                if ( d->capped ) {
                    // In a non-looped capped collection record locations
                    // should be monotonically increasing.
                    if ( cl < cl_last )
                        outOfOrder++;
                    cl_last = cl;
                }
                Record *r = c->_current();
                len += r->lengthWithHeaders;
                nlen += r->netLength();
                c->advance();
            }
            if ( d->capped && !d->capLooped() ) {
                ss << " capped outOfOrder:" << outOfOrder;
                if ( outOfOrder > 1 ) {
                    valid = false;
                    ss << " ???";
                }
                else ss << " (OK)";
                ss << '\n';
            }
            ss << " " << n << " objects found, nobj:" << d->stats.nrecords << '\n';
            ss << " " << len << " bytes data w/headers\n";
            ss << " " << nlen << " bytes data wout/headers\n";
        }

        ss << " deletedList: ";
        for ( int i = 0; i < Buckets; i++ ) {
            ss << (d->deletedList[i].isNull() ? '0' : '1');
        }
        ss << endl;

        // Walk every deleted-record bucket, counting entries and flagging
        // locations that were also seen as live records above.
        int ndel = 0;
        long long delSize = 0;
        int incorrect = 0;
        for ( int i = 0; i < Buckets; i++ ) {
            DiskLoc loc = d->deletedList[i];
            try {
                int k = 0;
                while ( !loc.isNull() ) {
                    if ( recs.count(loc) )
                        incorrect++;
                    ndel++;
                    if ( loc.questionable() ) {
                        if( d->capped && !loc.isValid() && i == 1 ) {
                            /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
                               see comments in namespace.h
                            */
                            break;
                        }
                        if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) {
                            ss << " ?bad deleted loc: " << loc.toString()
                               << " bucket:" << i << " k:" << k << endl;
                            valid = false;
                            break;
                        }
                    }
                    // NOTE(review): this 'd' shadows the NamespaceDetails*
                    // parameter 'd' for the remainder of the loop body.
                    DeletedRecord *d = loc.drec();
                    delSize += d->lengthWithHeaders;
                    loc = d->nextDeleted;
                    k++;
                    killCurrentOp.checkForInterrupt();
                }
            }
            catch (...) {
                ss <<" ?exception in deleted chain for bucket " << i << endl;
                valid = false;
            }
        }
        ss << " deleted: n: " << ndel << " size: " << delSize << endl;
        if ( incorrect ) {
            ss << " ?corrupt: " << incorrect << " records from datafile are in deleted list\n";
            valid = false;
        }

        // Full btree validation for each index.
        int idxn = 0;
        try  {
            ss << " nIndexes:" << d->nIndexes << endl;
            NamespaceDetails::IndexIterator i = d->ii();
            while( i.more() ) {
                IndexDetails& id = i.next();
                ss << " " << id.indexNamespace() << " keys:" <<
                   id.head.btree()->fullValidate(id.head, id.keyPattern()) << endl;
            }
        }
        catch (...) {
            ss << "\n exception during index validate idxn:" << idxn << endl;
            valid=false;
        }

    }
    // NOTE(review): catches AssertionException by value (copies/slices);
    // catching by const reference would be preferable.
    catch (AssertionException) {
        ss << "\n exception during validate\n" << endl;
        valid = false;
    }

    if ( !valid )
        ss << " ns corrupt, requires dbchk\n";

    return ss.str();
}
virtual bool run(const char *ns, BSONObj& cmdObj, string& errmsg, BSONObjBuilder& result, bool fromRepl) { string source = cmdObj.getStringField( name.c_str() ); string target = cmdObj.getStringField( "to" ); if ( source.empty() || target.empty() ) { errmsg = "invalid command syntax"; return false; } setClient( source.c_str() ); NamespaceDetails *nsd = nsdetails( source.c_str() ); uassert( "source namespace does not exist", nsd ); bool capped = nsd->capped; long long size = 0; if ( capped ) for( DiskLoc i = nsd->firstExtent; !i.isNull(); i = i.ext()->xnext ) size += i.ext()->length; setClient( target.c_str() ); uassert( "target namespace exists", !nsdetails( target.c_str() ) ); { char from[256]; nsToClient( source.c_str(), from ); char to[256]; nsToClient( target.c_str(), to ); if ( strcmp( from, to ) == 0 ) { renameNamespace( source.c_str(), target.c_str() ); return true; } } BSONObjBuilder spec; if ( capped ) { spec.appendBool( "capped", true ); spec.append( "size", double( size ) ); } if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) ) return false; auto_ptr< DBClientCursor > c; DBDirectClient bridge; { c = bridge.query( source, BSONObj() ); } while( 1 ) { { if ( !c->more() ) break; } BSONObj o = c->next(); theDataFileMgr.insert( target.c_str(), o ); } char cl[256]; nsToClient( source.c_str(), cl ); string sourceIndexes = string( cl ) + ".system.indexes"; nsToClient( target.c_str(), cl ); string targetIndexes = string( cl ) + ".system.indexes"; { c = bridge.query( sourceIndexes, QUERY( "ns" << source ) ); } while( 1 ) { { if ( !c->more() ) break; } BSONObj o = c->next(); BSONObjBuilder b; BSONObjIterator i( o ); while( i.moreWithEOO() ) { BSONElement e = i.next(); if ( e.eoo() ) break; if ( strcmp( e.fieldName(), "ns" ) == 0 ) { b.append( "ns", target ); } else { b.append( e ); } } BSONObj n = b.done(); theDataFileMgr.insert( targetIndexes.c_str(), n ); } setClient( source.c_str() ); dropCollection( source, errmsg, result ); return true; }
// Validates the on-disk structure of 'collection' (structured BSON-result
// form): audits the extent chain and its xprev/xnext/lastExtent links,
// optionally scans every record (full or scandata mode) tallying sizes and
// quantization stats, audits the deleted-record buckets, and validates each
// index via its IndexAccessMethod.  Findings are appended to 'result';
// problems are described in the "errors" array and flip "valid" to false.
void validateNS(const string& ns,
                Collection* collection,
                const BSONObj& cmdObj,
                BSONObjBuilder& result) {

    const bool full = cmdObj["full"].trueValue();
    const bool scanData = full || cmdObj["scandata"].trueValue();

    NamespaceDetails* nsd = collection->details();

    bool valid = true;
    BSONArrayBuilder errors; // explanation(s) for why valid = false

    if ( collection->isCapped() ){
        result.append("capped", nsd->isCapped());
        result.appendNumber("max", nsd->maxCappedDocs());
    }

    if ( nsd->firstExtent().isNull() )
        result.append( "firstExtent", "null" );
    else
        result.append( "firstExtent", str::stream() << nsd->firstExtent().toString()
                       << " ns:" << nsd->firstExtent().ext()->nsDiagnostic.toString());
    if ( nsd->lastExtent().isNull() )
        result.append( "lastExtent", "null" );
    else
        result.append( "lastExtent", str::stream() <<  nsd->lastExtent().toString()
                       << " ns:" <<  nsd->lastExtent().ext()->nsDiagnostic.toString());

    BSONArrayBuilder extentData;
    int extentCount = 0;

    // Walk the extent chain, checking each extent and the consistency of
    // the xprev back-pointers and the lastExtent pointer.
    try {
        if ( !nsd->firstExtent().isNull() ) {
            nsd->firstExtent().ext()->assertOk();
            nsd->lastExtent().ext()->assertOk();
        }

        DiskLoc extentDiskLoc = nsd->firstExtent();
        while (!extentDiskLoc.isNull()) {
            Extent* thisExtent = extentDiskLoc.ext();
            if (full) {
                extentData << thisExtent->dump();
            }
            if (!thisExtent->validates(extentDiskLoc, &errors)) {
                valid = false;
            }
            DiskLoc nextDiskLoc = thisExtent->xnext;
            if (extentCount > 0 && !nextDiskLoc.isNull()
                && nextDiskLoc.ext()->xprev != extentDiskLoc) {
                StringBuilder sb;
                sb << "'xprev' pointer " << nextDiskLoc.ext()->xprev.toString()
                   << " in extent " << nextDiskLoc.toString()
                   << " does not point to extent " << extentDiskLoc.toString();
                errors << sb.str();
                valid = false;
            }
            if (nextDiskLoc.isNull() && extentDiskLoc != nsd->lastExtent()) {
                StringBuilder sb;
                sb << "'lastExtent' pointer " << nsd->lastExtent().toString()
                   << " does not point to last extent in list " << extentDiskLoc.toString();
                errors << sb.str();
                valid = false;
            }
            extentDiskLoc = nextDiskLoc;
            extentCount++;
            killCurrentOp.checkForInterrupt();
        }
    }
    catch (const DBException& e) {
        StringBuilder sb;
        sb << "exception validating extent " << extentCount
           << ": " << e.what();
        errors << sb.str();
        valid = false;
    }
    result.append("extentCount", extentCount);

    if ( full )
        result.appendArray( "extents" , extentData.arr() );

    result.appendNumber("datasize", nsd->dataSize());
    result.appendNumber("nrecords", nsd->numRecords());
    result.appendNumber("lastExtentSize", nsd->lastExtentSize());
    result.appendNumber("padding", nsd->paddingFactor());

    try {
        // Verify firstExtent has no xprev and lastExtent has no xnext.
        bool testingLastExtent = false;
        try {
            if (nsd->firstExtent().isNull()) {
                // this is ok
            }
            else {
                result.append("firstExtentDetails", nsd->firstExtent().ext()->dump());
                if (!nsd->firstExtent().ext()->xprev.isNull()) {
                    StringBuilder sb;
                    sb << "'xprev' pointer in 'firstExtent' " << nsd->firstExtent().toString()
                       << " is " << nsd->firstExtent().ext()->xprev.toString()
                       << ", should be null";
                    errors << sb.str();
                    valid=false;
                }
            }
            testingLastExtent = true;
            if (nsd->lastExtent().isNull()) {
                // this is ok
            }
            else {
                if (nsd->firstExtent() != nsd->lastExtent()) {
                    result.append("lastExtentDetails", nsd->lastExtent().ext()->dump());
                    if (!nsd->lastExtent().ext()->xnext.isNull()) {
                        StringBuilder sb;
                        sb << "'xnext' pointer in 'lastExtent' " << nsd->lastExtent().toString()
                           << " is " << nsd->lastExtent().ext()->xnext.toString()
                           << ", should be null";
                        errors << sb.str();
                        valid = false;
                    }
                }
            }
        }
        catch (const DBException& e) {
            StringBuilder sb;
            sb << "exception processing '"
               << (testingLastExtent ? "lastExtent" : "firstExtent")
               << "': " << e.what();
            errors << sb.str();
            valid = false;
        }

        // Record locations seen during the data scan; used below to detect
        // records that also appear on a deleted list (capped at 1M entries).
        set<DiskLoc> recs;
        if( scanData ) {
            int n = 0;
            int nInvalid = 0;
            long long nQuantizedSize = 0;
            long long nPowerOf2QuantizedSize = 0;
            long long len = 0;
            long long nlen = 0;
            long long bsonLen = 0;
            int outOfOrder = 0;
            DiskLoc cl_last;

            DiskLoc cl;
            Runner::RunnerState state;
            auto_ptr<Runner> runner(InternalPlanner::collectionScan(ns));
            while (Runner::RUNNER_ADVANCED == (state = runner->getNext(NULL, &cl))) {
                n++;

                if ( n < 1000000 )
                    recs.insert(cl);
                if ( nsd->isCapped() ) {
                    if ( cl < cl_last )
                        outOfOrder++;
                    cl_last = cl;
                }

                Record *r = cl.rec();
                len += r->lengthWithHeaders();
                nlen += r->netLength();

                if ( r->lengthWithHeaders() ==
                        NamespaceDetails::quantizeAllocationSpace
                            ( r->lengthWithHeaders() ) ) {
                    // Count the number of records having a size consistent with
                    // the quantizeAllocationSpace quantization implementation.
                    ++nQuantizedSize;
                }

                if ( r->lengthWithHeaders() ==
                        NamespaceDetails::quantizePowerOf2AllocationSpace
                            ( r->lengthWithHeaders() - 1 ) ) {
                    // Count the number of records having a size consistent with the
                    // quantizePowerOf2AllocationSpace quantization implementation.
                    // Because of SERVER-8311, power of 2 quantization is not idempotent and
                    // r->lengthWithHeaders() - 1 must be checked instead of the record
                    // length itself.
                    ++nPowerOf2QuantizedSize;
                }

                if (full){
                    BSONObj obj = BSONObj::make(r);
                    if (!obj.isValid() || !obj.valid()){ // both fast and deep checks
                        valid = false;
                        if (nInvalid == 0) // only log once;
                            errors << "invalid bson object detected (see logs for more info)";

                        nInvalid++;
                        if (strcmp("_id", obj.firstElementFieldName()) == 0){
                            try {
                                obj.firstElement().validate(); // throws on error
                                log() << "Invalid bson detected in " << ns
                                      << " with _id: " << obj.firstElement().toString(false)
                                      << endl;
                            }
                            catch(...){
                                log() << "Invalid bson detected in " << ns
                                      << " with corrupt _id" << endl;
                            }
                        }
                        else {
                            log() << "Invalid bson detected in " << ns
                                  << " and couldn't find _id" << endl;
                        }
                    }
                    else {
                        bsonLen += obj.objsize();
                    }
                }
            }
            if (Runner::RUNNER_EOF != state) {
                // TODO: more descriptive logging.
                warning() << "Internal error while reading collection " << ns << endl;
            }

            if ( nsd->isCapped() && !nsd->capLooped() ) {
                result.append("cappedOutOfOrder", outOfOrder);
                if ( outOfOrder > 1 ) {
                    valid = false;
                    errors << "too many out of order records";
                }
            }
            result.append("objectsFound", n);

            if (full) {
                result.append("invalidObjects", nInvalid);
            }

            result.appendNumber("nQuantizedSize", nQuantizedSize);
            result.appendNumber("nPowerOf2QuantizedSize", nPowerOf2QuantizedSize);
            result.appendNumber("bytesWithHeaders", len);
            result.appendNumber("bytesWithoutHeaders", nlen);

            if (full) {
                result.appendNumber("bytesBson", bsonLen);
            }
        }

        BSONArrayBuilder deletedListArray;
        for ( int i = 0; i < Buckets; i++ ) {
            deletedListArray << nsd->deletedListEntry(i).isNull();
        }

        // Walk every deleted-record bucket, counting entries and flagging
        // locations that were also seen as live records above.
        int ndel = 0;
        long long delSize = 0;
        BSONArrayBuilder delBucketSizes;
        int incorrect = 0;
        for ( int i = 0; i < Buckets; i++ ) {
            DiskLoc loc = nsd->deletedListEntry(i);
            try {
                int k = 0;
                while ( !loc.isNull() ) {
                    if ( recs.count(loc) )
                        incorrect++;
                    ndel++;

                    if ( loc.questionable() ) {
                        if( nsd->isCapped() && !loc.isValid() && i == 1 ) {
                            /* the constructor for NamespaceDetails intentionally sets deletedList[1] to invalid
                               see comments in namespace.h
                            */
                            break;
                        }

                        string err( str::stream()
                                    << "bad pointer in deleted record list: " << loc.toString()
                                    << " bucket: " << i << " k: " << k );
                        errors << err;
                        valid = false;
                        break;
                    }

                    DeletedRecord *d = loc.drec();
                    delSize += d->lengthWithHeaders();
                    loc = d->nextDeleted();
                    k++;
                    killCurrentOp.checkForInterrupt();
                }
                delBucketSizes << k;
            }
            catch (...) {
                errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i));
                valid = false;
            }
        }
        result.appendNumber("deletedCount", ndel);
        result.appendNumber("deletedSize", delSize);
        if ( full ) {
            result << "delBucketSizes" << delBucketSizes.arr();
        }

        if ( incorrect ) {
            errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list");
            valid = false;
        }

        // Validate every completed index via its IndexAccessMethod.
        int idxn = 0;
        try  {
            IndexCatalog* indexCatalog = collection->getIndexCatalog();

            result.append("nIndexes", nsd->getCompletedIndexCount());
            BSONObjBuilder indexes; // not using subObjStart to be exception safe
            NamespaceDetails::IndexIterator i = nsd->ii();
            while( i.more() ) {
                IndexDetails& id = i.next();
                log() << "validating index " << idxn << ": " << id.indexNamespace() << endl;

                IndexDescriptor* descriptor = indexCatalog->getDescriptor( idxn );
                verify( descriptor );
                IndexAccessMethod* iam = indexCatalog->getIndex( descriptor );
                verify( iam );

                int64_t keys;
                iam->validate(&keys);
                indexes.appendNumber(id.indexNamespace(),
                                     static_cast<long long>(keys));
                idxn++;
            }
            result.append("keysPerIndex", indexes.done());
        }
        catch (...) {
            errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn));
            valid=false;
        }

    }
    // NOTE(review): catches AssertionException by value (copies/slices);
    // catching by const reference would be preferable.
    catch (AssertionException) {
        errors << "exception during validate";
        valid = false;
    }

    result.appendBool("valid", valid);
    result.append("errors", errors.arr());

    if ( !full ){
        result.append("warning",
                      "Some checks omitted for speed. use {full:true} option to do more thorough scan.");
    }

    if ( !valid ) {
        result.append("advice", "ns corrupt, requires repair");
    }
}
// Remove every document from a capped collection in place, preserving its
// extents, then rebuild its indexes from the (now empty) collection.
// 'ns' must name this same NamespaceDetails (checked in debug builds).
// Requires: collection is capped and no background index build is running.
void NamespaceDetails::emptyCappedCollection( const char *ns ) {
    DEV verify( this == nsdetails(ns) );
    massert( 13424, "collection must be capped", isCapped() );
    massert( 13425, "background index build in progress", !_indexBuildsInProgress );

    // Snapshot the index definitions before dropping them, so they can be
    // re-inserted into system.indexes after the data is cleared.  copy() is
    // needed because findAll's results may point into record storage that the
    // reset below invalidates.
    vector<BSONObj> indexes = Helpers::findAll( Namespace( ns ).getSisterNS( "system.indexes" ) , BSON( "ns" << ns ) );
    for ( unsigned i=0; i<indexes.size(); i++ ) {
        indexes[i] = indexes[i].copy();
    }

    if ( _nIndexes ) {
        string errmsg;
        BSONObjBuilder note;
        bool res = dropIndexes( this , ns , "*" , errmsg , note , true );
        massert( 13426 , str::stream() << "failed during index drop: " << errmsg , res );
    }

    // Clear all references to this namespace.
    ClientCursor::invalidate( ns );
    NamespaceDetailsTransient::resetCollection( ns );

    // Get a writeable reference to 'this' and reset all pertinent
    // attributes.  NOTE(review): writingWithoutExtra() appears to return a
    // journal-declared alias of 'this'; the mixed use of 't->' and bare
    // members below (e.g. '_stats.nrecords') touches the same memory either
    // way — confirm durability intent covers the whole object.
    NamespaceDetails *t = writingWithoutExtra();

    t->cappedLastDelRecLastExtent() = DiskLoc();
    t->cappedListOfAllDeletedRecords() = DiskLoc();

    // preserve firstExtent/lastExtent
    t->_capExtent = _firstExtent;
    t->_stats.datasize = _stats.nrecords = 0;
    // lastExtentSize preserve
    // nIndexes preserve 0
    // capped preserve true
    // max preserve
    t->_paddingFactor = 1.0;
    t->_systemFlags = 0;
    t->_capFirstNewRecord = DiskLoc();
    t->_capFirstNewRecord.setInvalid();
    t->cappedLastDelRecLastExtent().setInvalid();
    // dataFileVersion preserve
    // indexFileVersion preserve
    t->_multiKeyIndexBits = 0;
    t->_reservedA = 0;
    t->_extraOffset = 0;
    // indexBuildInProgress preserve 0
    memset(t->_reserved, 0, sizeof(t->_reserved));

    // Reset all existing extents and recreate the deleted list.
    // reuse() wipes an extent but would also drop its chain links, so the
    // xprev/xnext pointers are saved and restored around it.
    for( DiskLoc ext = _firstExtent; !ext.isNull(); ext = ext.ext()->xnext ) {
        DiskLoc prev = ext.ext()->xprev;
        DiskLoc next = ext.ext()->xnext;
        DiskLoc empty = ext.ext()->reuse( ns, true );
        ext.ext()->xprev.writing() = prev;
        ext.ext()->xnext.writing() = next;
        // Each emptied extent becomes one big deleted record on the free list.
        addDeletedRec( empty.drec(), empty );
    }

    // Recreate the indexes captured above.
    for ( unsigned i=0; i<indexes.size(); i++ ) {
        theDataFileMgr.insertWithObjMod(Namespace( ns ).getSisterNS( "system.indexes" ).c_str(), indexes[i], false, true);
    }
}
// Clone collection 'shortFrom' into a new capped collection 'shortTo' of the
// given byte 'size' within the same database.  Only the newest documents that
// fit are copied: leading extents are skipped until the remaining data is
// roughly within the cap.  Returns a non-OK Status if the source is missing,
// the target exists, creation fails, or iteration dies.
Status cloneCollectionAsCapped( Database* db, const string& shortFrom, const string& shortTo, double size, bool temp, bool logForReplication ) {
    string fromNs = db->name() + "." + shortFrom;
    string toNs = db->name() + "." + shortTo;

    Collection* fromCollection = db->getCollection( fromNs );
    if ( !fromCollection )
        return Status( ErrorCodes::NamespaceNotFound,
                       str::stream() << "source collection " << fromNs << " does not exist" );
    if ( db->getCollection( toNs ) )
        return Status( ErrorCodes::NamespaceExists, "to collection already exists" );

    // create new collection
    {
        Client::Context ctx( toNs );
        BSONObjBuilder spec;
        spec.appendBool( "capped", true );
        spec.append( "size", size );
        if ( temp )
            spec.appendBool( "temp", true );
        string errmsg;
        if ( !userCreateNS( toNs.c_str(), spec.done(), errmsg, logForReplication ) )
            return Status( ErrorCodes::InternalError, errmsg );
    }

    auto_ptr<Runner> runner;
    {
        const NamespaceDetails* details = fromCollection->details();
        DiskLoc extent = details->firstExtent();

        // datasize and extentSize can't be compared exactly, so add some padding to 'size'
        // (2x here) before computing how much data must be chopped off the front.
        long long excessSize = static_cast<long long>( fromCollection->dataSize() - size * 2 );

        // skip ahead some extents since not all the data fits,
        // so we have to chop a bunch off.  Never skip past lastExtent.
        for( ; excessSize > extent.ext()->length && extent != details->lastExtent();
               extent = extent.ext()->xnext ) {
            excessSize -= extent.ext()->length;
            LOG( 2 ) << "cloneCollectionAsCapped skipping extent of size "
                     << extent.ext()->length << endl;
            LOG( 6 ) << "excessSize: " << excessSize << endl;
        }
        DiskLoc startLoc = extent.ext()->firstRecord;

        // Forward scan starting at the first surviving record.
        runner.reset( InternalPlanner::collectionScan(fromNs, InternalPlanner::FORWARD, startLoc) );
    }

    Collection* toCollection = db->getCollection( toNs );
    verify( toCollection );

    // Copy documents until the runner is exhausted or fails.
    while ( true ) {
        BSONObj obj;
        Runner::RunnerState state = runner->getNext(&obj, NULL);
        switch( state ) {
        case Runner::RUNNER_EOF:
            return Status::OK();
        case Runner::RUNNER_DEAD:
            // The scan was invalidated; clean up the half-built target.
            db->dropCollection( toNs );
            return Status( ErrorCodes::InternalError, "runner turned dead while iterating" );
        case Runner::RUNNER_ERROR:
            return Status( ErrorCodes::InternalError, "runner error while iterating" );
        case Runner::RUNNER_ADVANCED:
            // enforceQuota=true; oldest documents are evicted by the cap as needed.
            toCollection->insertDocument( obj, true );
            if ( logForReplication )
                logOp( "i", toNs.c_str(), obj );
            getDur().commitIfNeeded();
        }
    }
    verify( false ); // unreachable
}
void validateNS(const char *ns, NamespaceDetails *d, const BSONObj& cmdObj, BSONObjBuilder& result) { const bool full = cmdObj["full"].trueValue(); const bool scanData = full || cmdObj["scandata"].trueValue(); bool valid = true; BSONArrayBuilder errors; // explanation(s) for why valid = false if ( d->isCapped() ){ result.append("capped", d->isCapped()); result.appendNumber("max", d->maxCappedDocs()); } result.append("firstExtent", str::stream() << d->firstExtent.toString() << " ns:" << d->firstExtent.ext()->nsDiagnostic.toString()); result.append( "lastExtent", str::stream() << d->lastExtent.toString() << " ns:" << d->lastExtent.ext()->nsDiagnostic.toString()); BSONArrayBuilder extentData; try { d->firstExtent.ext()->assertOk(); d->lastExtent.ext()->assertOk(); DiskLoc el = d->firstExtent; int ne = 0; while( !el.isNull() ) { Extent *e = el.ext(); e->assertOk(); el = e->xnext; ne++; if ( full ) extentData << e->dump(); killCurrentOp.checkForInterrupt(); } result.append("extentCount", ne); } catch (...) { valid=false; errors << "extent asserted"; } if ( full ) result.appendArray( "extents" , extentData.arr() ); result.appendNumber("datasize", d->stats.datasize); result.appendNumber("nrecords", d->stats.nrecords); result.appendNumber("lastExtentSize", d->lastExtentSize); result.appendNumber("padding", d->paddingFactor()); try { try { result.append("firstExtentDetails", d->firstExtent.ext()->dump()); valid = valid && d->firstExtent.ext()->validates() && d->firstExtent.ext()->xprev.isNull(); } catch (...) 
{ errors << "exception firstextent"; valid = false; } set<DiskLoc> recs; if( scanData ) { shared_ptr<Cursor> c = theDataFileMgr.findAll(ns); int n = 0; int nInvalid = 0; long long len = 0; long long nlen = 0; int outOfOrder = 0; DiskLoc cl_last; while ( c->ok() ) { n++; DiskLoc cl = c->currLoc(); if ( n < 1000000 ) recs.insert(cl); if ( d->isCapped() ) { if ( cl < cl_last ) outOfOrder++; cl_last = cl; } Record *r = c->_current(); len += r->lengthWithHeaders(); nlen += r->netLength(); if (full){ BSONObj obj = BSONObj::make(r); if (!obj.isValid() || !obj.valid()){ // both fast and deep checks valid = false; if (nInvalid == 0) // only log once; errors << "invalid bson object detected (see logs for more info)"; nInvalid++; if (strcmp("_id", obj.firstElementFieldName()) == 0){ try { obj.firstElement().validate(); // throws on error log() << "Invalid bson detected in " << ns << " with _id: " << obj.firstElement().toString(false) << endl; } catch(...){ log() << "Invalid bson detected in " << ns << " with corrupt _id" << endl; } } else { log() << "Invalid bson detected in " << ns << " and couldn't find _id" << endl; } } } c->advance(); } if ( d->isCapped() && !d->capLooped() ) { result.append("cappedOutOfOrder", outOfOrder); if ( outOfOrder > 1 ) { valid = false; errors << "too many out of order records"; } } result.append("objectsFound", n); if (full) { result.append("invalidObjects", nInvalid); } result.appendNumber("bytesWithHeaders", len); result.appendNumber("bytesWithoutHeaders", nlen); } BSONArrayBuilder deletedListArray; for ( int i = 0; i < Buckets; i++ ) { deletedListArray << d->deletedList[i].isNull(); } int ndel = 0; long long delSize = 0; int incorrect = 0; for ( int i = 0; i < Buckets; i++ ) { DiskLoc loc = d->deletedList[i]; try { int k = 0; while ( !loc.isNull() ) { if ( recs.count(loc) ) incorrect++; ndel++; if ( loc.questionable() ) { if( d->isCapped() && !loc.isValid() && i == 1 ) { /* the constructor for NamespaceDetails intentionally sets 
deletedList[1] to invalid see comments in namespace.h */ break; } if ( loc.a() <= 0 || strstr(ns, "hudsonSmall") == 0 ) { string err (str::stream() << "bad deleted loc: " << loc.toString() << " bucket:" << i << " k:" << k); errors << err; valid = false; break; } } DeletedRecord *d = loc.drec(); delSize += d->lengthWithHeaders(); loc = d->nextDeleted(); k++; killCurrentOp.checkForInterrupt(); } } catch (...) { errors << ("exception in deleted chain for bucket " + BSONObjBuilder::numStr(i)); valid = false; } } result.appendNumber("deletedCount", ndel); result.appendNumber("deletedSize", delSize); if ( incorrect ) { errors << (BSONObjBuilder::numStr(incorrect) + " records from datafile are in deleted list"); valid = false; } int idxn = 0; try { result.append("nIndexes", d->nIndexes); BSONObjBuilder indexes; // not using subObjStart to be exception safe NamespaceDetails::IndexIterator i = d->ii(); while( i.more() ) { IndexDetails& id = i.next(); log() << "validating index " << idxn << ": " << id.indexNamespace() << endl; long long keys = id.idxInterface().fullValidate(id.head, id.keyPattern()); indexes.appendNumber(id.indexNamespace(), keys); idxn++; } result.append("keysPerIndex", indexes.done()); } catch (...) { errors << ("exception during index validate idxn " + BSONObjBuilder::numStr(idxn)); valid=false; } } catch (AssertionException) { errors << "exception during validate"; valid = false; } result.appendBool("valid", valid); result.append("errors", errors.arr()); if ( !full ){ result.append("warning", "Some checks omitted for speed. use {full:true} option to do more thorough scan."); } if ( !valid ) { result.append("advice", "ns corrupt, requires repair"); } }
DiskLoc ExtentManager::allocFromFreeList( int approxSize, bool capped ) { if ( !_freeListDetails ) { return DiskLoc(); } // setup extent constraints int low, high; if ( capped ) { // be strict about the size low = approxSize; if ( low > 2048 ) low -= 256; high = (int) (approxSize * 1.05) + 256; } else { low = (int) (approxSize * 0.8); high = (int) (approxSize * 1.4); } if ( high <= 0 ) { // overflowed high = max(approxSize, Extent::maxSize()); } if ( high <= Extent::minSize() ) { // the minimum extent size is 4097 high = Extent::minSize() + 1; } // scan free list looking for something suitable int n = 0; Extent *best = 0; int bestDiff = 0x7fffffff; { Timer t; DiskLoc L = _freeListDetails->firstExtent(); while( !L.isNull() ) { Extent * e = L.ext(); if ( e->length >= low && e->length <= high ) { int diff = abs(e->length - approxSize); if ( diff < bestDiff ) { bestDiff = diff; best = e; if ( ((double) diff) / approxSize < 0.1 ) { // close enough break; } if ( t.seconds() >= 2 ) { // have spent lots of time in write lock, and we are in [low,high], so close enough // could come into play if extent freelist is very long break; } } else { OCCASIONALLY { if ( high < 64 * 1024 && t.seconds() >= 2 ) { // be less picky if it is taking a long time high = 64 * 1024; } } } } L = e->xnext; ++n; } if ( t.seconds() >= 10 ) { log() << "warning: slow scan in allocFromFreeList (in write lock)" << endl; } }
/** Compact one extent: page it in, copy every valid record (with fresh
 *  padding pf/pb applied) into newly allocated space, collect index keys for
 *  the later rebuild, then unlink and free the old extent.
 *  Precondition: 'ext' is the collection's first extent (asserted below).
 *  @return number of skipped (invalid) documents */
unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n,
                       const scoped_array<IndexSpec> &indexSpecs,
                       scoped_array<SortPhaseOne>& phase1,
                       int nidx, bool validate, double pf, int pb) {
    log() << "compact extent #" << n << endl;
    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;
    Extent *e = ext.ext();
    e->assertOk();
    assert( e->validates() );
    unsigned skipped = 0;

    {
        // the next/prev pointers within the extent might not be in order so we first page the
        // whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        MAdvise adv(e, e->length, MAdvise::Sequential);
        const char *p = (const char *) e;
        // Read one byte per 4KB page; 'faux' (a file-scope sink) keeps the
        // compiler from optimizing the touch loop away.
        for( int i = 0; i < e->length; i += 4096 ) {
            faux += p[i];
        }
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/ms << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        unsigned totalSize = 0;
        int nrecs = 0;
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                L = recOld->nextInExtent(L);
                nrecs++;
                BSONObj objOld(recOld);

                if( !validate || objOld.valid() ) {
                    unsigned sz = objOld.objsize();
                    oldObjSize += sz;
                    oldObjSizeWithPadding += recOld->netLength();

                    // Compute new padded allocation: scale by pf, add pb,
                    // quantize; fall back to the unpadded size on under/overflow.
                    unsigned lenWHdr = sz + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;
                    {
                        lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                        lenWPadding += pb;
                        lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                        if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                            lenWPadding = lenWHdr;
                        }
                    }
                    totalSize += lenWPadding;
                    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                    uassert(14024, "compact error out of space during compaction", !loc.isNull());
                    Record *recNew = loc.rec();
                    // Declare write intent only over the header+object bytes we fill.
                    recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                    addRecordToRecListInExtent(recNew, loc);
                    memcpy(recNew->data, objOld.objdata(), sz);

                    {
                        // extract keys for all indexes we will be rebuilding
                        for( int x = 0; x < nidx; x++ ) {
                            phase1[x].addKeys(indexSpecs[x], objOld, loc);
                        }
                    }
                }
                else {
                    if( ++skipped <= 10 )
                        log() << "compact skipping invalid object" << endl;
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent. it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt(false);
                }
            }
        } // if !L.isNull()

        // Unlink the (now copied-out) extent from the front of the chain,
        // mark it empty, and return it to the freelist.
        assert( d->firstExtent == ext );
        assert( d->lastExtent != ext );
        DiskLoc newFirst = e->xnext;
        d->firstExtent.writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        freeExtents(ext,ext);
        getDur().commitIfNeeded();

        {
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB"
                  << " oldPadding: " << op << ' ' << static_cast<unsigned>(op*100.0)/100 << endl;
        }
    }

    return skipped;
}
/** Compact one extent (touch_pages revision): page the extent in via
 *  touch_pages, copy each valid record with new padding, gather index keys
 *  for rebuild, free the old extent, and fold the copied data/record counts
 *  into the namespace stats under durability intent.
 *  Precondition: 'diskloc' is the collection's first extent (verified below).
 *  @return number of skipped (invalid) documents */
unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
                       const scoped_array<IndexSpec> &indexSpecs,
                       scoped_array<SortPhaseOne>& phase1,
                       int nidx, bool validate, double pf, int pb) {
    log() << "compact begin extent #" << n << " for namespace " << ns << endl;
    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;
    Extent *e = diskloc.ext();
    e->assertOk();
    verify( e->validates() );
    unsigned skipped = 0;

    {
        // the next/prev pointers within the extent might not be in order so we first page the
        // whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        MongoDataFile* mdf = cc().database()->getFile( diskloc.a() );
        HANDLE fd = mdf->getFd();
        int offset = diskloc.getOfs();
        Extent* ext = diskloc.ext();
        size_t length = ext->length;
        touch_pages(fd, offset, length, ext);
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/ms << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0;  // net bytes copied into new records
        long long nrecords = 0;  // valid records copied
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                L = recOld->nextInExtent(L);
                BSONObj objOld = BSONObj::make(recOld);

                if( !validate || objOld.valid() ) {
                    nrecords++;
                    unsigned sz = objOld.objsize();
                    oldObjSize += sz;
                    oldObjSizeWithPadding += recOld->netLength();

                    // New padded allocation: scale by pf, add pb, quantize;
                    // fall back to the raw size on under/overflow.
                    unsigned lenWHdr = sz + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;
                    {
                        lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                        lenWPadding += pb;
                        lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                        if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                            lenWPadding = lenWHdr;
                        }
                    }
                    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                    uassert(14024, "compact error out of space during compaction", !loc.isNull());
                    Record *recNew = loc.rec();
                    datasize += recNew->netLength();
                    // Declare write intent only over the header+object bytes we fill.
                    recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                    addRecordToRecListInExtent(recNew, loc);
                    memcpy(recNew->data(), objOld.objdata(), sz);

                    {
                        // extract keys for all indexes we will be rebuilding
                        for( int x = 0; x < nidx; x++ ) {
                            phase1[x].addKeys(indexSpecs[x], objOld, loc);
                        }
                    }
                }
                else {
                    if( ++skipped <= 10 )
                        log() << "compact skipping invalid object" << endl;
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent. it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt(false);
                }
            }
        } // if !L.isNull()

        // Unlink the copied-out extent from the front of the chain, mark it
        // empty, and return it to the freelist.
        verify( d->firstExtent == diskloc );
        verify( d->lastExtent != diskloc );
        DiskLoc newFirst = e->xnext;
        d->firstExtent.writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        freeExtents( diskloc, diskloc );

        // update datasize/record count for this namespace's extent
        {
            NamespaceDetails::Stats *s = getDur().writing(&d->stats);
            s->datasize += datasize;
            s->nrecords += nrecords;
        }

        getDur().commitIfNeeded();

        {
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact finished extent #" << n << " containing " << nrecords
                  << " documents (" << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' '
                  << static_cast<unsigned>(op*100.0)/100 << endl;
        }
    }

    return skipped;
}
/** Compact one extent (ExtentManager revision): page the extent in, copy each
 *  valid record choosing its new allocation size by one of three policies —
 *  preserve the original padding, quantize to power-of-two sizes, or apply
 *  the caller's pf/pb padding — then free the old extent and update stats.
 *  Precondition: 'diskloc' is the collection's first extent (verified below).
 *  @return number of skipped (invalid) documents */
unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc diskloc, int n,
                       int nidx, bool validate, double pf, int pb,
                       bool useDefaultPadding, bool preservePadding) {

    log() << "compact begin extent #" << n << " for namespace " << ns << endl;
    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;
    Extent *e = diskloc.ext();
    e->assertOk();
    verify( e->validates(diskloc) );
    unsigned skipped = 0;

    Database* db = cc().database();

    {
        // the next/prev pointers within the extent might not be in order so we first
        // page the whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        Extent* ext = db->getExtentManager().getExtent( diskloc );
        size_t length = ext->length;

        touch_pages( reinterpret_cast<const char*>(ext), length );
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/ms << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0;  // net bytes copied into new records
        long long nrecords = 0;  // valid records copied
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                L = db->getExtentManager().getNextRecordInExtent(L);
                BSONObj objOld = BSONObj::make(recOld);

                if( !validate || objOld.valid() ) {
                    nrecords++;
                    unsigned sz = objOld.objsize();
                    oldObjSize += sz;
                    oldObjSizeWithPadding += recOld->netLength();

                    unsigned lenWHdr = sz + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;
                    // if we are preserving the padding, the record should not change size
                    if (preservePadding) {
                        lenWPadding = recOld->lengthWithHeaders();
                    }
                    // maintain UsePowerOf2Sizes if no padding values were passed in
                    else if (d->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes)
                            && useDefaultPadding) {
                        lenWPadding = d->quantizePowerOf2AllocationSpace(lenWPadding);
                    }
                    // otherwise use the padding values (pf and pb) that were passed in
                    else {
                        lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                        lenWPadding += pb;
                        lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                    }
                    // Sanity clamp: never smaller than the raw record, never absurdly large.
                    if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                        lenWPadding = lenWHdr;
                    }
                    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                    uassert(14024, "compact error out of space during compaction", !loc.isNull());
                    Record *recNew = loc.rec();
                    datasize += recNew->netLength();
                    // Declare write intent only over the header+object bytes we fill.
                    recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                    addRecordToRecListInExtent(recNew, loc);
                    memcpy(recNew->data(), objOld.objdata(), sz);
                }
                else {
                    if( ++skipped <= 10 )
                        log() << "compact skipping invalid object" << endl;
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent. it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt(false);
                }
            }
        } // if !L.isNull()

        // Unlink the copied-out extent from the front of the chain, mark it
        // empty, and return it to the extent manager's freelist.
        verify( d->firstExtent() == diskloc );
        verify( d->lastExtent() != diskloc );
        DiskLoc newFirst = e->xnext;
        d->firstExtent().writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        cc().database()->getExtentManager().freeExtents( diskloc, diskloc );

        // update datasize/record count for this namespace's extent
        d->incrementStats( datasize, nrecords );

        getDur().commitIfNeeded();

        {
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact finished extent #" << n << " containing " << nrecords
                  << " documents (" << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' '
                  << static_cast<unsigned>(op*100.0)/100 << endl;
        }
    }

    return skipped;
}
// Compact an entire collection (IndexCatalog revision): snapshot extent list
// and index specs, orphan the deleted lists, drop all indexes, compact each
// extent in turn (resetting stats first, since compactExtent re-tallies
// them), then rebuild the indexes in the foreground.  Returns false with
// 'errmsg' set if the index drop fails; index-create failures uassert.
bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate,
              BSONObjBuilder& result, double pf, int pb,
              bool useDefaultPadding, bool preservePadding) {
    // this is a big job, so might as well make things tidy before we start just to be nice.
    getDur().commitIfNeeded();

    // Snapshot the extent chain up front; compactExtent unlinks extents as it goes.
    list<DiskLoc> extents;
    for( DiskLoc L = d->firstExtent(); !L.isNull(); L = L.ext()->xnext )
        extents.push_back(L);
    log() << "compact " << extents.size() << " extents" << endl;

    ProgressMeterHolder pm(cc().curop()->setMessage("compact extent",
                                                    "Extent Compacting Progress",
                                                    extents.size()));

    // same data, but might perform a little different after compact?
    Collection* collection = cc().database()->getCollection( ns );
    verify( collection );
    collection->infoCache()->addedIndex();

    verify( d->getCompletedIndexCount() == d->getTotalIndexCount() );
    int nidx = d->getCompletedIndexCount();
    scoped_array<BSONObj> indexSpecs( new BSONObj[nidx] );
    {
        NamespaceDetails::IndexIterator ii = d->ii();
        // For each existing index...
        for( int idxNo = 0; ii.more(); ++idxNo ) {
            // Build a new index spec based on the old index spec.
            BSONObjBuilder b;
            BSONObj::iterator i(ii.next().info.obj());
            while( i.more() ) {
                BSONElement e = i.next();
                if ( str::equals( e.fieldName(), "v" ) ) {
                    // Drop any preexisting index version spec.  The default index version will
                    // be used instead for the new index.
                    continue;
                }
                if ( str::equals( e.fieldName(), "background" ) ) {
                    // Create the new index in the foreground.
                    continue;
                }
                // Pass the element through to the new index spec.
                b.append(e);
            }
            indexSpecs[idxNo] = b.obj().getOwned();
        }
    }

    log() << "compact orphan deleted lists" << endl;
    d->orphanDeletedList();

    // Start over from scratch with our extent sizing and growth
    d->setLastExtentSize( 0 );

    // before dropping indexes, at least make sure we can allocate one extent!
    uassert(14025, "compact error no space available to allocate",
            !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull());

    // note that the drop indexes call also invalidates all clientcursors for the namespace,
    // which is important and wanted here
    log() << "compact dropping indexes" << endl;
    Status status = collection->getIndexCatalog()->dropAllIndexes( true );
    if ( !status.isOK() ) {
        errmsg = str::stream() << "compact drop indexes failed: " << status.toString();
        log() << status.toString() << endl;
        return false;
    }

    getDur().commitIfNeeded();

    long long skipped = 0;
    int n = 0;

    // reset data size and record counts to 0 for this namespace
    // as we're about to tally them up again for each new extent
    d->setStats( 0, 0 );

    for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
        skipped += compactExtent(ns, d, *i, n++, nidx, validate, pf, pb,
                                 useDefaultPadding, preservePadding);
        pm.hit();
    }

    if( skipped ) {
        result.append("invalidObjects", skipped);
    }

    verify( d->firstExtent().ext()->xprev.isNull() );

    // indexes will do their own progress meter?
    pm.finished();

    // build indexes
    for( int i = 0; i < nidx; i++ ) {
        killCurrentOp.checkForInterrupt(false);
        BSONObj info = indexSpecs[i];
        log() << "compact create index " << info["key"].Obj().toString() << endl;
        Status status = collection->getIndexCatalog()->createIndex( info, false );
        if ( !status.isOK() ) {
            log() << "failed to create index: " << status.toString();
            uassertStatusOK( status );
        }
    }

    return true;
}
int nExtents() const { int count = 0; for ( DiskLoc i = nsd()->firstExtent; !i.isNull(); i = i.ext()->xnext ) ++count; return count; }
// Truncate a capped collection from the newest end: repeatedly delete the
// newest document until 'end' is the newest, then delete 'end' too when
// 'inclusive'.  Maintains capExtent / capFirstNewRecord / the deleted-record
// bookkeeping as extents empty out.  uasserts (13415) rather than emptying
// the collection completely.
void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
    DEV verify( this == nsdetails(ns) );
    verify( cappedLastDelRecLastExtent().isValid() );

    // We iteratively remove the newest document until the newest document
    // is 'end', then we remove 'end' if requested.
    bool foundLast = false;
    while( 1 ) {
        if ( foundLast ) {
            // 'end' has been found and removed, so break.
            break;
        }
        getDur().commitIfNeeded();
        // 'curr' will point to the newest document in the collection.
        DiskLoc curr = theCapExtent()->lastRecord;
        verify( !curr.isNull() );
        if ( curr == end ) {
            if ( inclusive ) {
                // 'end' has been found, so break next iteration.
                foundLast = true;
            }
            else {
                // 'end' has been found, so break.
                break;
            }
        }

        // TODO The algorithm used in this function cannot generate an
        // empty collection, but we could call emptyCappedCollection() in
        // this case instead of asserting.
        uassert( 13415, "emptying the collection is not allowed", _stats.nrecords > 1 );

        // Delete the newest record, and coalesce the new deleted
        // record with existing deleted records.
        theDataFileMgr.deleteRecord(this, ns, curr.rec(), curr, true);
        compact();

        // This is the case where we have not yet had to remove any
        // documents to make room for other documents, and we are allocating
        // documents from free space in fresh extents instead of reusing
        // space from familiar extents.
        if ( !capLooped() ) {

            // We just removed the last record from the 'capExtent', and
            // the 'capExtent' can't be empty, so we set 'capExtent' to
            // capExtent's prev extent.
            if ( theCapExtent()->lastRecord.isNull() ) {
                verify( !theCapExtent()->xprev.isNull() );

                // NOTE Because we didn't delete the last document, and
                // capLooped() is false, capExtent is not the first extent
                // so xprev will be nonnull.
                _capExtent.writing() = theCapExtent()->xprev;
                theCapExtent()->assertOk();

                // update cappedLastDelRecLastExtent()
                cappedTruncateLastDelUpdate();
            }
            continue;
        }

        // This is the case where capLooped() is true, and we just deleted
        // from capExtent, and we just deleted capFirstNewRecord, which was
        // the last record on the fresh side of capExtent.
        // NOTE In this comparison, curr and potentially capFirstNewRecord
        // may point to invalid data, but we can still compare the
        // references themselves.
        if ( curr == _capFirstNewRecord ) {

            // Set 'capExtent' to the first nonempty extent prior to the
            // initial capExtent.  There must be such an extent because we
            // have not deleted the last document in the collection.  It is
            // possible that all extents other than the capExtent are empty.
            // In this case we will keep the initial capExtent and specify
            // that all records contained within are on the fresh rather than
            // stale side of the extent.
            DiskLoc newCapExtent = _capExtent;
            do {
                // Find the previous extent, looping if necessary.
                newCapExtent = ( newCapExtent == _firstExtent ) ? _lastExtent : newCapExtent.ext()->xprev;
                newCapExtent.ext()->assertOk();
            }
            while ( newCapExtent.ext()->firstRecord.isNull() );
            _capExtent.writing() = newCapExtent;

            // Place all documents in the new capExtent on the fresh side
            // of the capExtent by setting capFirstNewRecord to the first
            // document in the new capExtent.
            _capFirstNewRecord.writing() = theCapExtent()->firstRecord;

            // update cappedLastDelRecLastExtent()
            cappedTruncateLastDelUpdate();
        }
    }
}
bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate, BSONObjBuilder& result, double pf, int pb) { //int les = d->lastExtentSize; // this is a big job, so might as well make things tidy before we start just to be nice. getDur().commitNow(); list<DiskLoc> extents; for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext ) extents.push_back(L); log() << "compact " << extents.size() << " extents" << endl; ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) ); // same data, but might perform a little different after compact? NamespaceDetailsTransient::get(ns).clearQueryCache(); int nidx = d->nIndexes; scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] ); scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] ); { NamespaceDetails::IndexIterator ii = d->ii(); int x = 0; while( ii.more() ) { BSONObjBuilder b; IndexDetails& idx = ii.next(); BSONObj::iterator i(idx.info.obj()); while( i.more() ) { BSONElement e = i.next(); if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) { b.append(e); } } BSONObj o = b.obj().getOwned(); phase1[x].sorter.reset( new BSONObjExternalSorter( idx.idxInterface(), o.getObjectField("key") ) ); phase1[x].sorter->hintNumObjects( d->stats.nrecords ); indexSpecs[x++].reset(o); } } log() << "compact orphan deleted lists" << endl; for( int i = 0; i < Buckets; i++ ) { d->deletedList[i].writing().Null(); } // before dropping indexes, at least make sure we can allocate one extent! 
uassert(14025, "compact error no space available to allocate", !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull()); // note that the drop indexes call also invalidates all clientcursors for the namespace, which is important and wanted here log() << "compact dropping indexes" << endl; BSONObjBuilder b; if( !dropIndexes(d, ns, "*", errmsg, b, true) ) { errmsg = "compact drop indexes failed"; log() << errmsg << endl; return false; } getDur().commitNow(); long long skipped = 0; int n = 0; for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) { skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate, pf, pb); pm.hit(); } if( skipped ) { result.append("invalidObjects", skipped); } assert( d->firstExtent.ext()->xprev.isNull() ); // indexes will do their own progress meter? pm.finished(); // build indexes NamespaceString s(ns); string si = s.db + ".system.indexes"; for( int i = 0; i < nidx; i++ ) { killCurrentOp.checkForInterrupt(false); BSONObj info = indexSpecs[i].info; log() << "compact create index " << info["key"].Obj().toString() << endl; try { precalced = &phase1[i]; theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize()); } catch(...) { precalced = 0; throw; } precalced = 0; } return true; }
/**
 * Compacts this collection in place: snapshots the extent list, captures
 * adjusted index specs, orphans the deleted-record freelists, drops all
 * indexes, rewrites each extent, then bulk-rebuilds the indexes.
 *
 * @param compactOptions forwarded to _compactExtent() for each extent.
 * @return the accumulated CompactStats on success; BadValue for capped
 *         collections or when index builds are in progress; InternalError if
 *         no space can be allocated; otherwise any Status from index
 *         drop/rebuild.
 *
 * NOTE(review): caller presumably holds an exclusive lock on this collection —
 * nothing here synchronizes; confirm at call sites.
 */
StatusWith<CompactStats> Collection::compact( const CompactOptions* compactOptions ) {

    if ( isCapped() )
        return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                         "cannot compact capped collection" );

    if ( _indexCatalog.numIndexesInProgress() )
        return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                         "cannot compact when indexes in progress" );

    NamespaceDetails* d = details();

    // this is a big job, so might as well make things tidy before we start just to be nice.
    getDur().commitIfNeeded();

    // Snapshot the extent chain before we start rewriting it.
    list<DiskLoc> extents;
    for( DiskLoc L = d->firstExtent(); !L.isNull(); L = L.ext()->xnext )
        extents.push_back(L);
    log() << "compact " << extents.size() << " extents" << endl;

    // same data, but might perform a little different after compact?
    _infoCache.reset();

    // Capture the (adjusted) spec of every index, including unfinished-looking
    // ones excluded by the 'false' argument, so they can be rebuilt afterwards.
    vector<BSONObj> indexSpecs;
    {
        IndexCatalog::IndexIterator ii( _indexCatalog.getIndexIterator( false ) );
        while ( ii.more() ) {
            IndexDescriptor* descriptor = ii.next();
            indexSpecs.push_back( _compactAdjustIndexSpec( descriptor->infoObj() ) );
        }
    }

    // Every record will be rewritten, so the existing freelists would only
    // point at garbage.
    log() << "compact orphan deleted lists" << endl;
    d->orphanDeletedList();

    // Start over from scratch with our extent sizing and growth
    d->setLastExtentSize( 0 );

    // before dropping indexes, at least make sure we can allocate one extent!
    if ( allocateSpaceForANewRecord( _ns.ns().c_str(), d, Record::HeaderSize+1, false).isNull() ) {
        return StatusWith<CompactStats>( ErrorCodes::InternalError,
                                         "compact error no space available to allocate" );
    }

    // note that the drop indexes call also invalidates all clientcursors for the namespace,
    // which is important and wanted here
    log() << "compact dropping indexes" << endl;
    Status status = _indexCatalog.dropAllIndexes( true );
    if ( !status.isOK() ) {
        return StatusWith<CompactStats>( status );
    }

    getDur().commitIfNeeded();

    CompactStats stats;

    // Set up empty replacement indexes; prefer a bulk builder when the access
    // method provides one ('bulkToCommit' pairs it with its real access method
    // for the commit pass below).
    OwnedPointerVector<IndexCatalog::IndexBuildBlock> indexBuildBlocks;
    vector<IndexAccessMethod*> indexesToInsertTo;
    vector< std::pair<IndexAccessMethod*,IndexAccessMethod*> > bulkToCommit;
    for ( size_t i = 0; i < indexSpecs.size(); i++ ) {
        killCurrentOp.checkForInterrupt(false);
        BSONObj info = indexSpecs[i];
        info = _compactAdjustIndexSpec( info );
        info = _indexCatalog.fixIndexSpec( info );
        auto_ptr<IndexCatalog::IndexBuildBlock> block( new IndexCatalog::IndexBuildBlock( this,info ) );
        Status status = block->init();
        if ( !status.isOK() )
            return StatusWith<CompactStats>(status);

        IndexAccessMethod* accessMethod = block->getEntry()->accessMethod();
        status = accessMethod->initializeAsEmpty();
        if ( !status.isOK() )
            return StatusWith<CompactStats>(status);

        IndexAccessMethod* bulk = accessMethod->initiateBulk();
        if ( bulk ) {
            indexesToInsertTo.push_back( bulk );
            bulkToCommit.push_back( std::pair<IndexAccessMethod*,IndexAccessMethod*>( accessMethod, bulk ) );
        }
        else {
            indexesToInsertTo.push_back( accessMethod );
        }

        indexBuildBlocks.mutableVector().push_back( block.release() );
    }

    // reset data size and record counts to 0 for this namespace
    // as we're about to tally them up again for each new extent
    d->setStats( 0, 0 );

    ProgressMeterHolder pm(cc().curop()->setMessage("compact extent",
                                                    "Extent Compacting Progress",
                                                    extents.size()));

    // Rewrite each extent, inserting keys into the new indexes as we go.
    int extentNumber = 0;
    for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
        _compactExtent(*i, extentNumber++, indexesToInsertTo, compactOptions, &stats );
        pm.hit();
    }

    verify( d->firstExtent().ext()->xprev.isNull() );

    // indexes will do their own progress meter?
    pm.finished();

    log() << "starting index commits";

    // Commit the bulk-built indexes, then mark every index build successful.
    for ( size_t i = 0; i < bulkToCommit.size(); i++ ) {
        bulkToCommit[i].first->commitBulk( bulkToCommit[i].second, false, NULL );
    }

    for ( size_t i = 0; i < indexBuildBlocks.size(); i++ ) {
        IndexCatalog::IndexBuildBlock* block = indexBuildBlocks.mutableVector()[i];
        block->success();
    }

    return StatusWith<CompactStats>( stats );
}
/**
 * renameCollection command (legacy pdfile variant).
 *
 * Same-database renames are delegated to Database::renameCollection().
 * Cross-database renames copy every document and every index spec through a
 * DBDirectClient into the target, then drop the source.
 *
 * @param dbname  database the command was run against (used to stop index builds).
 * @param cmdObj  expects { <name>: <sourceNs>, to: <targetNs>,
 *                dropTarget?: bool, stayTemp?: bool }.
 * @param errmsg  set on failure.
 * @return true on success.
 *
 * Throws via uassert on: invalid target name (15967), over-long target name
 * (16451), missing source (10026), existing target without dropTarget (10027).
 */
virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg,
                 BSONObjBuilder& result, bool fromRepl) {
    string source = cmdObj.getStringField( name.c_str() );
    string target = cmdObj.getStringField( "to" );
    uassert(15967,
            "invalid collection name: " + target,
            NamespaceString::validCollectionComponent(target.c_str()));
    if ( source.empty() || target.empty() ) {
        errmsg = "invalid command syntax";
        return false;
    }

    string sourceDB = nsToDatabase(source);
    string targetDB = nsToDatabase(target);
    string databaseName = sourceDB;
    databaseName += ".system.indexes";

    // The target name plus the longest index name on the source must still fit
    // in a namespace string, since index namespaces are "<collection>.$<index>".
    int longestIndexNameLength = 0;
    vector<BSONObj> oldIndSpec = Helpers::findAll(databaseName, BSON("ns" << source));
    for (size_t i = 0; i < oldIndSpec.size(); ++i) {
        int thisLength = oldIndSpec[i].getField("name").valuesize();
        if (thisLength > longestIndexNameLength) {
            longestIndexNameLength = thisLength;
        }
    }
    unsigned int longestAllowed = maxNamespaceLen - longestIndexNameLength - 1;
    if (target.size() > longestAllowed) {
        StringBuilder sb;
        sb << "collection name length of " << target.size()
           << " exceeds maximum length of " << longestAllowed
           << ", allowing for index names";
        uasserted(16451, sb.str());
    }

    // Inspect the source under its own context; remember capped-ness and total
    // extent size so a capped target can be created with matching capacity.
    bool capped = false;
    long long size = 0;
    std::vector<BSONObj> indexesInProg;
    {
        Client::Context ctx( source );
        NamespaceDetails *nsd = nsdetails( source );
        uassert( 10026 , "source namespace does not exist", nsd );
        indexesInProg = stopIndexBuilds(dbname, cmdObj);
        capped = nsd->isCapped();
        if ( capped )
            for( DiskLoc i = nsd->firstExtent(); !i.isNull(); i = i.ext()->xnext )
                size += i.ext()->length;
    }

    Client::Context ctx( target );

    // If the target exists, it must be dropped first (and only when the caller
    // asked for that via dropTarget).
    if ( nsdetails( target ) ) {
        uassert( 10027 , "target namespace exists", cmdObj["dropTarget"].trueValue() );
        Status s = cc().database()->dropCollection( target );
        if ( !s.isOK() ) {
            errmsg = s.toString();
            return false;
        }
    }

    // if we are renaming in the same database, just
    // rename the namespace and we're done.
    {
        if ( sourceDB == targetDB ) {
            Status s = ctx.db()->renameCollection( source, target,
                                                   cmdObj["stayTemp"].trueValue() );
            if ( !s.isOK() ) {
                errmsg = s.toString();
                return false;
            }
            return true;
        }
    }

    // renaming across databases, so we must copy all
    // the data and then remove the source collection.
    BSONObjBuilder spec;
    if ( capped ) {
        spec.appendBool( "capped", true );
        spec.append( "size", double( size ) );
    }
    if ( !userCreateNS( target.c_str(), spec.done(), errmsg, false ) )
        return false;

    // Copy every document from source to target via a direct client cursor.
    auto_ptr< DBClientCursor > c;
    DBDirectClient bridge;

    {
        c = bridge.query( source, BSONObj(), 0, 0, 0,
                          fromRepl ? QueryOption_SlaveOk : 0 );
    }
    while( 1 ) {
        {
            if ( !c->more() )
                break;
        }
        BSONObj o = c->next();
        theDataFileMgr.insertWithObjMod( target.c_str(), o );
    }

    // Copy every index spec, rewriting its "ns" field to point at the target.
    string sourceIndexes = nsToDatabase( source ) + ".system.indexes";
    string targetIndexes = nsToDatabase( target ) + ".system.indexes";
    {
        c = bridge.query( sourceIndexes, QUERY( "ns" << source ), 0, 0, 0,
                          fromRepl ? QueryOption_SlaveOk : 0 );
    }
    while( 1 ) {
        {
            if ( !c->more() )
                break;
        }
        BSONObj o = c->next();
        BSONObjBuilder b;
        BSONObjIterator i( o );
        while( i.moreWithEOO() ) {
            BSONElement e = i.next();
            if ( e.eoo() )
                break;
            if ( strcmp( e.fieldName(), "ns" ) == 0 ) {
                b.append( "ns", target );
            }
            else {
                b.append( e );
            }
        }
        BSONObj n = b.done();
        theDataFileMgr.insertWithObjMod( targetIndexes.c_str(), n );
    }

    // Finally drop the source and restore any index builds that were stopped,
    // now targeting the renamed collection.
    {
        Client::Context ctx( source );
        Status s = ctx.db()->dropCollection( source );
        if ( !s.isOK() ) {
            errmsg = s.toString();
            return false;
        }
        IndexBuilder::restoreIndexes(targetIndexes, indexesInProg);
    }

    return true;
}
/**
 * renameCollection command (Collection-API variant).
 *
 * Same-database renames are delegated to Database::renameCollection().
 * Cross-database renames copy every document through a CollectionIterator,
 * then recreate each index (with its "ns" field rewritten) on the target, and
 * finally drop the source. On any failure after index builds were stopped,
 * they are restored on the source before returning.
 *
 * @param dbname  database the command was run against (used to stop index builds).
 * @param cmdObj  expects { <name>: <sourceNs>, to: <targetNs>,
 *                dropTarget?: bool, stayTemp?: bool }.
 * @param errmsg  set on failure.
 * @return true on success.
 */
virtual bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg,
                 BSONObjBuilder& result, bool fromRepl) {
    string source = cmdObj.getStringField( name.c_str() );
    string target = cmdObj.getStringField( "to" );

    if ( !NamespaceString::validCollectionComponent(target.c_str()) ) {
        errmsg = "invalid collection name: " + target;
        return false;
    }
    if ( source.empty() || target.empty() ) {
        errmsg = "invalid command syntax";
        return false;
    }

    if (!fromRepl) { // If it got through on the master, need to allow it here too
        Status sourceStatus = userAllowedWriteNS(source);
        if (!sourceStatus.isOK()) {
            errmsg = "error with source namespace: " + sourceStatus.reason();
            return false;
        }

        Status targetStatus = userAllowedWriteNS(target);
        if (!targetStatus.isOK()) {
            errmsg = "error with target namespace: " + targetStatus.reason();
            return false;
        }
    }

    string sourceDB = nsToDatabase(source);
    string targetDB = nsToDatabase(target);

    bool capped = false;
    long long size = 0;
    std::vector<BSONObj> indexesInProg;

    {
        Client::Context srcCtx( source );
        Collection* sourceColl = srcCtx.db()->getCollection( source );

        if ( !sourceColl ) {
            errmsg = "source namespace does not exist";
            return false;
        }

        // Ensure that collection name does not exceed maximum length.
        // Ensure that index names do not push the length over the max.
        // Iterator includes unfinished indexes.
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator( true );
        int longestIndexNameLength = 0;
        while ( sourceIndIt.more() ) {
            int thisLength = sourceIndIt.next()->indexName().length();
            if ( thisLength > longestIndexNameLength )
                longestIndexNameLength = thisLength;
        }

        unsigned int longestAllowed =
            min(int(Namespace::MaxNsColletionLen),
                int(Namespace::MaxNsLen) - 2/*strlen(".$")*/ - longestIndexNameLength);
        if (target.size() > longestAllowed) {
            StringBuilder sb;
            sb << "collection name length of " << target.size()
               << " exceeds maximum length of " << longestAllowed
               << ", allowing for index names";
            errmsg = sb.str();
            return false;
        }

        // Remember capped-ness and total extent size so a capped target can be
        // created with matching capacity.
        {
            const NamespaceDetails *nsd = nsdetails( source );
            indexesInProg = stopIndexBuilds( dbname, cmdObj );
            capped = nsd->isCapped();
            if ( capped )
                for( DiskLoc i = nsd->firstExtent(); !i.isNull(); i = i.ext()->xnext )
                    size += i.ext()->length;
        }
    }

    {
        Client::Context ctx( target );

        // Check if the target namespace exists and if dropTarget is true.
        // If target exists and dropTarget is not true, return false.
        if ( ctx.db()->getCollection( target ) ) {
            if ( !cmdObj["dropTarget"].trueValue() ) {
                errmsg = "target namespace exists";
                return false;
            }

            Status s = cc().database()->dropCollection( target );
            if ( !s.isOK() ) {
                errmsg = s.toString();
                restoreIndexBuildsOnSource( indexesInProg, source );
                return false;
            }
        }

        // If we are renaming in the same database, just
        // rename the namespace and we're done.
        if ( sourceDB == targetDB ) {
            Status s = ctx.db()->renameCollection( source, target,
                                                   cmdObj["stayTemp"].trueValue() );
            if ( !s.isOK() ) {
                errmsg = s.toString();
                restoreIndexBuildsOnSource( indexesInProg, source );
                return false;
            }
            return true;
        }

        // Otherwise, we are renaming across databases, so we must copy all
        // the data and then remove the source collection.

        // Create the target collection. The id index is deliberately omitted
        // here; indexes (including _id) are copied from the source below.
        Collection* targetColl = NULL;
        if ( capped ) {
            BSONObjBuilder spec;
            spec.appendBool( "capped", true );
            spec.append( "size", double( size ) );
            spec.appendBool( "autoIndexId", false );
            userCreateNS( target.c_str(), spec.obj(), errmsg, false );
            targetColl = ctx.db()->getCollection( target );
        }
        else {
            CollectionOptions options;
            options.setNoIdIndex();
            // No logOp necessary because the entire renameCollection command is one logOp.
            targetColl = ctx.db()->createCollection( target, options );
        }
        if ( !targetColl ) {
            errmsg = "Failed to create target collection.";
            restoreIndexBuildsOnSource( indexesInProg, source );
            return false;
        }
    }

    // Copy over all the data from source collection to target collection.
    bool insertSuccessful = true;
    boost::scoped_ptr<CollectionIterator> sourceIt;

    {
        Client::Context srcCtx( source );
        Collection* sourceColl = srcCtx.db()->getCollection( source );
        sourceIt.reset( sourceColl->getIterator( DiskLoc(), false, CollectionScanParams::FORWARD ) );
    }

    // NOTE(review): the target Collection* is cached across per-document
    // Context scopes below — presumably safe because nothing in this loop can
    // invalidate it; confirm against Database/Collection lifetime rules.
    Collection* targetColl = NULL;
    while ( !sourceIt->isEOF() ) {
        BSONObj o;
        {
            Client::Context srcCtx( source );
            o = sourceIt->getNext().obj();
        }
        // Insert and check return status of insert.
        {
            Client::Context ctx( target );
            if ( !targetColl )
                targetColl = ctx.db()->getCollection( target );
            // No logOp necessary because the entire renameCollection command is one logOp.
            Status s = targetColl->insertDocument( o, true ).getStatus();
            if ( !s.isOK() ) {
                insertSuccessful = false;
                errmsg = s.toString();
                break;
            }
        }
    }

    // If inserts were unsuccessful, drop the target collection and return false.
    if ( !insertSuccessful ) {
        Client::Context ctx( target );
        Status s = ctx.db()->dropCollection( target );
        if ( !s.isOK() )
            errmsg = s.toString();
        restoreIndexBuildsOnSource( indexesInProg, source );
        return false;
    }

    // Copy over the indexes to temp storage and then to the target.
    vector<BSONObj> copiedIndexes;
    bool indexSuccessful = true;
    {
        Client::Context srcCtx( source );
        Collection* sourceColl = srcCtx.db()->getCollection( source );
        IndexCatalog::IndexIterator sourceIndIt =
            sourceColl->getIndexCatalog()->getIndexIterator( true );

        while ( sourceIndIt.more() ) {
            BSONObj currIndex = sourceIndIt.next()->infoObj();

            // Process the source index: rewrite its "ns" field to the target.
            BSONObjBuilder b;
            BSONObjIterator i( currIndex );
            while( i.moreWithEOO() ) {
                BSONElement e = i.next();
                if ( e.eoo() )
                    break;
                else if ( strcmp( e.fieldName(), "ns" ) == 0 )
                    b.append( "ns", target );
                else
                    b.append( e );
            }

            BSONObj newIndex = b.obj();
            copiedIndexes.push_back( newIndex );
        }
    }

    {
        Client::Context ctx( target );
        if ( !targetColl )
            targetColl = ctx.db()->getCollection( target );

        for ( vector<BSONObj>::iterator it = copiedIndexes.begin();
                                        it != copiedIndexes.end(); ++it ) {
            Status s = targetColl->getIndexCatalog()->createIndex( *it, true );
            if ( !s.isOK() ) {
                indexSuccessful = false;
                errmsg = s.toString();
                break;
            }
        }

        // If indexes were unsuccessful, drop the target collection and return false.
        if ( !indexSuccessful ) {
            Status s = ctx.db()->dropCollection( target );
            if ( !s.isOK() )
                errmsg = s.toString();
            restoreIndexBuildsOnSource( indexesInProg, source );
            return false;
        }
    }

    // Drop the source collection.
    {
        Client::Context srcCtx( source );
        Status s = srcCtx.db()->dropCollection( source );
        if ( !s.isOK() ) {
            errmsg = s.toString();
            restoreIndexBuildsOnSource( indexesInProg, source );
            return false;
        }
    }

    return true;
}