void FSyncLockThread::doRealWork() {
    SimpleMutex::scoped_lock lkf(filesLockedFsync);
    Lock::GlobalWrite global(true/*stopGreed*/);
    SimpleMutex::scoped_lock lk(fsyncCmd.m);
    verify( ! fsyncCmd.locked ); // impossible to get here if locked is true
    try {
        getDur().syncDataAndTruncateJournal();
    }
    catch( std::exception& e ) {
        error() << "error doing syncDataAndTruncateJournal: " << e.what() << endl;
        fsyncCmd.err = e.what();
        fsyncCmd._threadSync.notify_one();
        fsyncCmd.locked = false;
        return;
    }
    global.downgrade();
    try {
        MemoryMappedFile::flushAll(true);
    }
    catch( std::exception& e ) {
        error() << "error doing flushAll: " << e.what() << endl;
        fsyncCmd.err = e.what();
        fsyncCmd._threadSync.notify_one();
        fsyncCmd.locked = false;
        return;
    }
    verify( ! fsyncCmd.locked );
    fsyncCmd.locked = true;
    fsyncCmd._threadSync.notify_one();
    while ( ! fsyncCmd.pendingUnlock ) {
        fsyncCmd._unlockSync.wait(fsyncCmd.m);
    }
    fsyncCmd.pendingUnlock = false;
    fsyncCmd.locked = false;
    fsyncCmd.err = "unlocked";
    fsyncCmd._unlockSync.notify_one();
}
void IndexRebuilder::checkDB(const std::string& dbName, bool* firstTime) {
    const std::string systemNS = dbName + ".system.namespaces";
    DBDirectClient cli;
    scoped_ptr<DBClientCursor> cursor(cli.query(systemNS, Query()));

    // This depends on system.namespaces not changing while we iterate
    while (cursor->more()) {
        BSONObj nsDoc = cursor->next();
        const char* ns = nsDoc["name"].valuestrsafe();

        Client::WriteContext ctx(ns);
        NamespaceDetails* nsd = nsdetails(ns);
        if (!nsd || !nsd->indexBuildsInProgress) {
            continue;
        }

        log() << "Found interrupted index build on " << ns << endl;
        if (*firstTime) {
            log() << "Restart the server with --noIndexBuildRetry to skip index rebuilds"
                  << endl;
            *firstTime = false;
        }

        // If the indexBuildRetry flag isn't set, just clear the inProg flag
        if (!cmdLine.indexBuildRetry) {
            // If we crash between unsetting the inProg flag and cleaning up the index, the
            // index space will be lost.
            int inProg = nsd->indexBuildsInProgress;
            getDur().writingInt(nsd->indexBuildsInProgress) = 0;
            for (int i = 0; i < inProg; i++) {
                nsd->idx(nsd->nIndexes+i).kill_idx();
            }
            continue;
        }

        // We go from right to left building these indexes, so that indexBuildInProgress-- has
        // the correct effect of "popping" an index off the list.
        while (nsd->indexBuildsInProgress > 0) {
            retryIndexBuild(dbName, nsd, nsd->nIndexes+nsd->indexBuildsInProgress-1);
        }
    }
}
NOINLINE_DECL void insertMulti(bool keepGoing, const char *ns, vector<BSONObj>& objs, CurOp& op) {
    size_t i;
    for (i=0; i<objs.size(); i++){
        try {
            checkAndInsert(ns, objs[i]);
            getDur().commitIfNeeded();
        } catch (const UserException&) {
            if (!keepGoing || i == objs.size()-1){
                globalOpCounters.incInsertInWriteLock(i);
                throw;
            }
            // otherwise ignore and keep going
        }
    }

    globalOpCounters.incInsertInWriteLock(i);
    op.debug().ninserted = i;
}
/**
 * Perform a single insert into a collection.  Requires the insert be preprocessed and the
 * collection already has been created.
 *
 * Might fault or error, otherwise populates the result.
 */
static void singleInsert( const BSONObj& docToInsert,
                          Collection* collection,
                          WriteOpResult* result ) {

    const string& insertNS = collection->ns().ns();

    Lock::assertWriteLocked( insertNS );

    StatusWith<DiskLoc> status = collection->insertDocument( docToInsert, true );

    if ( !status.isOK() ) {
        result->setError(toWriteError(status.getStatus()));
    }
    else {
        logOp( "i", insertNS.c_str(), docToInsert );
        getDur().commitIfNeeded();
        result->getStats().n = 1;
    }
}
void DurRecoveryUnit::commitUnitOfWork() {
#if ROLLBACK_ENABLED
    invariant(_state != MUST_ROLLBACK);
    invariant(_nestingLevel > 0);

    if (_nestingLevel != 1) {
        // If we are nested, punt to outer UnitOfWork. These changes will only be pushed to the
        // global damages list when the outer UnitOfWork commits (which it must now do).
        if (haveUncommitedChanges())
            _state = MUST_COMMIT;
        return;
    }

    publishChanges();
#endif

    // global journal flush
    getDur().commitIfNeeded(_txn);
}
void DurRecoveryUnit::commitUnitOfWork() {
    invariant(inAUnitOfWork());
    invariant(!_mustRollback);

    if (!inOutermostUnitOfWork()) {
        // If we are nested, make all changes for this level part of the containing UnitOfWork.
        // They will be added to the global damages list once the outermost UnitOfWork commits,
        // which it must now do.
        if (haveUncommitedChangesAtCurrentLevel()) {
            _startOfUncommittedChangesForLevel.back() = _changes.size();
        }
        return;
    }

    publishChanges();

    // global journal flush opportunity
    getDur().commitIfNeeded(_txn);
}
IndexCatalog::IndexBuildBlock* halfAddIndex(const std::string& key) {
    string name = key + "_1";
    BSONObj indexInfo = BSON( "v" << 1 <<
                              "key" << BSON( key << 1 ) <<
                              "ns" << _ns <<
                              "name" << name );
    int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
    const char* systemIndexes = "unittests.system.indexes";
    DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                  nsdetails( systemIndexes ),
                                                  lenWHdr,
                                                  false );
    Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                         lenWHdr ) );
    memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
    addRecordToRecListInExtent( infoRecord, infoLoc );

    return new IndexCatalog::IndexBuildBlock( _ctx.ctx().db()->getCollection( _ns )->getIndexCatalog(),
                                              name,
                                              infoLoc );
}
/** given a BSON object, create a new one at dst which is the existing (partial) object with a
    new object element appended at the end with fieldname "o".

    @param partial already built object with everything except the o member.  e.g. something like:
               { ts:..., ns:..., os2:... }
    @param o a bson object to be added with fieldname "o"
    @param dst where to put the newly built combined object.  e.g. ends up as something like:
               { ts:..., ns:..., os2:..., o:... }
*/
void append_O_Obj(char *dst, const BSONObj& partial, const BSONObj& o) {
    const int size1 = partial.objsize() - 1;  // less the EOO char
    const int oOfs = size1+3;                 // 3 = byte BSONOBJTYPE + byte 'o' + byte \0

    void *p = getDur().writingPtr(dst, oOfs+o.objsize()+1);

    memcpy(p, partial.objdata(), size1);

    // adjust overall bson object size for the o: field
    *(static_cast<unsigned*>(p)) += o.objsize() + 1/*fieldtype byte*/ + 2/*"o" fieldname*/;

    char *b = static_cast<char *>(p);
    b += size1;
    *b++ = (char) Object;
    *b++ = 'o'; // { o : ... }
    *b++ = 0;   // null terminate "o" fieldname
    memcpy(b, o.objdata(), o.objsize());
    b += o.objsize();
    *b = EOO;
}
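As a worked example of the size arithmetic above (illustrative numbers, not taken from the source): if partial.objsize() is 50 and o.objsize() is 20, then size1 = 49 and oOfs = 52, so the writingPtr region covers 52 + 20 + 1 = 73 bytes. The length prefix copied from partial is bumped by 20 + 1 + 2 = 23, giving 73, which matches the final layout: 49 bytes of the partial object (minus its EOO), one type byte, the "o" fieldname plus its NUL terminator, the 20-byte subobject, and a trailing EOO.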
/* assumes already zeroed -- insufficient for block 'reuse' perhaps */
DiskLoc Extent::init(const char *nsname, int _length, int _fileNo, int _offset, bool capped) {
    magic = extentSignature;
    myLoc.set(_fileNo, _offset);
    xnext.Null();
    xprev.Null();
    nsDiagnostic = nsname;
    length = _length;
    firstRecord.Null();
    lastRecord.Null();

    DiskLoc emptyLoc;
    int delRecLength;
    extent_getEmptyLoc(nsname, myLoc, _length, capped, emptyLoc, delRecLength);

    DeletedRecord* empty = getDur().writing(DataFileMgr::getDeletedRecord(emptyLoc));
    empty->lengthWithHeaders() = delRecLength;
    empty->extentOfs() = myLoc.getOfs();
    empty->nextDeleted().Null();
    return emptyLoc;
}
/** @return IndexDetails for a new index on a:1, with the info field populated. */
IndexDetails& addIndexWithInfo() {
    BSONObj indexInfo = BSON( "v" << 1 <<
                              "key" << BSON( "a" << 1 ) <<
                              "ns" << _ns <<
                              "name" << "a_1" );
    int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
    const char* systemIndexes = "unittests.system.indexes";
    DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                  nsdetails( systemIndexes ),
                                                  lenWHdr,
                                                  false );
    Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                         lenWHdr ) );
    memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
    addRecordToRecListInExtent( infoRecord, infoLoc );
    IndexDetails& id = nsdetails( _ns )->getNextIndexDetails( _ns );
    nsdetails( _ns )->addIndex();
    id.info.writing() = infoLoc;
    return id;
}
bool DatabaseHolder::closeAll( const string& path , BSONObjBuilder& result , bool force ) {
    log() << "DatabaseHolder::closeAll path:" << path << endl;
    verify( Lock::isW() );

    getDur().commitNow(); // bad things happen if we close a DB with outstanding writes

    map<string,Database*>& m = _paths[path];
    _size -= m.size();

    set< string > dbs;
    for ( map<string,Database*>::iterator i = m.begin(); i != m.end(); i++ ) {
        wassert( i->second->path() == path );
        dbs.insert( i->first );
    }

    currentClient.get()->getContext()->_clear();

    BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
    int n = 0;
    int nNotClosed = 0;
    for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) {
        string name = *i;
        LOG(2) << "DatabaseHolder::closeAll path:" << path << " name:" << name << endl;
        Client::Context ctx( name , path );
        if( !force && BackgroundOperation::inProgForDb(name) ) {
            log() << "WARNING: can't close database " << name
                  << " because a bg job is in progress - try killOp command" << endl;
            nNotClosed++;
        }
        else {
            Database::closeDatabase( name.c_str() , path );
            bb.append( bb.numStr( n++ ) , name );
        }
    }
    bb.done();
    if( nNotClosed )
        result.append("nNotClosed", nNotClosed);
    else {
        ClientCursor::assertNoCursors();
    }

    return true;
}
/** @return IndexDescriptor for a new index on a:1, with the info field populated. */
IndexDescriptor* addIndexWithInfo() {
    BSONObj indexInfo = BSON( "v" << 1 <<
                              "key" << BSON( "a" << 1 ) <<
                              "ns" << _ns <<
                              "name" << "a_1" );
    int32_t lenWHdr = indexInfo.objsize() + Record::HeaderSize;
    const char* systemIndexes = "unittests.system.indexes";
    DiskLoc infoLoc = allocateSpaceForANewRecord( systemIndexes,
                                                  nsdetails( systemIndexes ),
                                                  lenWHdr,
                                                  false );
    Record* infoRecord = reinterpret_cast<Record*>( getDur().writingPtr( infoLoc.rec(),
                                                                         lenWHdr ) );
    memcpy( infoRecord->data(), indexInfo.objdata(), indexInfo.objsize() );
    addRecordToRecListInExtent( infoRecord, infoLoc );
    IndexCatalog::IndexBuildBlock blk( collection()->getIndexCatalog(), "a_1", infoLoc );
    blk.success();
    return collection()->getIndexCatalog()->findIndexByName( "a_1" );
}
void DurRecoveryUnit::commitChanges() {
    if (!inAUnitOfWork())
        return;

    invariant(!_mustRollback);
    invariant(inOutermostUnitOfWork());
    invariant(_startOfUncommittedChangesForLevel.front().changeIndex == 0);
    invariant(_startOfUncommittedChangesForLevel.front().writeIndex == 0);

    if (getDur().isDurable())
        pushChangesToDurSubSystem();

    for (Changes::const_iterator it = _changes.begin(), end = _changes.end(); it != end; ++it) {
        (*it)->commit();
    }

    // We now reset to a "clean" state without any uncommitted changes.
    _changes.clear();
    _writes.clear();
    _preimageBuffer.clear();
}
bool DatabaseHolder::closeAll(OperationContext* txn, BSONObjBuilder& result, bool force) {
    invariant(txn->lockState()->isW());

    getDur().commitNow(txn); // bad things happen if we close a DB with outstanding writes

    set< string > dbs;
    for ( map<string,Database*>::iterator i = _dbs.begin(); i != _dbs.end(); i++ ) {
        dbs.insert( i->first );
    }

    BSONObjBuilder bb( result.subarrayStart( "dbs" ) );
    int n = 0;
    int nNotClosed = 0;
    for( set< string >::iterator i = dbs.begin(); i != dbs.end(); ++i ) {
        string name = *i;

        LOG(2) << "DatabaseHolder::closeAll name:" << name;
        Client::Context ctx(txn, name);

        if( !force && BackgroundOperation::inProgForDb(name) ) {
            log() << "WARNING: can't close database " << name
                  << " because a bg job is in progress - try killOp command" << endl;
            nNotClosed++;
        }
        else {
            Database::closeDatabase(txn, name.c_str());
            bb.append( bb.numStr( n++ ) , name );
        }
    }

    bb.done();
    if( nNotClosed ) {
        result.append("nNotClosed", nNotClosed);
    }

    return true;
}
~RepairFileDeleter() {
    if ( _success )
        return;

    log() << "cleaning up failed repair "
          << "db: " << _dbName << " path: " << _pathString;

    try {
        getDur().syncDataAndTruncateJournal();
        MongoFile::flushAll(true); // need both in case journaling is disabled
        {
            Client::Context tempContext( _dbName, _pathString );
            Database::closeDatabase( _dbName, _pathString );
        }
        MONGO_ASSERT_ON_EXCEPTION( boost::filesystem::remove_all( _path ) );
    }
    catch ( DBException& e ) {
        error() << "RepairFileDeleter failed to cleanup: " << e;
        error() << "aborting";
        fassertFailed( 17402 );
    }
}
/** write an op to the oplog that is already built.
    todo : make _logOpRS() call this so we don't repeat ourself?
*/
void _logOpObjRS(const BSONObj& op) {
    Lock::DBWrite lk("local");

    const OpTime ts = op["ts"]._opTime();
    long long h = op["h"].numberLong();

    {
        const char *logns = rsoplog;
        if ( rsOplogDetails == 0 ) {
            Client::Context ctx(logns , dbpath);
            localDB = ctx.db();
            verify( localDB );
            rsOplogDetails = nsdetails(logns);
            massert(13389, "local.oplog.rs missing. did you drop it? if so restart server",
                    rsOplogDetails);
        }
        Client::Context ctx(logns , localDB);
        {
            int len = op.objsize();
            Record *r = theDataFileMgr.fast_oplog_insert(rsOplogDetails, logns, len);
            memcpy(getDur().writingPtr(r->data(), len), op.objdata(), len);
        }
        /* todo: now() has code to handle clock skew.  but if the skew server to server is
                 large it will get unhappy.  this code (or code in now() maybe) should be
                 improved.
        */
        if( theReplSet ) {
            if( !(theReplSet->lastOpTimeWritten<ts) ) {
                log() << "replSet error possible failover clock skew issue? "
                      << theReplSet->lastOpTimeWritten.toString() << ' ' << endl;
            }
            theReplSet->lastOpTimeWritten = ts;
            theReplSet->lastH = h;
            ctx.getClient()->setLastOp( ts );
            replset::BackgroundSync::notify();
        }
    }

    OpTime::setLast( ts );
}
DiskLoc Extent::_reuse(const char *nsname, bool capped) {
    LOG(3) << "_reuse extent was:" << nsDiagnostic.toString() << " now:" << nsname << endl;

    if (magic != extentSignature) {
        StringBuilder sb;
        sb << "bad extent signature " << integerToHex(magic)
           << " for namespace '" << nsDiagnostic.toString()
           << "' found in Extent::_reuse";
        msgasserted(10360, sb.str());
    }
    nsDiagnostic = nsname;
    markEmpty();

    DiskLoc emptyLoc;
    int delRecLength;
    extent_getEmptyLoc(nsname, myLoc, length, capped, emptyLoc, delRecLength);

    // todo: some dup code here and below in Extent::init
    DeletedRecord* empty = getDur().writing(DataFileMgr::getDeletedRecord(emptyLoc));
    empty->lengthWithHeaders() = delRecLength;
    empty->extentOfs() = myLoc.getOfs();
    empty->nextDeleted().Null();

    return emptyLoc;
}
/* apply the log op that is in param o
   @return bool success (true) or failure (false)
*/
bool SyncTail::syncApply(const BSONObj &op, bool convertUpdateToUpsert) {
    const char *ns = op.getStringField("ns");
    verify(ns);

    if ( (*ns == '\0') || (*ns == '.') ) {
        // this is ugly
        // this is often a no-op
        // but can't be 100% sure
        if( *op.getStringField("op") != 'n' ) {
            error() << "replSet skipping bad op in oplog: " << op.toString() << rsLog;
        }
        return true;
    }

    bool isCommand(op["op"].valuestrsafe()[0] == 'c');

    boost::scoped_ptr<Lock::ScopedLock> lk;

    if(isCommand) {
        // a command may need a global write lock. so we will conservatively go
        // ahead and grab one here. suboptimal. :-(
        lk.reset(new Lock::GlobalWrite());
    } else {
        // DB level lock for this operation
        lk.reset(new Lock::DBWrite(ns));
    }

    Client::Context ctx(ns, dbpath);
    ctx.getClient()->curop()->reset();
    // For non-initial-sync, we convert updates to upserts
    // to suppress errors when replaying oplog entries.
    bool ok = !applyOperation_inlock(op, true, convertUpdateToUpsert);
    opsAppliedStats.increment();
    getDur().commitIfNeeded();

    return ok;
}
/**
 * Perform a single insert into a collection.  Requires the insert be preprocessed and the
 * collection already has been created.
 *
 * Might fault or error, otherwise populates the result.
 */
static void singleInsert( const BatchItemRef& insertItem,
                          const BSONObj& normalInsert,
                          Collection* collection,
                          WriteOpResult* result ) {

    const string& insertNS = insertItem.getRequest()->getNS();

    Lock::assertWriteLocked( insertNS );

    try {

        // XXX - are we 100% sure that all !OK statuses do not write a document?
        StatusWith<DiskLoc> status = collection->insertDocument( normalInsert, true );

        if ( !status.isOK() ) {
            result->error = toWriteError( status.getStatus() );
        }
        else {
            logOp( "i", insertNS.c_str(), normalInsert );
            getDur().commitIfNeeded();
            result->stats.n = 1;
        }
    }
    catch ( const PageFaultException& ex ) {
        // TODO: An actual data structure that's not an exception for this
        result->fault = new PageFaultException( ex );
    }
    catch ( const DBException& ex ) {
        Status status(ex.toStatus());
        if (ErrorCodes::isInterruption(status.code())) {
            throw;
        }
        result->error = toWriteError(status);
    }
}
Status validateWriteConcern( const WriteConcernOptions& writeConcern ) {

    const bool isJournalEnabled = getDur().isDurable();

    if ( writeConcern.syncMode == WriteConcernOptions::JOURNAL && !isJournalEnabled ) {
        return Status( ErrorCodes::BadValue,
                       "cannot use 'j' option when a host does not have journaling enabled" );
    }

    const bool isConfigServer = serverGlobalParams.configsvr;
    const repl::ReplicationCoordinator::Mode replMode =
            repl::getGlobalReplicationCoordinator()->getReplicationMode();

    if ( isConfigServer || replMode == repl::ReplicationCoordinator::modeNone ) {
        // Note that config servers can be replicated (have an oplog), but we still don't allow
        // w > 1
        if ( writeConcern.wNumNodes > 1 ) {
            return Status( ErrorCodes::BadValue,
                           string( "cannot use 'w' > 1 " ) +
                           ( isConfigServer ? "on a config server host" :
                                              "when a host is not replicated" ) );
        }
    }

    if ( replMode != repl::ReplicationCoordinator::modeReplSet &&
         !writeConcern.wMode.empty() &&
         writeConcern.wMode != "majority" ) {
        return Status( ErrorCodes::BadValue,
                       string( "cannot use non-majority 'w' mode " ) + writeConcern.wMode
                       + " when a host is not a member of a replica set" );
    }

    return Status::OK();
}
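To make the validation rules above concrete (illustrative scenarios, not taken from the source): a write concern of { j: true } is rejected when the host runs without journaling; { w: 3 } is rejected on a standalone host or on a config server (even though config servers may have an oplog); and a tag-based mode such as { w: "myTag" } is rejected when the host is not a replica-set member, while { w: "majority" } still passes this check in that case.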
uint64_t BtreeBasedBuilder::fastBuildIndex(const char* ns, NamespaceDetails* d,
                                           IndexDetails& idx, bool mayInterrupt,
                                           int idxNo) {
    CurOp * op = cc().curop();

    Timer t;

    tlog(1) << "fastBuildIndex " << ns << ' ' << idx.info.obj().toString() << endl;

    bool dupsAllowed = !idx.unique() || ignoreUniqueIndex(idx);
    bool dropDups = idx.dropDups() || inDBRepair;
    BSONObj order = idx.keyPattern();

    getDur().writingDiskLoc(idx.head).Null();

    if ( logLevel > 1 ) printMemInfo( "before index start" );

    /* get and sort all the keys ----- */
    ProgressMeterHolder pm(op->setMessage("index: (1/3) external sort",
                                          "Index: (1/3) External Sort Progress",
                                          d->stats.nrecords,
                                          10));
    SortPhaseOne phase1;
    addKeysToPhaseOne(d, ns, idx, order, &phase1, d->stats.nrecords, pm.get(),
                      mayInterrupt, idxNo );
    pm.finished();

    BSONObjExternalSorter& sorter = *(phase1.sorter);

    if( phase1.multi ) {
        d->setIndexIsMultikey(ns, idxNo);
    }

    if ( logLevel > 1 ) printMemInfo( "before final sort" );
    phase1.sorter->sort( mayInterrupt );
    if ( logLevel > 1 ) printMemInfo( "after final sort" );

    LOG(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles()
                                 << " files " << " in " << t.seconds() << " secs" << endl;

    set<DiskLoc> dupsToDrop;

    /* build index --- */
    if( idx.version() == 0 )
        buildBottomUpPhases2And3<V0>(dupsAllowed, idx, sorter, dropDups, dupsToDrop,
                                     op, &phase1, pm, t, mayInterrupt);
    else if( idx.version() == 1 )
        buildBottomUpPhases2And3<V1>(dupsAllowed, idx, sorter, dropDups, dupsToDrop,
                                     op, &phase1, pm, t, mayInterrupt);
    else
        verify(false);

    if( dropDups )
        log() << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;

    BtreeBasedBuilder::doDropDups(ns, d, dupsToDrop, mayInterrupt);

    return phase1.n;
}
/** @return number of skipped (invalid) documents */
unsigned compactExtent(const char *ns, NamespaceDetails *d, const DiskLoc ext, int n,
                       const scoped_array<IndexSpec> &indexSpecs,
                       scoped_array<SortPhaseOne>& phase1, int nidx, bool validate,
                       double pf, int pb) {
    log() << "compact extent #" << n << endl;
    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent *e = ext.ext();
    e->assertOk();
    assert( e->validates() );
    unsigned skipped = 0;

    {
        // the next/prev pointers within the extent might not be in order so we first page the
        // whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        MAdvise adv(e, e->length, MAdvise::Sequential);
        const char *p = (const char *) e;
        for( int i = 0; i < e->length; i += 4096 ) {
            faux += p[i];
        }
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/ms << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        unsigned totalSize = 0;
        int nrecs = 0;
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                L = recOld->nextInExtent(L);
                nrecs++;
                BSONObj objOld(recOld);

                if( !validate || objOld.valid() ) {
                    unsigned sz = objOld.objsize();

                    oldObjSize += sz;
                    oldObjSizeWithPadding += recOld->netLength();

                    unsigned lenWHdr = sz + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;
                    {
                        lenWPadding = static_cast<unsigned>(pf*lenWPadding);
                        lenWPadding += pb;
                        lenWPadding = lenWPadding & quantizeMask(lenWPadding);
                        if( lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                            lenWPadding = lenWHdr;
                        }
                    }
                    totalSize += lenWPadding;
                    DiskLoc loc = allocateSpaceForANewRecord(ns, d, lenWPadding, false);
                    uassert(14024, "compact error out of space during compaction", !loc.isNull());
                    Record *recNew = loc.rec();
                    recNew = (Record *) getDur().writingPtr(recNew, lenWHdr);
                    addRecordToRecListInExtent(recNew, loc);
                    memcpy(recNew->data, objOld.objdata(), sz);

                    {
                        // extract keys for all indexes we will be rebuilding
                        for( int x = 0; x < nidx; x++ ) {
                            phase1[x].addKeys(indexSpecs[x], objOld, loc);
                        }
                    }
                }
                else {
                    if( ++skipped <= 10 )
                        log() << "compact skipping invalid object" << endl;
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent. it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block
                // doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt(false);
                }
            }
        } // if !L.isNull()

        assert( d->firstExtent == ext );
        assert( d->lastExtent != ext );
        DiskLoc newFirst = e->xnext;
        d->firstExtent.writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        freeExtents(ext,ext);
        getDur().commitIfNeeded();

        {
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact " << nrecs << " documents " << totalSize/1000000.0 << "MB"
                  << " oldPadding: " << op << ' '
                  << static_cast<unsigned>(op*100.0)/100 << endl;
        }
    }

    return skipped;
}
bool _compact(const char *ns, NamespaceDetails *d, string& errmsg, bool validate,
              BSONObjBuilder& result, double pf, int pb) {
    //int les = d->lastExtentSize;

    // this is a big job, so might as well make things tidy before we start just to be nice.
    getDur().commitNow();

    list<DiskLoc> extents;
    for( DiskLoc L = d->firstExtent; !L.isNull(); L = L.ext()->xnext )
        extents.push_back(L);
    log() << "compact " << extents.size() << " extents" << endl;

    ProgressMeterHolder pm( cc().curop()->setMessage( "compact extent" , extents.size() ) );

    // same data, but might perform a little different after compact?
    NamespaceDetailsTransient::get(ns).clearQueryCache();

    int nidx = d->nIndexes;
    scoped_array<IndexSpec> indexSpecs( new IndexSpec[nidx] );
    scoped_array<SortPhaseOne> phase1( new SortPhaseOne[nidx] );
    {
        NamespaceDetails::IndexIterator ii = d->ii();
        int x = 0;
        while( ii.more() ) {
            BSONObjBuilder b;
            IndexDetails& idx = ii.next();
            BSONObj::iterator i(idx.info.obj());
            while( i.more() ) {
                BSONElement e = i.next();
                if( !str::equals(e.fieldName(), "v") && !str::equals(e.fieldName(), "background") ) {
                    b.append(e);
                }
            }
            BSONObj o = b.obj().getOwned();
            phase1[x].sorter.reset( new BSONObjExternalSorter( idx.idxInterface(),
                                                               o.getObjectField("key") ) );
            phase1[x].sorter->hintNumObjects( d->stats.nrecords );
            indexSpecs[x++].reset(o);
        }
    }

    log() << "compact orphan deleted lists" << endl;
    for( int i = 0; i < Buckets; i++ ) {
        d->deletedList[i].writing().Null();
    }

    // before dropping indexes, at least make sure we can allocate one extent!
    uassert(14025, "compact error no space available to allocate",
            !allocateSpaceForANewRecord(ns, d, Record::HeaderSize+1, false).isNull());

    // note that the drop indexes call also invalidates all clientcursors for the namespace,
    // which is important and wanted here
    log() << "compact dropping indexes" << endl;
    BSONObjBuilder b;
    if( !dropIndexes(d, ns, "*", errmsg, b, true) ) {
        errmsg = "compact drop indexes failed";
        log() << errmsg << endl;
        return false;
    }

    getDur().commitNow();

    long long skipped = 0;
    int n = 0;
    for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
        skipped += compactExtent(ns, d, *i, n++, indexSpecs, phase1, nidx, validate, pf, pb);
        pm.hit();
    }

    if( skipped ) {
        result.append("invalidObjects", skipped);
    }

    assert( d->firstExtent.ext()->xprev.isNull() );

    // indexes will do their own progress meter?
    pm.finished();

    // build indexes
    NamespaceString s(ns);
    string si = s.db + ".system.indexes";
    for( int i = 0; i < nidx; i++ ) {
        killCurrentOp.checkForInterrupt(false);
        BSONObj info = indexSpecs[i].info;
        log() << "compact create index " << info["key"].Obj().toString() << endl;
        try {
            precalced = &phase1[i];
            theDataFileMgr.insert(si.c_str(), info.objdata(), info.objsize());
        }
        catch(...) {
            precalced = 0;
            throw;
        }
        precalced = 0;
    }

    return true;
}
void Collection::_compactExtent(const DiskLoc diskloc, int extentNumber,
                                MultiIndexBlock& indexesToInsertTo,
                                const CompactOptions* compactOptions,
                                CompactStats* stats ) {

    log() << "compact begin extent #" << extentNumber
          << " for namespace " << _ns << " " << diskloc;

    unsigned oldObjSize = 0; // we'll report what the old padding was
    unsigned oldObjSizeWithPadding = 0;

    Extent *e = diskloc.ext();
    e->assertOk();
    verify( e->validates(diskloc) );

    {
        // the next/prev pointers within the extent might not be in order so we first
        // page the whole thing in sequentially
        log() << "compact paging in len=" << e->length/1000000.0 << "MB" << endl;
        Timer t;
        size_t length = e->length;

        touch_pages( reinterpret_cast<const char*>(e), length );
        int ms = t.millis();
        if( ms > 1000 )
            log() << "compact end paging in " << ms << "ms "
                  << e->length/1000000.0/t.seconds() << "MB/sec" << endl;
    }

    {
        log() << "compact copying records" << endl;
        long long datasize = 0;
        long long nrecords = 0;
        DiskLoc L = e->firstRecord;
        if( !L.isNull() ) {
            while( 1 ) {
                Record *recOld = L.rec();
                L = getExtentManager()->getNextRecordInExtent(L);
                BSONObj objOld = BSONObj::make(recOld);

                if ( compactOptions->validateDocuments && !objOld.valid() ) {
                    // object is corrupt!
                    log() << "compact skipping corrupt document!";
                    stats->corruptDocuments++;
                }
                else {
                    unsigned docSize = objOld.objsize();

                    nrecords++;
                    oldObjSize += docSize;
                    oldObjSizeWithPadding += recOld->netLength();

                    unsigned lenWHdr = docSize + Record::HeaderSize;
                    unsigned lenWPadding = lenWHdr;

                    switch( compactOptions->paddingMode ) {
                    case CompactOptions::NONE:
                        if ( details()->isUserFlagSet(NamespaceDetails::Flag_UsePowerOf2Sizes) )
                            lenWPadding = details()->quantizePowerOf2AllocationSpace(lenWPadding);
                        break;
                    case CompactOptions::PRESERVE:
                        // if we are preserving the padding, the record should not change size
                        lenWPadding = recOld->lengthWithHeaders();
                        break;
                    case CompactOptions::MANUAL:
                        lenWPadding = compactOptions->computeRecordSize(lenWPadding);
                        if (lenWPadding < lenWHdr || lenWPadding > BSONObjMaxUserSize / 2 ) {
                            lenWPadding = lenWHdr;
                        }
                        break;
                    }

                    CompactDocWriter writer( objOld, lenWPadding );
                    StatusWith<DiskLoc> status = _recordStore->insertRecord( &writer, 0 );
                    uassertStatusOK( status.getStatus() );
                    datasize += _recordStore->recordFor( status.getValue() )->netLength();

                    InsertDeleteOptions options;
                    options.logIfError = false;
                    options.dupsAllowed = true; // in compact we should be doing no checking

                    indexesToInsertTo.insert( objOld, status.getValue(), options );
                }

                if( L.isNull() ) {
                    // we just did the very last record from the old extent. it's still pointed to
                    // by the old extent ext, but that will be fixed below after this loop
                    break;
                }

                // remove the old records (orphan them) periodically so our commit block
                // doesn't get too large
                bool stopping = false;
                RARELY stopping = *killCurrentOp.checkForInterruptNoAssert() != 0;
                if( stopping || getDur().aCommitIsNeeded() ) {
                    e->firstRecord.writing() = L;
                    Record *r = L.rec();
                    getDur().writingInt(r->prevOfs()) = DiskLoc::NullOfs;
                    getDur().commitIfNeeded();
                    killCurrentOp.checkForInterrupt();
                }
            }
        } // if !L.isNull()

        verify( details()->firstExtent() == diskloc );
        verify( details()->lastExtent() != diskloc );
        DiskLoc newFirst = e->xnext;
        details()->firstExtent().writing() = newFirst;
        newFirst.ext()->xprev.writing().Null();
        getDur().writing(e)->markEmpty();
        getExtentManager()->freeExtents( diskloc, diskloc );

        getDur().commitIfNeeded();

        {
            double op = 1.0;
            if( oldObjSize )
                op = static_cast<double>(oldObjSizeWithPadding)/oldObjSize;
            log() << "compact finished extent #" << extentNumber
                  << " containing " << nrecords << " documents ("
                  << datasize/1000000.0 << "MB)"
                  << " oldPadding: " << op << ' '
                  << static_cast<unsigned>(op*100.0)/100;
        }
    }
}
StatusWith<CompactStats> Collection::compact( const CompactOptions* compactOptions ) {

    if ( isCapped() )
        return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                         "cannot compact capped collection" );

    if ( _indexCatalog.numIndexesInProgress() )
        return StatusWith<CompactStats>( ErrorCodes::BadValue,
                                         "cannot compact when indexes in progress" );

    NamespaceDetails* d = details();

    // this is a big job, so might as well make things tidy before we start just to be nice.
    getDur().commitIfNeeded();

    list<DiskLoc> extents;
    for( DiskLoc L = d->firstExtent(); !L.isNull(); L = L.ext()->xnext )
        extents.push_back(L);
    log() << "compact " << extents.size() << " extents" << endl;

    // same data, but might perform a little different after compact?
    _infoCache.reset();

    vector<BSONObj> indexSpecs;
    {
        IndexCatalog::IndexIterator ii( _indexCatalog.getIndexIterator( false ) );
        while ( ii.more() ) {
            IndexDescriptor* descriptor = ii.next();

            const BSONObj spec = _compactAdjustIndexSpec(descriptor->infoObj());
            const BSONObj key = spec.getObjectField("key");
            const Status keyStatus = validateKeyPattern(key);
            if (!keyStatus.isOK()) {
                return StatusWith<CompactStats>(
                    ErrorCodes::CannotCreateIndex,
                    str::stream() << "Cannot rebuild index " << spec << ": "
                                  << keyStatus.reason()
                                  << " For more info see"
                                  << " http://dochub.mongodb.org/core/index-validation");
            }
            indexSpecs.push_back(spec);
        }
    }

    log() << "compact orphan deleted lists" << endl;
    d->orphanDeletedList();

    // Start over from scratch with our extent sizing and growth
    d->setLastExtentSize( 0 );

    // before dropping indexes, at least make sure we can allocate one extent!
    // this will allocate an extent and add to free list
    // if it cannot, it will throw an exception
    increaseStorageSize( _details->lastExtentSize(), true );

    // note that the drop indexes call also invalidates all clientcursors for the namespace,
    // which is important and wanted here
    log() << "compact dropping indexes" << endl;
    Status status = _indexCatalog.dropAllIndexes( true );
    if ( !status.isOK() ) {
        return StatusWith<CompactStats>( status );
    }

    getDur().commitIfNeeded();
    killCurrentOp.checkForInterrupt();

    CompactStats stats;

    MultiIndexBlock multiIndexBlock( this );
    status = multiIndexBlock.init( indexSpecs );
    if ( !status.isOK() )
        return StatusWith<CompactStats>( status );

    // reset data size and record counts to 0 for this namespace
    // as we're about to tally them up again for each new extent
    d->setStats( 0, 0 );

    ProgressMeterHolder pm(cc().curop()->setMessage("compact extent",
                                                    "Extent Compacting Progress",
                                                    extents.size()));

    int extentNumber = 0;
    for( list<DiskLoc>::iterator i = extents.begin(); i != extents.end(); i++ ) {
        _compactExtent(*i, extentNumber++, multiIndexBlock, compactOptions, &stats );
        pm.hit();
    }

    verify( d->firstExtent().ext()->xprev.isNull() );

    // indexes will do their own progress meter?
    pm.finished();

    log() << "starting index commits";

    status = multiIndexBlock.commit();
    if ( !status.isOK() )
        return StatusWith<CompactStats>( status );

    return StatusWith<CompactStats>( stats );
}
void operator()( DBClientCursorBatchIterator &i ) {
    mongolock l( true );
    if ( context ) {
        context->relocked();
    }

    while( i.moreInCurrentBatch() ) {
        if ( n % 128 == 127 /*yield some*/ ) {
            time_t now = time(0);
            if( now - lastLog >= 60 ) {
                // report progress
                if( lastLog )
                    log() << "clone " << to_collection << ' ' << n << endl;
                lastLog = now;
            }
            mayInterrupt( _mayBeInterrupted );
            dbtempreleaseif t( _mayYield );
        }

        BSONObj tmp = i.nextSafe();

        /* assure object is valid.  note this will slow us down a little. */
        if ( !tmp.valid() ) {
            stringstream ss;
            ss << "Cloner: skipping corrupt object from " << from_collection;
            BSONElement e = tmp.firstElement();
            try {
                e.validate();
                ss << " firstElement: " << e;
            }
            catch( ... ) {
                ss << " firstElement corrupt";
            }
            out() << ss.str() << endl;
            continue;
        }

        ++n;

        BSONObj js = tmp;
        if ( isindex ) {
            verify( strstr(from_collection, "system.indexes") );
            js = fixindex(tmp);
            storedForLater->push_back( js.getOwned() );
            continue;
        }

        try {
            theDataFileMgr.insertWithObjMod(to_collection, js);
            if ( logForRepl )
                logOp("i", to_collection, js);
            getDur().commitIfNeeded();
        }
        catch( UserException& e ) {
            log() << "warning: exception cloning object in " << from_collection
                  << ' ' << e.what() << " obj:" << js.toString() << '\n';
        }

        RARELY if ( time( 0 ) - saveLast > 60 ) {
            log() << n << " objects cloned so far from collection " << from_collection << endl;
            saveLast = time( 0 );
        }
    }
}
/* ns:      namespace, e.g. <database>.<collection>
   pattern: the "where" clause / criteria
   justOne: stop after 1 match
   god:     allow access to system namespaces, and don't yield
*/
long long deleteObjects(const char *ns, BSONObj pattern, bool justOneOrig, bool logop, bool god, RemoveSaver * rs ) {
    if( !god ) {
        if ( strstr(ns, ".system.") ) {
            /* note a delete from system.indexes would corrupt the db
               if done here, as there are pointers into those objects in
               NamespaceDetails.
            */
            uassert(12050, "cannot delete from system namespace", legalClientSystemNS( ns , true ) );
        }
        if ( strchr( ns , '$' ) ) {
            log() << "cannot delete from collection with reserved $ in name: " << ns << endl;
            uassert( 10100 , "cannot delete from collection with reserved $ in name",
                     strchr(ns, '$') == 0 );
        }
    }

    {
        NamespaceDetails *d = nsdetails( ns );
        if ( ! d )
            return 0;
        uassert( 10101 , "can't remove from a capped collection" , ! d->capped );
    }

    long long nDeleted = 0;

    shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern, BSONObj(), false, 0 );

    if( !creal->ok() )
        return nDeleted;

    shared_ptr< Cursor > cPtr = creal;
    auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
    cc->setDoingDeletes( true );

    CursorId id = cc->cursorid();

    bool justOne = justOneOrig;
    bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic());

    do {
        // TODO: we can generalize this I believe
        //
        bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) ||
                              pattern.isEmpty() ||
                              isSimpleIdQuery( pattern );
        if ( ! willNeedRecord ) {
            // TODO: this is a total hack right now
            // check if the index full encompasses query
            if ( pattern.nFields() == 1 &&
                 str::equals( pattern.firstElement().fieldName() ,
                              creal->indexKeyPattern().firstElement().fieldName() ) )
                willNeedRecord = true;
        }

        if ( canYield &&
             ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) {
            cc.release(); // has already been deleted elsewhere
            // TODO should we assert or something?
            break;
        }
        if ( !cc->ok() ) {
            break; // if we yielded, could have hit the end
        }

        // this way we can avoid calling updateLocation() every time (expensive)
        // as well as some other nuances handled
        cc->setDoingDeletes( true );

        DiskLoc rloc = cc->currLoc();
        BSONObj key = cc->currKey();

        bool match = creal->currentMatches();
        bool dup = cc->c()->getsetdup(rloc);

        if ( ! cc->advance() )
            justOne = true;

        if ( ! match )
            continue;

        assert( !dup ); // can't be a dup, we deleted it!

        if ( !justOne ) {
            /* NOTE: this is SLOW.  this is not good, noteLocation() was designed to be called
               across getMore blocks.  here we might call millions of times which would be bad.
            */
            cc->c()->prepareToTouchEarlierIterate();
        }

        if ( logop ) {
            BSONElement e;
            if( BSONObj( rloc.rec() ).getObjectID( e ) ) {
                BSONObjBuilder b;
                b.append( e );
                bool replJustOne = true;
                logOp( "d", ns, b.done(), 0, &replJustOne );
            }
            else {
                problem() << "deleted object without id, not logging" << endl;
            }
        }

        if ( rs )
            rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );

        theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);
        nDeleted++;
        if ( justOne ) {
            break;
        }
        cc->c()->recoverFromTouchingEarlierIterate();

        if( !god )
            getDur().commitIfNeeded();

        if( debug && god && nDeleted == 100 )
            log() << "warning high number of deletes with god=true"
                  << " which could use significant memory" << endl;
    }
    while ( cc->ok() );

    if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
        // TODO: remove this and the id declaration above if this doesn't trigger
        // if it does, then i'm very confused (ERH 06/2011)
        error() << "this should be impossible" << endl;
        printStackTrace();
        cc.release();
    }

    return nDeleted;
}
long long Helpers::removeRange( const string& ns , const BSONObj& min , const BSONObj& max ,
                                bool yield , bool maxInclusive , RemoveCallback * callback,
                                bool fromMigrate ) {
    BSONObj keya , keyb;
    BSONObj minClean = toKeyFormat( min , keya );
    BSONObj maxClean = toKeyFormat( max , keyb );
    verify( keya == keyb );

    Client::Context ctx(ns);

    shared_ptr<Cursor> c;
    auto_ptr<ClientCursor> cc;
    {
        NamespaceDetails* nsd = nsdetails( ns.c_str() );
        if ( ! nsd )
            return 0;

        int ii = nsd->findIndexByKeyPattern( keya );
        verify( ii >= 0 );

        IndexDetails& i = nsd->idx( ii );

        c.reset( BtreeCursor::make( nsd , ii , i , minClean , maxClean , maxInclusive, 1 ) );
        cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
        cc->setDoingDeletes( true );
    }

    long long num = 0;

    while ( cc->ok() ) {

        if ( yield && ! cc->yieldSometimes( ClientCursor::WillNeed) ) {
            // cursor got finished by someone else, so we're done
            cc.release(); // if the collection/db is dropped, cc may be deleted
            break;
        }

        if ( ! cc->ok() )
            break;

        DiskLoc rloc = cc->currLoc();

        if ( callback )
            callback->goingToDelete( cc->current() );

        cc->advance();

        // SERVER-5198 Additional advancement is unnecessary for a single btree cursor, and see
        // SERVER-5725.
        c->prepareToTouchEarlierIterate();

        logOp( "d" , ns.c_str() , rloc.obj()["_id"].wrap() , 0 , 0 , fromMigrate );
        theDataFileMgr.deleteRecord(ns.c_str() , rloc.rec(), rloc);
        num++;

        c->recoverFromTouchingEarlierIterate();

        getDur().commitIfNeeded();
    }

    return num;
}
Status cloneCollectionAsCapped( Database* db,
                                const string& shortFrom,
                                const string& shortTo,
                                double size,
                                bool temp,
                                bool logForReplication ) {

    string fromNs = db->name() + "." + shortFrom;
    string toNs = db->name() + "." + shortTo;

    Collection* fromCollection = db->getCollection( fromNs );
    if ( !fromCollection )
        return Status( ErrorCodes::NamespaceNotFound,
                       str::stream() << "source collection " << fromNs << " does not exist" );

    if ( db->getCollection( toNs ) )
        return Status( ErrorCodes::NamespaceExists, "to collection already exists" );

    // create new collection
    {
        Client::Context ctx( toNs );
        BSONObjBuilder spec;
        spec.appendBool( "capped", true );
        spec.append( "size", size );
        if ( temp )
            spec.appendBool( "temp", true );

        Status status = userCreateNS( ctx.db(), toNs, spec.done(), logForReplication );
        if ( !status.isOK() )
            return status;
    }

    Collection* toCollection = db->getCollection( toNs );
    invariant( toCollection ); // we created above

    // how much data to ignore because it won't fit anyway
    // datasize and extentSize can't be compared exactly, so add some padding to 'size'
    long long excessSize = static_cast<long long>( fromCollection->dataSize() -
                                                   ( toCollection->storageSize() * 2 ) );

    scoped_ptr<Runner> runner( InternalPlanner::collectionScan(fromNs,
                                                               fromCollection,
                                                               InternalPlanner::FORWARD ) );

    while ( true ) {
        BSONObj obj;
        Runner::RunnerState state = runner->getNext(&obj, NULL);

        switch( state ) {
        case Runner::RUNNER_EOF:
            return Status::OK();
        case Runner::RUNNER_DEAD:
            db->dropCollection( toNs );
            return Status( ErrorCodes::InternalError, "runner turned dead while iterating" );
        case Runner::RUNNER_ERROR:
            return Status( ErrorCodes::InternalError, "runner error while iterating" );
        case Runner::RUNNER_ADVANCED:
            if ( excessSize > 0 ) {
                excessSize -= ( 4 * obj.objsize() ); // 4x is for padding, power of 2, etc...
                continue;
            }

            toCollection->insertDocument( obj, true );
            if ( logForReplication )
                logOp( "i", toNs.c_str(), obj );
            getDur().commitIfNeeded();
        }
    }

    invariant( false ); // unreachable
}
void NamespaceDetails::cappedTruncateAfter(const char *ns, DiskLoc end, bool inclusive) {
    DEV verify( this == nsdetails(ns) );
    verify( cappedLastDelRecLastExtent().isValid() );

    // We iteratively remove the newest document until the newest document
    // is 'end', then we remove 'end' if requested.
    bool foundLast = false;
    while( 1 ) {
        if ( foundLast ) {
            // 'end' has been found and removed, so break.
            break;
        }
        getDur().commitIfNeeded();
        // 'curr' will point to the newest document in the collection.
        DiskLoc curr = theCapExtent()->lastRecord;
        verify( !curr.isNull() );
        if ( curr == end ) {
            if ( inclusive ) {
                // 'end' has been found, so break next iteration.
                foundLast = true;
            }
            else {
                // 'end' has been found, so break.
                break;
            }
        }

        // TODO The algorithm used in this function cannot generate an
        // empty collection, but we could call emptyCappedCollection() in
        // this case instead of asserting.
        uassert( 13415, "emptying the collection is not allowed", _stats.nrecords > 1 );

        // Delete the newest record, and coalesce the new deleted
        // record with existing deleted records.
        theDataFileMgr.deleteRecord(this, ns, curr.rec(), curr, true);
        compact();

        // This is the case where we have not yet had to remove any
        // documents to make room for other documents, and we are allocating
        // documents from free space in fresh extents instead of reusing
        // space from familiar extents.
        if ( !capLooped() ) {

            // We just removed the last record from the 'capExtent', and
            // the 'capExtent' can't be empty, so we set 'capExtent' to
            // capExtent's prev extent.
            if ( theCapExtent()->lastRecord.isNull() ) {
                verify( !theCapExtent()->xprev.isNull() );
                // NOTE Because we didn't delete the last document, and
                // capLooped() is false, capExtent is not the first extent
                // so xprev will be nonnull.
                _capExtent.writing() = theCapExtent()->xprev;
                theCapExtent()->assertOk();

                // update cappedLastDelRecLastExtent()
                cappedTruncateLastDelUpdate();
            }
            continue;
        }

        // This is the case where capLooped() is true, and we just deleted
        // from capExtent, and we just deleted capFirstNewRecord, which was
        // the last record on the fresh side of capExtent.
        // NOTE In this comparison, curr and potentially capFirstNewRecord
        // may point to invalid data, but we can still compare the
        // references themselves.
        if ( curr == _capFirstNewRecord ) {

            // Set 'capExtent' to the first nonempty extent prior to the
            // initial capExtent.  There must be such an extent because we
            // have not deleted the last document in the collection.  It is
            // possible that all extents other than the capExtent are empty.
            // In this case we will keep the initial capExtent and specify
            // that all records contained within are on the fresh rather than
            // stale side of the extent.
            DiskLoc newCapExtent = _capExtent;
            do {
                // Find the previous extent, looping if necessary.
                newCapExtent = ( newCapExtent == _firstExtent ) ?
                               _lastExtent : newCapExtent.ext()->xprev;
                newCapExtent.ext()->assertOk();
            }
            while ( newCapExtent.ext()->firstRecord.isNull() );
            _capExtent.writing() = newCapExtent;

            // Place all documents in the new capExtent on the fresh side
            // of the capExtent by setting capFirstNewRecord to the first
            // document in the new capExtent.
            _capFirstNewRecord.writing() = theCapExtent()->firstRecord;

            // update cappedLastDelRecLastExtent()
            cappedTruncateLastDelUpdate();
        }
    }
}