virtual bool run(OperationContext* txn,
                 const string& dbname,
                 BSONObj& cmdObj,
                 int,
                 string& errmsg,
                 BSONObjBuilder& result,
                 bool fromRepl) {
    if (txn->lockState()->isLocked()) {
        errmsg = "fsync: Cannot execute fsync command from contexts that hold a data lock";
        return false;
    }

    bool sync = !cmdObj["async"].trueValue();  // async means do an fsync, but return immediately
    bool lock = cmdObj["lock"].trueValue();
    log() << "CMD fsync: sync:" << sync << " lock:" << lock << endl;

    if (lock) {
        if (!sync) {
            errmsg = "fsync: sync option must be true when using lock";
            return false;
        }

        SimpleMutex::scoped_lock lk(m);
        err = "";

        (new FSyncLockThread())->go();
        while (!locked && err.size() == 0) {
            _threadSync.wait(m);
        }

        if (err.size()) {
            errmsg = err;
            return false;
        }

        log() << "db is now locked, no writes allowed. db.fsyncUnlock() to unlock" << endl;
        log() << " For more info see " << FSyncCommand::url() << endl;
        result.append("info", "now locked against writes, use db.fsyncUnlock() to unlock");
        result.append("seeAlso", FSyncCommand::url());
    } else {
        // the simple fsync command case
        if (sync) {
            // can this be GlobalRead? and if it can, it should be nongreedy.
            ScopedTransaction transaction(txn, MODE_X);
            Lock::GlobalWrite w(txn->lockState());

            // No WriteUnitOfWork needed, as this does no writes of its own.
            getDur().commitNow(txn);
        }

        // Take a global IS lock to ensure the storage engine is not shutdown
        Lock::GlobalLock global(txn->lockState(), MODE_IS, UINT_MAX);
        StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
        result.append("numFiles", storageEngine->flushAllFiles(sync));
    }
    return true;
}
void FSyncLockThread::doRealWork() {
    SimpleMutex::scoped_lock lkf(filesLockedFsync);
    OperationContextImpl txn;
    ScopedTransaction transaction(&txn, MODE_X);
    Lock::GlobalWrite global(txn.lockState());  // No WriteUnitOfWork needed

    SimpleMutex::scoped_lock lk(fsyncCmd.m);

    invariant(!fsyncCmd.locked);  // impossible to get here if locked is true

    try {
        getDur().syncDataAndTruncateJournal(&txn);
    } catch (std::exception& e) {
        error() << "error doing syncDataAndTruncateJournal: " << e.what() << endl;
        fsyncCmd.err = e.what();
        fsyncCmd._threadSync.notify_one();
        fsyncCmd.locked = false;
        return;
    }

    txn.lockState()->downgradeGlobalXtoSForMMAPV1();

    try {
        StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
        storageEngine->flushAllFiles(true);
    } catch (std::exception& e) {
        error() << "error doing flushAll: " << e.what() << endl;
        fsyncCmd.err = e.what();
        fsyncCmd._threadSync.notify_one();
        fsyncCmd.locked = false;
        return;
    }

    invariant(!fsyncCmd.locked);
    fsyncCmd.locked = true;

    fsyncCmd._threadSync.notify_one();

    while (!fsyncCmd.pendingUnlock) {
        fsyncCmd._unlockSync.wait(fsyncCmd.m);
    }
    fsyncCmd.pendingUnlock = false;

    fsyncCmd.locked = false;
    fsyncCmd.err = "unlocked";

    fsyncCmd._unlockSync.notify_one();
}
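The two functions above hand off through a small set of shared members on the fsync command object. A minimal sketch of that shared state, using standard-library types as stand-ins for SimpleMutex and the MongoDB condition variables (member names are taken from their usage above; the real FSyncCommand declares more than this):

#include <condition_variable>
#include <mutex>
#include <string>

struct FSyncSharedStateSketch {           // stand-in for the FSyncCommand members used above
    std::mutex m;                         // guards locked, pendingUnlock and err
    std::condition_variable _threadSync;  // signalled once the lock thread has locked (or failed)
    std::condition_variable _unlockSync;  // signalled when db.fsyncUnlock() requests release
    bool locked = false;                  // true while writes are blocked
    bool pendingUnlock = false;           // set by fsyncUnlock, consumed by the lock thread
    std::string err;                      // non-empty if the lock thread hit an error
};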
void DataFileSync::run() {
    Client::initThread(name().c_str());

    if (storageGlobalParams.syncdelay == 0) {
        log() << "warning: --syncdelay 0 is not recommended and can have strange performance"
              << endl;
    } else if (storageGlobalParams.syncdelay == 1) {
        log() << "--syncdelay 1" << endl;
    } else if (storageGlobalParams.syncdelay != 60) {
        LOG(1) << "--syncdelay " << storageGlobalParams.syncdelay.load() << endl;
    }

    int time_flushing = 0;
    while (!globalInShutdownDeprecated()) {
        _diaglog.flush();
        if (storageGlobalParams.syncdelay == 0) {
            // in case at some point we add an option to change at runtime
            sleepsecs(5);
            continue;
        }

        sleepmillis(
            (long long)std::max(0.0, (storageGlobalParams.syncdelay * 1000) - time_flushing));

        if (globalInShutdownDeprecated()) {
            // occasional issue trying to flush during shutdown when sleep interrupted
            break;
        }

        Date_t start = jsTime();
        StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();

        dur::notifyPreDataFileFlush();
        int numFiles = storageEngine->flushAllFiles(true);
        dur::notifyPostDataFileFlush();

        time_flushing = durationCount<Milliseconds>(jsTime() - start);

        _flushed(time_flushing);

        if (shouldLog(logger::LogSeverity::Debug(1)) || time_flushing >= 10000) {
            log() << "flushing mmaps took " << time_flushing << "ms "
                  << " for " << numFiles << " files" << endl;
        }
    }
}
void DataFileSync::run() {
    Client::initThread( name().c_str() );

    if (mmapv1GlobalOptions.syncdelay == 0) {
        log() << "warning: --syncdelay 0 is not recommended and can have strange performance"
              << endl;
    } else if (mmapv1GlobalOptions.syncdelay == 1) {
        log() << "--syncdelay 1" << endl;
    } else if (mmapv1GlobalOptions.syncdelay != 60) {
        LOG(1) << "--syncdelay " << mmapv1GlobalOptions.syncdelay << endl;
    }

    int time_flushing = 0;
    while ( ! inShutdown() ) {
        _diaglog.flush();
        if (mmapv1GlobalOptions.syncdelay == 0) {
            // in case at some point we add an option to change at runtime
            sleepsecs(5);
            continue;
        }

        sleepmillis((long long)std::max(0.0,
                                        (mmapv1GlobalOptions.syncdelay * 1000) - time_flushing));

        if ( inShutdown() ) {
            // occasional issue trying to flush during shutdown when sleep interrupted
            break;
        }

        Date_t start = jsTime();
        StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
        int numFiles = storageEngine->flushAllFiles( true );
        time_flushing = (int)(jsTime() - start);

        _flushed(time_flushing);

        if ( logger::globalLogDomain()->shouldLog(logger::LogSeverity::Debug(1)) ||
             time_flushing >= 10000 ) {
            log() << "flushing mmaps took " << time_flushing << "ms "
                  << " for " << numFiles << " files" << endl;
        }
    }
}
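Both DataFileSync::run variants above space flushes by sleeping for the configured syncdelay minus however long the previous flush took. A small standalone sketch of just that arithmetic (not MongoDB code; function name is illustrative):

#include <algorithm>
#include <cstdio>

// How long the flusher sleeps before the next flush, in milliseconds.
long long nextSleepMillis(double syncdelaySeconds, int lastFlushMillis) {
    return (long long)std::max(0.0, syncdelaySeconds * 1000 - lastFlushMillis);
}

int main() {
    // With the default --syncdelay 60 and a previous flush that took 2500ms,
    // the thread sleeps 57500ms, so flushes begin roughly every 60 seconds.
    std::printf("%lld\n", nextSleepMillis(60, 2500));   // 57500
    // A flush slower than the interval leads to back-to-back flushes.
    std::printf("%lld\n", nextSleepMillis(60, 65000));  // 0
}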
void createOplog(OperationContext* txn) {
    Lock::GlobalWrite lk(txn->lockState());

    const char* ns = "local.oplog.$main";

    const ReplSettings& replSettings = getGlobalReplicationCoordinator()->getSettings();
    bool rs = !replSettings.replSet.empty();
    if (rs)
        ns = rsoplog;

    Client::Context ctx(txn, ns);
    Collection* collection = ctx.db()->getCollection(txn, ns);

    if (collection) {
        if (replSettings.oplogSize != 0) {
            int o = (int)(collection->getRecordStore()->storageSize(txn) / (1024 * 1024));
            int n = (int)(replSettings.oplogSize / (1024 * 1024));
            if (n != o) {
                stringstream ss;
                ss << "cmdline oplogsize (" << n << ") different than existing (" << o
                   << ") see: http://dochub.mongodb.org/core/increase-oplog";
                log() << ss.str() << endl;
                throw UserException(13257, ss.str());
            }
        }

        if (rs)
            return;

        initOpTimeFromOplog(txn, ns);
        return;
    }

    /* create an oplog collection, if it doesn't yet exist. */
    long long sz = 0;
    if (replSettings.oplogSize != 0) {
        sz = replSettings.oplogSize;
    } else {
        /* not specified. pick a default size */
        sz = 50LL * 1024LL * 1024LL;
        if (sizeof(int*) >= 8) {
#if defined(__APPLE__)
            // typically these are desktops (dev machines), so keep it smallish
            sz = (256 - 64) * 1024 * 1024;
#else
            sz = 990LL * 1024 * 1024;
            double free = File::freeSpace(storageGlobalParams.dbpath);  //-1 if call not supported.
            long long fivePct = static_cast<long long>(free * 0.05);
            if (fivePct > sz)
                sz = fivePct;
            // we use 5% of free space up to 50GB (1TB free)
            static long long upperBound = 50LL * 1024 * 1024 * 1024;
            if (fivePct > upperBound)
                sz = upperBound;
#endif
        }
    }

    log() << "******" << endl;
    log() << "creating replication oplog of size: " << (int)(sz / (1024 * 1024)) << "MB..."
          << endl;

    CollectionOptions options;
    options.capped = true;
    options.cappedSize = sz;
    options.autoIndexId = CollectionOptions::NO;

    WriteUnitOfWork wunit(txn);
    invariant(ctx.db()->createCollection(txn, ns, options));
    if (!rs)
        logOp(txn, "n", "", BSONObj());
    wunit.commit();

    /* sync here so we don't get any surprising lag later when we try to sync */
    StorageEngine* storageEngine = getGlobalEnvironment()->getGlobalStorageEngine();
    storageEngine->flushAllFiles(true);
    log() << "******" << endl;
}
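A standalone sketch of the default-sizing arithmetic used above for a 64-bit non-macOS host (function name is illustrative; it only mirrors the branch in createOplog, it is not a real helper): start at 990MB, grow to 5% of free disk space if that is larger, and cap at 50GB.

// freeBytes < 0 means free-space detection is unsupported, matching File::freeSpace above.
long long defaultOplogSizeBytes(double freeBytes) {
    long long sz = 990LL * 1024 * 1024;                      // 990MB floor
    long long fivePct = static_cast<long long>(freeBytes * 0.05);
    if (fivePct > sz)
        sz = fivePct;                                        // grow to 5% of free space...
    const long long upperBound = 50LL * 1024 * 1024 * 1024;  // ...but never past 50GB
    if (fivePct > upperBound)
        sz = upperBound;
    return sz;
}
// Worked examples: with ~1TB free, 5% (~51GB) exceeds the cap, so 50GB is used;
// with 100GB free the default is 5GB; below roughly 20GB free, the 990MB floor wins.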
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    LOG(2) << "Waiting for write concern. OpTime: " << replOpTime
           << ", write concern: " << writeConcern.toBSON();
    auto replCoord = repl::ReplicationCoordinator::get(txn);

    MONGO_FAIL_POINT_PAUSE_WHILE_SET(hangBeforeWaitingForWriteConcern);

    // Next handle blocking on disk
    Timer syncTimer;
    WriteConcernOptions writeConcernWithPopulatedSyncMode =
        replCoord->populateUnsetWriteConcernOptionsSyncMode(writeConcern);

    switch (writeConcernWithPopulatedSyncMode.syncMode) {
        case WriteConcernOptions::SyncMode::UNSET:
            severe() << "Attempting to wait on a WriteConcern with an unset sync option";
            fassertFailed(34410);
        case WriteConcernOptions::SyncMode::NONE:
            break;
        case WriteConcernOptions::SyncMode::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::SyncMode::JOURNAL:
            if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::Mode::modeNone) {
                // Wait for ops to become durable then update replication system's
                // knowledge of this.
                OpTime appliedOpTime = replCoord->getMyLastAppliedOpTime();
                txn->recoveryUnit()->waitUntilDurable();
                replCoord->setMyLastDurableOpTimeForward(appliedOpTime);
            } else {
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcernWithPopulatedSyncMode.wNumNodes <= 1 &&
        writeConcernWithPopulatedSyncMode.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        replCoord->awaitReplication(txn, replOpTime, writeConcernWithPopulatedSyncMode);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }

    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(
        replOpTime,
        writeConcernWithPopulatedSyncMode.syncMode == WriteConcernOptions::SyncMode::JOURNAL);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}
Status waitForWriteConcern(OperationContext* txn,
                           const OpTime& replOpTime,
                           const WriteConcernOptions& writeConcern,
                           WriteConcernResult* result) {
    // We assume all options have been validated earlier, if not, programming error.
    // Passing localDB name is a hack to avoid more rigorous check that performed for non local DB.
    dassert(validateWriteConcern(txn, writeConcern, kLocalDB).isOK());

    // We should never be waiting for write concern while holding any sort of lock, because this
    // may lead to situations where the replication heartbeats are stalled.
    //
    // This check does not hold for writes done through dbeval because it runs with a global X
    // lock.
    dassert(!txn->lockState()->isLocked() || txn->getClient()->isInDirectClient());

    // Next handle blocking on disk
    Timer syncTimer;
    auto replCoord = repl::getGlobalReplicationCoordinator();
    WriteConcernOptions writeConcernWithPopulatedSyncMode =
        replCoord->populateUnsetWriteConcernOptionsSyncMode(writeConcern);

    switch (writeConcernWithPopulatedSyncMode.syncMode) {
        case WriteConcernOptions::SyncMode::UNSET:
            severe() << "Attempting to wait on a WriteConcern with an unset sync option";
            fassertFailed(34410);
        case WriteConcernOptions::SyncMode::NONE:
            break;
        case WriteConcernOptions::SyncMode::FSYNC: {
            StorageEngine* storageEngine = getGlobalServiceContext()->getGlobalStorageEngine();
            if (!storageEngine->isDurable()) {
                result->fsyncFiles = storageEngine->flushAllFiles(true);
            } else {
                // We only need to commit the journal if we're durable
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
        }
        case WriteConcernOptions::SyncMode::JOURNAL:
            if (replCoord->getReplicationMode() != repl::ReplicationCoordinator::Mode::modeNone) {
                // Wait for ops to become durable then update replication system's
                // knowledge of this.
                OpTime appliedOpTime = replCoord->getMyLastAppliedOpTime();
                txn->recoveryUnit()->waitUntilDurable();
                replCoord->setMyLastDurableOpTimeForward(appliedOpTime);
            } else {
                txn->recoveryUnit()->waitUntilDurable();
            }
            break;
    }

    result->syncMillis = syncTimer.millis();

    // Now wait for replication

    if (replOpTime.isNull()) {
        // no write happened for this client yet
        return Status::OK();
    }

    // needed to avoid incrementing gleWtimeStats SERVER-9005
    if (writeConcernWithPopulatedSyncMode.wNumNodes <= 1 &&
        writeConcernWithPopulatedSyncMode.wMode.empty()) {
        // no desired replication check
        return Status::OK();
    }

    // Now we wait for replication
    // Note that replica set stepdowns and gle mode changes are thrown as errors
    repl::ReplicationCoordinator::StatusAndDuration replStatus =
        repl::getGlobalReplicationCoordinator()->awaitReplication(
            txn, replOpTime, writeConcernWithPopulatedSyncMode);
    if (replStatus.status == ErrorCodes::WriteConcernFailed) {
        gleWtimeouts.increment();
        result->err = "timeout";
        result->wTimedOut = true;
    }

    // Add stats
    result->writtenTo = repl::getGlobalReplicationCoordinator()->getHostsWrittenTo(
        replOpTime,
        writeConcernWithPopulatedSyncMode.syncMode == WriteConcernOptions::SyncMode::JOURNAL);
    gleWtimeStats.recordMillis(durationCount<Milliseconds>(replStatus.duration));
    result->wTime = durationCount<Milliseconds>(replStatus.duration);

    return replStatus.status;
}
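Both waitForWriteConcern variants above short-circuit before waiting on other nodes: if the client wrote nothing, or the write concern is a bare numeric w of 1 (or 0) with no mode string, only the local sync step runs. A standalone illustration of that check (struct and function names are stand-ins, not MongoDB API):

#include <string>

struct SimpleWriteConcern {   // stand-in for the WriteConcernOptions fields used above
    int wNumNodes = 1;        // numeric w
    std::string wMode;        // e.g. "majority"; empty when w is numeric
};

// Mirrors the "no desired replication check" shortcut in the functions above.
bool needsReplicationWait(const SimpleWriteConcern& wc, bool opTimeIsNull) {
    if (opTimeIsNull)
        return false;  // nothing was written by this client yet
    return wc.wNumNodes > 1 || !wc.wMode.empty();
}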
void FSyncLockThread::run() {
    ThreadClient tc("fsyncLockWorker", getGlobalServiceContext());
    stdx::lock_guard<SimpleMutex> lkf(filesLockedFsync);
    stdx::unique_lock<stdx::mutex> lk(fsyncCmd.lockStateMutex);

    invariant(fsyncCmd.getLockCount_inLock() == 1);

    try {
        const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext();
        OperationContext& opCtx = *opCtxPtr;
        Lock::GlobalRead global(&opCtx);  // Block any writes in order to flush the files.

        StorageEngine* storageEngine = getGlobalServiceContext()->getStorageEngine();

        try {
            storageEngine->flushAllFiles(&opCtx, true);
        } catch (const std::exception& e) {
            error() << "error doing flushAll: " << e.what();
            fsyncCmd.threadStatus = Status(ErrorCodes::CommandFailed, e.what());
            fsyncCmd.acquireFsyncLockSyncCV.notify_one();
            return;
        }

        bool successfulFsyncLock = false;
        auto backupCursorHooks = BackupCursorHooks::get(opCtx.getServiceContext());
        try {
            writeConflictRetry(&opCtx,
                               "beginBackup",
                               "global",
                               [&opCtx, backupCursorHooks, &successfulFsyncLock, storageEngine] {
                                   if (backupCursorHooks->enabled()) {
                                       backupCursorHooks->fsyncLock(&opCtx);
                                       successfulFsyncLock = true;
                                   } else {
                                       // Have the uassert be caught by the DBException
                                       // block. Maintain "allowFsyncFailure" compatibility in
                                       // community.
                                       uassertStatusOK(storageEngine->beginBackup(&opCtx));
                                       successfulFsyncLock = true;
                                   }
                               });
        } catch (const DBException& e) {
            if (_allowFsyncFailure) {
                warning() << "Locking despite storage engine being unable to begin backup : "
                          << e.toString();
                opCtx.recoveryUnit()->waitUntilDurable();
            } else {
                error() << "storage engine unable to begin backup : " << e.toString();
                fsyncCmd.threadStatus = e.toStatus();
                fsyncCmd.acquireFsyncLockSyncCV.notify_one();
                return;
            }
        }

        fsyncCmd.threadStarted = true;
        fsyncCmd.acquireFsyncLockSyncCV.notify_one();

        while (fsyncCmd.getLockCount_inLock() > 0) {
            fsyncCmd.releaseFsyncLockSyncCV.wait(lk);
        }

        if (successfulFsyncLock) {
            if (backupCursorHooks->enabled()) {
                backupCursorHooks->fsyncUnlock(&opCtx);
            } else {
                storageEngine->endBackup(&opCtx);
            }
        }
    } catch (const std::exception& e) {
        severe() << "FSyncLockThread exception: " << e.what();
        fassertFailed(40350);
    }
}
virtual bool errmsgRun(OperationContext* opCtx,
                       const string& dbname,
                       const BSONObj& cmdObj,
                       string& errmsg,
                       BSONObjBuilder& result) {
    if (opCtx->lockState()->isLocked()) {
        errmsg = "fsync: Cannot execute fsync command from contexts that hold a data lock";
        return false;
    }

    const bool sync =
        !cmdObj["async"].trueValue();  // async means do an fsync, but return immediately
    const bool lock = cmdObj["lock"].trueValue();
    log() << "CMD fsync: sync:" << sync << " lock:" << lock;

    // fsync + lock is sometimes used to block writes out of the system and does not care if
    // the `BackupCursorService::fsyncLock` call succeeds.
    const bool allowFsyncFailure =
        getTestCommandsEnabled() && cmdObj["allowFsyncFailure"].trueValue();

    if (!lock) {
        // Take a global IS lock to ensure the storage engine is not shutdown
        Lock::GlobalLock global(opCtx, MODE_IS);
        StorageEngine* storageEngine = getGlobalServiceContext()->getStorageEngine();
        result.append("numFiles", storageEngine->flushAllFiles(opCtx, sync));
        return true;
    }

    Lock::ExclusiveLock lk(opCtx->lockState(), commandMutex);

    if (!sync) {
        errmsg = "fsync: sync option must be true when using lock";
        return false;
    }

    const auto lockCountAtStart = getLockCount();
    invariant(lockCountAtStart > 0 || !_lockThread);

    acquireLock();

    if (lockCountAtStart == 0) {
        Status status = Status::OK();
        {
            stdx::unique_lock<stdx::mutex> lk(lockStateMutex);
            threadStatus = Status::OK();
            threadStarted = false;
            _lockThread = stdx::make_unique<FSyncLockThread>(allowFsyncFailure);
            _lockThread->go();

            while (!threadStarted && threadStatus.isOK()) {
                acquireFsyncLockSyncCV.wait(lk);
            }

            // 'threadStatus' must be copied while 'lockStateMutex' is held.
            status = threadStatus;
        }

        if (!status.isOK()) {
            releaseLock();
            warning() << "fsyncLock failed. Lock count reset to 0. Status: " << status;
            uassertStatusOK(status);
        }
    }

    log() << "mongod is locked and no writes are allowed. db.fsyncUnlock() to unlock";
    log() << "Lock count is " << getLockCount();
    log() << " For more info see " << FSyncCommand::url();
    result.append("info", "now locked against writes, use db.fsyncUnlock() to unlock");
    result.append("lockCount", getLockCount());
    result.append("seeAlso", FSyncCommand::url());

    return true;
}
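In the last two functions, fsync {lock: true} nests: acquireLock/releaseLock maintain a count, and the FSyncLockThread keeps writes blocked until the count falls back to zero. A standalone sketch of that bookkeeping as inferred from the getLockCount/acquireLock/releaseLock usage above (class and method names are hypothetical, not the real FSyncCommand members):

#include <mutex>

class FsyncLockCounter {
public:
    void acquire() {                      // one increment per successful fsync {lock: true}
        std::lock_guard<std::mutex> lk(_m);
        ++_count;
    }
    long long release() {                 // called by fsyncUnlock; returns the remaining count
        std::lock_guard<std::mutex> lk(_m);
        if (_count > 0)
            --_count;
        return _count;                    // the lock thread releases only when this reaches 0
    }
    long long count() const {
        std::lock_guard<std::mutex> lk(_m);
        return _count;
    }

private:
    mutable std::mutex _m;
    long long _count = 0;
};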