/** * Remaps the private view from the shared view so that it does not consume too much * copy-on-write/swap space. Must only be called after the in-memory journal has been flushed * to disk and applied on top of the shared view. * * @param fraction Value between (0, 1] indicating what fraction of the memory to remap. * Remapping too much or too frequently incurs copy-on-write page fault cost. */ static void remapPrivateView(double fraction) { // Remapping private views must occur after WRITETODATAFILES otherwise we wouldn't see any // newly written data on reads. invariant(!commitJob.hasWritten()); try { Timer t; remapPrivateViewImpl(fraction); stats.curr()->_remapPrivateViewMicros += t.micros(); LOG(4) << "remapPrivateView end"; return; } catch (DBException& e) { severe() << "dbexception in remapPrivateView causing immediate shutdown: " << e.toString(); } catch (std::ios_base::failure& e) { severe() << "ios_base exception in remapPrivateView causing immediate shutdown: " << e.what(); } catch (std::bad_alloc& e) { severe() << "bad_alloc exception in remapPrivateView causing immediate shutdown: " << e.what(); } catch (std::exception& e) { severe() << "exception in remapPrivateView causing immediate shutdown: " << e.what(); } catch (...) { severe() << "unknown exception in remapPrivateView causing immediate shutdown: "; } invariant(false); }
void DurableImpl::closingFileNotification() { if (commitJob.hasWritten()) { severe() << "journal warning files are closing outside locks with writes pending"; // File is closing while there are unwritten changes invariant(false); } }
/** locking in read lock when called @see MongoMMF::close() */ static void groupCommit() { dbMutex.assertAtLeastReadLocked(); if( !commitJob.hasWritten() ) return; PREPLOGBUFFER(); WRITETOJOURNAL(commitJob._ab); // data is now in the journal, which is sufficient for acknowledging getlasterror. // (ok to crash after that) log() << "TEMP NOTIFYING COMMITTED" << endl; commitJob.notifyCommitted(); // write the noted write intent entries to the data files. // this has to come after writing to the journal, obviously... MongoFile::markAllWritable(); // for _DEBUG. normally we don't write in a read lock WRITETODATAFILES(); if (!dbMutex.isWriteLocked()) MongoFile::unmarkAllWritable(); commitJob.reset(); // REMAPPRIVATEVIEW // // remapping private views must occur after WRITETODATAFILES otherwise // we wouldn't see newly written data on reads. // DEV assert( !commitJob.hasWritten() ); if( !dbMutex.isWriteLocked() ) { // this needs done in a write lock thus we do it on the next acquisition of that // instead of here (there is no rush if you aren't writing anyway -- but it must happen, // if it is done, before any uncommitted writes occur). // dbMutex._remapPrivateViewRequested = true; } else { // however, if we are already write locked, we must do it now -- up the call tree someone // may do a write without a new lock acquisition. this can happen when MongoMMF::close() calls // this method when a file (and its views) is about to go away. // REMAPPRIVATEVIEW(); } }
/** Called when a MongoMMF is closing. If a lock is held we group commit right away, before
    the file's views disappear. Without a lock we are expected to be in shutdown, and any
    still-pending writes are only reported as a warning.
*/
void closingFileNotification() {
    if( !dbMutex.atLeastReadLocked() ) {
        assert( inShutdown() );
        if( commitJob.hasWritten() ) {
            log() << "dur warning files are closing outside locks with writes pending" << endl;
        }
        return;
    }

    groupCommit();
}
/** We need to remap the private views periodically. otherwise they would become very large. Call within write lock. */ void REMAPPRIVATEVIEW() { static unsigned startAt; static unsigned long long lastRemap; dbMutex.assertWriteLocked(); dbMutex._remapPrivateViewRequested = false; assert( !commitJob.hasWritten() ); if( 0 ) { log() << "TEMP remapprivateview disabled for testing - will eventually run oom in this mode if db bigger than ram" << endl; return; } // we want to remap all private views about every 2 seconds. there could be ~1000 views so // we do a little each pass; beyond the remap time, more significantly, there will be copy on write // faults after remapping, so doing a little bit at a time will avoid big load spikes on // remapping. unsigned long long now = curTimeMicros64(); double fraction = (now-lastRemap)/20000000.0; set<MongoFile*>& files = MongoFile::getAllFiles(); unsigned sz = files.size(); if( sz == 0 ) return; unsigned ntodo = (unsigned) (sz * fraction); if( ntodo < 1 ) ntodo = 1; if( ntodo > sz ) ntodo = sz; const set<MongoFile*>::iterator b = files.begin(); const set<MongoFile*>::iterator e = files.end(); set<MongoFile*>::iterator i = b; // skip to our starting position for( unsigned x = 0; x < startAt; x++ ) { i++; if( i == e ) i = b; } startAt = (startAt + ntodo) % sz; // mark where to start next time for( unsigned x = 0; x < ntodo; x++ ) { dassert( i != e ); if( (*i)->isMongoMMF() ) { MongoMMF *mmf = (MongoMMF*) *i; assert(mmf); if( mmf->willNeedRemap() ) { mmf->willNeedRemap() = false; mmf->remapThePrivateView(); } i++; if( i == e ) i = b; } } }
static void go() { if( !commitJob.hasWritten() ) return; { readlocktry lk("", 1000); if( lk.got() ) { groupCommit(); return; } } // starvation on read locks could occur. so if read lock acquisition is slow, try to get a // write lock instead. otherwise writes could use too much RAM. writelock lk; groupCommit(); }
/**
 * The main durability thread loop. There is a single instance of this function running.
 *
 * Until shutdownRequested becomes non-zero, each iteration:
 *   1. waits (in up to three slices of roughly a third of the commit interval each) for a
 *      forced flush, a pending commit waiter, or a growing amount of uncommitted bytes;
 *   2. takes the flush lock and snapshots the commit number;
 *   3. hands either a no-op buffer or the prepared journal buffer to the journal writer;
 *   4. if a remap is warranted, waits for the journal writer to go idle, upgrades the flush
 *      lock to exclusive and remaps the private view.
 *
 * Any exception escaping an iteration is logged and followed by invariant(false).
 *
 * @param cs Clock source; only forwarded to PREPLOGBUFFER here.
 * @param serverStartMs Server start time value; only forwarded to PREPLOGBUFFER here.
 */
static void durThread(ClockSource* cs, int64_t serverStartMs) {
    Client::initThread("durability");

    log() << "Durability thread started";

    // Best effort: if the partition check throws, keep the default of "same partition".
    bool samePartition = true;
    try {
        const std::string dbpathDir = boost::filesystem::path(storageGlobalParams.dbpath).string();
        samePartition = onSamePartition(getJournalDir().string(), dbpathDir);
    } catch (...) {
        // Deliberately ignored; samePartition stays true.
    }

    // Spawn the journal writer thread
    JournalWriter journalWriter(&commitNotify, &applyToDataFilesNotify, NumAsyncJournalWrites);
    journalWriter.start();

    // Used as an estimate of how much / how fast to remap
    uint64_t commitCounter(0);
    uint64_t estimatedPrivateMapSize(0);
    uint64_t remapLastTimestamp(0);

    while (shutdownRequested.loadRelaxed() == 0) {
        // A configured interval of 0 means "use the default", which depends on whether the
        // journal and the data files share a partition.
        unsigned ms = storageGlobalParams.journalCommitIntervalMs;
        if (ms == 0) {
            ms = samePartition ? 100 : 30;
        }

        // +1 so it never goes down to zero
        const int64_t oneThird = (ms / 3) + 1;

        // Reset the stats based on the reset interval
        if (stats.curr()->getCurrentDurationMillis() > DurStatsResetIntervalMillis) {
            stats.reset();
        }

        try {
            stdx::unique_lock<stdx::mutex> lock(flushMutex);

            // Wait for up to three slices of oneThird milliseconds, bailing out early as soon
            // as there is a reason to commit right away.
            for (unsigned i = 0; i <= 2; i++) {
                if (stdx::cv_status::no_timeout == flushRequested.wait_for(lock, Milliseconds(oneThird).toSystemDuration())) {
                    // Someone forced a flush
                    break;
                }

                if (commitNotify.nWaiting()) {
                    // One or more getLastError j:true is pending
                    break;
                }

                if (commitJob.bytes() > UncommittedBytesLimit / 2) {
                    // The number of written bytes is growing
                    break;
                }
            }

            // The commit logic itself
            LOG(4) << "groupCommit begin";

            // Times the whole commit; feeds both the _commitsMicros and the
            // _commitsInWriteLockMicros counters below.
            Timer t;

            const ServiceContext::UniqueOperationContext txnPtr = cc().makeOperationContext();
            OperationContext& txn = *txnPtr;
            AutoAcquireFlushLockForMMAPV1Commit autoFlushLock(txn.lockState());

            // We need to snapshot the commitNumber after the flush lock has been obtained,
            // because at this point we know that we have a stable snapshot of the data.
            const CommitNotifier::When commitNumber(commitNotify.now());

            LOG(4) << "Processing commit number " << commitNumber;

            if (!commitJob.hasWritten()) {
                // We do not need the journal lock anymore. Free it here, for the really
                // unlikely possibility that the writeBuffer command below blocks.
                autoFlushLock.release();

                // A getlasterror request could have come in after the data was already
                // committed. No need to call committingReset though, because we have not done
                // any writes (hasWritten == false).
                JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
                buffer->setNoop();
                buffer->journalListenerToken = getJournalListener()->getToken();

                journalWriter.writeBuffer(buffer, commitNumber);
            } else {
                // This copies all the in-memory changes into the journal writer's buffer.
                JournalWriter::Buffer* const buffer = journalWriter.newBuffer();
                PREPLOGBUFFER(buffer->getHeader(), buffer->getBuilder(), cs, serverStartMs);

                estimatedPrivateMapSize += commitJob.bytes();
                commitCounter++;

                // Now that the write intents have been copied to the buffer, the commit job is
                // free to be reused. We need to reset the commit job's contents while under
                // the S flush lock, because otherwise someone might have done a write and this
                // would wipe out their changes without ever being committed.
                commitJob.committingReset();

                double systemMemoryPressurePercentage = ProcessInfo::getSystemMemoryPressurePercentage();

                // Now that the in-memory modifications have been collected, we can potentially
                // release the flush lock if remap is not necessary.
                // When we remap due to memory pressure, we look at two criteria
                // 1. If the amount of 4k pages touched exceeds 512 MB,
                //    a reasonable estimate of memory pressure on Linux.
                // 2. Check if the amount of free memory on the machine is running low,
                //    since #1 underestimates the memory pressure on Windows, which
                //    commits in 64MB chunks.
                //
                // In addition, a remap is forced every NumCommitsBeforeRemap commits and
                // whenever the JournalAlwaysRemap option is set.
                const bool shouldRemap = (estimatedPrivateMapSize >= UncommittedBytesLimit) || (systemMemoryPressurePercentage > 0.0) || (commitCounter % NumCommitsBeforeRemap == 0) || (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap);

                double remapFraction = 0.0;

                if (shouldRemap) {
                    // We want to remap all private views about every 2 seconds. There could be
                    // ~1000 views so we do a little each pass. There will be copy on write
                    // faults after remapping, so doing a little bit at a time will avoid big
                    // load spikes when the pages are touched.
                    //
                    // TODO: Instead of the time-based logic above, consider using ProcessInfo
                    // and watching for getResidentSize to drop, which is more precise.
                    //
                    // NOTE(review): the resulting fraction is not clamped to 1 here, although
                    // remapPrivateView documents its parameter as (0, 1] -- confirm that
                    // remapPrivateViewImpl tolerates larger values.
                    remapFraction = (curTimeMicros64() - remapLastTimestamp) / 2000000.0;

                    if (mmapv1GlobalOptions.journalOptions & MMAPV1Options::JournalAlwaysRemap) {
                        remapFraction = 1;
                    } else {
                        // We don't want to get close to the UncommittedBytesLimit
                        const double remapMemFraction = estimatedPrivateMapSize / ((double)UncommittedBytesLimit);

                        // Take the largest of the time-based, size-based and memory-pressure
                        // based fractions.
                        remapFraction = std::max(remapMemFraction, remapFraction);
                        remapFraction = std::max(systemMemoryPressurePercentage, remapFraction);
                    }
                } else {
                    LOG(4) << "Early release flush lock";

                    // We will not be doing a remap so drop the flush lock. That way we will be
                    // doing the journal I/O outside of lock, so other threads can proceed.
                    invariant(!shouldRemap);
                    autoFlushLock.release();
                }

                buffer->journalListenerToken = getJournalListener()->getToken();
                // Request async I/O to the journal. This may block.
                journalWriter.writeBuffer(buffer, commitNumber);

                // Data has now been written to the shared view. If remap was requested, we
                // would still be holding the S flush lock here, so just upgrade it and
                // perform the remap.
                if (shouldRemap) {
                    // Need to wait for the previously scheduled journal writes to complete
                    // before any remap is attempted.
                    journalWriter.flush();
                    journalWriter.assertIdle();

                    // Upgrading the journal lock to flush stops all activity on the system,
                    // because we will be remapping memory and we don't want readers to be
                    // accessing it. Technically this step could be avoided on systems which
                    // support atomic remap.
                    autoFlushLock.upgradeFlushLockToExclusive();
                    remapPrivateView(remapFraction);

                    autoFlushLock.release();

                    // Reset the private map estimate outside of the lock
                    estimatedPrivateMapSize = 0;
                    remapLastTimestamp = curTimeMicros64();

                    stats.curr()->_commitsInWriteLock++;
                    stats.curr()->_commitsInWriteLockMicros += t.micros();
                }
            }

            stats.curr()->_commits++;
            stats.curr()->_commitsMicros += t.micros();

            LOG(4) << "groupCommit end";
        } catch (DBException& e) {
            severe() << "dbexception in durThread causing immediate shutdown: " << e.toString();
            invariant(false);
        } catch (std::ios_base::failure& e) {
            severe() << "ios_base exception in durThread causing immediate shutdown: " << e.what();
            invariant(false);
        } catch (std::bad_alloc& e) {
            severe() << "bad_alloc exception in durThread causing immediate shutdown: " << e.what();
            invariant(false);
        } catch (std::exception& e) {
            severe() << "exception in durThread causing immediate shutdown: " << e.what();
            invariant(false);
        } catch (...) {
            severe() << "unhandled exception in durThread causing immediate shutdown";
            invariant(false);
        }
    }

    // Stops the journal thread and ensures everything was written
    invariant(!commitJob.hasWritten());

    journalWriter.flush();
    journalWriter.shutdown();

    log() << "Durability thread stopped";
}