void WiredTigerOplogManager::waitForAllEarlierOplogWritesToBeVisible( const WiredTigerRecordStore* oplogRecordStore, OperationContext* opCtx) { invariant(opCtx->lockState()->isNoop() || !opCtx->lockState()->inAWriteUnitOfWork()); // In order to reliably detect rollback situations, we need to fetch the latestVisibleTimestamp // prior to querying the end of the oplog. auto currentLatestVisibleTimestamp = getOplogReadTimestamp(); // Procedure: issue a read on a reverse cursor (which is not subject to the oplog visibility // rules), see what is last, and wait for that to become visible. std::unique_ptr<SeekableRecordCursor> cursor = oplogRecordStore->getCursor(opCtx, false /* false = reverse cursor */); auto lastRecord = cursor->next(); if (!lastRecord) { LOG(2) << "Trying to query an empty oplog"; opCtx->recoveryUnit()->abandonSnapshot(); return; } const auto waitingFor = lastRecord->id; // Close transaction before we wait. opCtx->recoveryUnit()->abandonSnapshot(); stdx::unique_lock<stdx::mutex> lk(_oplogVisibilityStateMutex); // Prevent any scheduled journal flushes from being delayed and blocking this wait excessively. _opsWaitingForVisibility++; invariant(_opsWaitingForVisibility > 0); auto exitGuard = MakeGuard([&] { _opsWaitingForVisibility--; }); opCtx->waitForConditionOrInterrupt(_opsBecameVisibleCV, lk, [&] { auto newLatestVisibleTimestamp = getOplogReadTimestamp(); if (newLatestVisibleTimestamp < currentLatestVisibleTimestamp) { LOG(1) << "oplog latest visible timestamp went backwards"; // If the visibility went backwards, this means a rollback occurred. // Thus, we are finished waiting. return true; } currentLatestVisibleTimestamp = newLatestVisibleTimestamp; RecordId latestVisible = RecordId(currentLatestVisibleTimestamp); if (latestVisible < waitingFor) { LOG(2) << "Operation is waiting for " << waitingFor << "; latestVisible is " << currentLatestVisibleTimestamp; } return latestVisible >= waitingFor; }); }
void WiredTigerOplogManager::waitForAllEarlierOplogWritesToBeVisible( const WiredTigerRecordStore* oplogRecordStore, OperationContext* opCtx) const { invariant(opCtx->lockState()->isNoop() || !opCtx->lockState()->inAWriteUnitOfWork()); // In order to reliably detect rollback situations, we need to fetch the latestVisibleTimestamp // prior to querying the end of the oplog. auto currentLatestVisibleTimestamp = getOplogReadTimestamp(); // Procedure: issue a read on a reverse cursor (which is not subject to the oplog visibility // rules), see what is last, and wait for that to become visible. std::unique_ptr<SeekableRecordCursor> cursor = oplogRecordStore->getCursor(opCtx, false /* false = reverse cursor */); auto lastRecord = cursor->next(); if (!lastRecord) { LOG(2) << "Trying to query an empty oplog"; opCtx->recoveryUnit()->abandonSnapshot(); return; } const auto waitingFor = lastRecord->id; // Close transaction before we wait. opCtx->recoveryUnit()->abandonSnapshot(); stdx::unique_lock<stdx::mutex> lk(_oplogVisibilityStateMutex); opCtx->waitForConditionOrInterrupt(_opsBecameVisibleCV, lk, [&] { auto newLatestVisibleTimestamp = getOplogReadTimestamp(); if (newLatestVisibleTimestamp < currentLatestVisibleTimestamp) { LOG(1) << "oplog latest visible timestamp went backwards"; // If the visibility went backwards, this means a rollback occurred. // Thus, we are finished waiting. return true; } currentLatestVisibleTimestamp = newLatestVisibleTimestamp; // currentLatestVisibleTimestamp might be Timestamp "1" if there are no oplog documents // inserted since the last mongod restart. In this case, we need to simulate what timestamp // the last oplog document had when it was written, which is the _oplogMaxAtStartup value. RecordId latestVisible = std::max(RecordId(currentLatestVisibleTimestamp), _oplogMaxAtStartup); if (latestVisible < waitingFor) { LOG(2) << "Operation is waiting for " << waitingFor << "; latestVisible is " << currentLatestVisibleTimestamp << " oplogMaxAtStartup is " << _oplogMaxAtStartup; } return latestVisible >= waitingFor; }); }
void WiredTigerOplogManager::_oplogJournalThreadLoop(WiredTigerSessionCache* sessionCache, WiredTigerRecordStore* oplogRecordStore, const bool updateOldestTimestamp) noexcept { Client::initThread("WTOplogJournalThread"); // This thread updates the oplog read timestamp, the timestamp used to read from the oplog with // forward cursors. The timestamp is used to hide oplog entries that might be committed but // have uncommitted entries ahead of them. while (true) { stdx::unique_lock<stdx::mutex> lk(_oplogVisibilityStateMutex); { MONGO_IDLE_THREAD_BLOCK; _opsWaitingForJournalCV.wait(lk, [&] { return _shuttingDown || _opsWaitingForJournal; }); // If we're not shutting down and nobody is actively waiting for the oplog to become // durable, delay journaling a bit to reduce the sync rate. auto journalDelay = Milliseconds(storageGlobalParams.journalCommitIntervalMs.load()); if (journalDelay == Milliseconds(0)) { journalDelay = Milliseconds(WiredTigerKVEngine::kDefaultJournalDelayMillis); } auto now = Date_t::now(); auto deadline = now + journalDelay; auto shouldSyncOpsWaitingForJournal = [&] { return _shuttingDown || _opsWaitingForVisibility || oplogRecordStore->haveCappedWaiters(); }; // Eventually it would be more optimal to merge this with the normal journal flushing // and block for either oplog tailers or operations waiting for oplog visibility. For // now this loop will poll once a millisecond up to the journalDelay to see if we have // any waiters yet. This reduces sync-related I/O on the primary when secondaries are // lagged, but will avoid significant delays in confirming majority writes on replica // sets with infrequent writes. // Callers of waitForAllEarlierOplogWritesToBeVisible() like causally consistent reads // will preempt this delay. while (now < deadline && !_opsWaitingForJournalCV.wait_until( lk, now.toSystemTimePoint(), shouldSyncOpsWaitingForJournal)) { now += Milliseconds(1); } } while (!_shuttingDown && MONGO_FAIL_POINT(WTPausePrimaryOplogDurabilityLoop)) { lk.unlock(); sleepmillis(10); lk.lock(); } if (_shuttingDown) { log() << "oplog journal thread loop shutting down"; return; } invariant(_opsWaitingForJournal); _opsWaitingForJournal = false; lk.unlock(); const uint64_t newTimestamp = fetchAllCommittedValue(sessionCache->conn()); // The newTimestamp may actually go backward during secondary batch application, // where we commit data file changes separately from oplog changes, so ignore // a non-incrementing timestamp. if (newTimestamp <= _oplogReadTimestamp.load()) { LOG(2) << "no new oplog entries were made visible: " << newTimestamp; continue; } // In order to avoid oplog holes after an unclean shutdown, we must ensure this proposed // oplog read timestamp's documents are durable before publishing that timestamp. sessionCache->waitUntilDurable(/*forceCheckpoint=*/false, false); lk.lock(); // Publish the new timestamp value. Avoid going backward. auto oldTimestamp = getOplogReadTimestamp(); if (newTimestamp > oldTimestamp) { _setOplogReadTimestamp(lk, newTimestamp); } lk.unlock(); if (updateOldestTimestamp) { const bool force = false; sessionCache->getKVEngine()->setOldestTimestamp(Timestamp(newTimestamp), force); } // Wake up any await_data cursors and tell them more data might be visible now. oplogRecordStore->notifyCappedWaitersIfNeeded(); } }