void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 Timestamp oplogApplicationStartPoint,
                                                 Timestamp topOfOplog) {
    // Both ends of the replay window must be real timestamps.
    invariant(!oplogApplicationStartPoint.isNull());
    invariant(!topOfOplog.isNull());

    // Decide whether there are unapplied entries at all. This must happen after the
    // ragged end of the oplog has been truncated.
    if (oplogApplicationStartPoint == topOfOplog) {
        log() << "No oplog entries to apply for recovery. appliedThrough is at the top of the oplog.";
        return;  // Everything in the oplog has already been applied.
    }
    // A start point beyond the newest oplog entry means the oplog is missing data.
    if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
                 << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
        fassertFailedNoTrace(40313);
    }

    log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON()
          << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive).";

    // Walk the oplog forward starting at the applied-through point.
    DBDirectClient directClient(opCtx);
    auto oplogCursor = directClient.query(NamespaceString::kRsOplogNamespace.ns(),
                                          QUERY("ts" << BSON("$gte" << oplogApplicationStartPoint)),
                                          /*batchSize*/ 0,
                                          /*skip*/ 0,
                                          /*projection*/ nullptr,
                                          QueryOption_OplogReplay);

    // The first returned document must be the applied-through entry itself; it is
    // consumed and skipped because it has already been applied.
    if (!oplogCursor->more()) {
        // Should be unreachable: we verified above that topOfOplog is strictly greater
        // than the start point, so at least one matching entry must exist. Reaching this
        // indicates a serious bug in the storage engine or query's OplogReplay support.
        severe() << "Couldn't find any entries in the oplog >= "
                 << oplogApplicationStartPoint.toBSON() << " which should be impossible.";
        fassertFailedNoTrace(40293);
    }
    auto firstTimestampFound =
        fassertStatusOK(40291, OpTime::parseFromOplogEntry(oplogCursor->nextSafe())).getTimestamp();
    if (firstTimestampFound != oplogApplicationStartPoint) {
        severe() << "Oplog entry at " << oplogApplicationStartPoint.toBSON()
                 << " is missing; actual entry found is " << firstTimestampFound.toBSON();
        fassertFailedNoTrace(40292);
    }

    // Apply the remaining entries one at a time. These writes must not be re-logged:
    // they are already present in the oplog.
    UnreplicatedWritesBlock unreplicatedWrites(opCtx);
    while (oplogCursor->more()) {
        auto oplogEntry = oplogCursor->nextSafe();
        fassertStatusOK(
            40294, SyncTail::syncApply(opCtx, oplogEntry, OplogApplication::Mode::kRecovering));
        // Advance appliedThrough after every entry so a crash mid-replay resumes correctly.
        _consistencyMarkers->setAppliedThrough(
            opCtx, fassertStatusOK(40295, OpTime::parseFromOplogEntry(oplogEntry)));
    }
}
void ReplicationRecoveryImpl::_truncateOplogTo(OperationContext* opCtx,
                                               Timestamp truncateTimestamp) {
    // Time the truncation so its duration can be reported below.
    Timer truncationTimer;

    // Take the locks needed to mutate the oplog collection.
    const NamespaceString oplogNss(NamespaceString::kRsOplogNamespace);
    AutoGetDb autoDb(opCtx, oplogNss.db(), MODE_IX);
    Lock::CollectionLock oplogCollectionLoc(opCtx->lockState(), oplogNss.ns(), MODE_X);
    Collection* oplogCollection = autoDb.getDb()->getCollection(opCtx, oplogNss);
    if (!oplogCollection) {
        fassertFailedWithStatusNoTrace(
            34418,
            Status(ErrorCodes::NamespaceNotFound,
                   str::stream() << "Can't find " << NamespaceString::kRsOplogNamespace.ns()));
    }

    // Scan the oplog in reverse (newest to oldest) looking for the first entry whose
    // "ts" is strictly before truncateTimestamp. Everything after that entry is removed.
    RecordId earliestRecordToDelete;  // Non-null once we have seen something to delete.
    auto oplogRs = oplogCollection->getRecordStore();
    auto oplogReverseCursor = oplogRs->getCursor(opCtx, /*forward=*/false);
    size_t entriesExamined = 0;
    while (auto next = oplogReverseCursor->next()) {
        const BSONObj oplogEntry = next->data.releaseToBson();
        const RecordId recordId = next->id;
        entriesExamined++;

        const auto tsElem = oplogEntry["ts"];
        if (entriesExamined == 1) {
            // Log the tail entry for diagnostics on the first iteration only.
            if (tsElem.eoo()) {
                LOG(2) << "Oplog tail entry: " << redact(oplogEntry);
            } else {
                LOG(2) << "Oplog tail entry ts field: " << tsElem;
            }
        }

        if (tsElem.timestamp() >= truncateTimestamp) {
            // Still inside the range to be removed; remember it and keep scanning back.
            earliestRecordToDelete = recordId;
            continue;
        }

        // Found the newest entry that survives. If this was the very first record
        // examined, the whole oplog already precedes truncateTimestamp and there is
        // nothing to delete.
        if (entriesExamined != 1) {
            invariant(!earliestRecordToDelete.isNull());
            oplogCollection->cappedTruncateAfter(opCtx, earliestRecordToDelete, /*inclusive=*/true);
        }
        log() << "Replication recovery oplog truncation finished in: " << truncationTimer.millis()
              << "ms";
        return;
    }

    // Exhausted the oplog without finding any entry older than truncateTimestamp.
    severe() << "Reached end of oplog looking for oplog entry before " << truncateTimestamp.toBSON()
             << " but couldn't find any after looking through " << entriesExamined << " entries.";
    fassertFailedNoTrace(40296);
}
void ReplicationRecoveryImpl::_applyToEndOfOplog(OperationContext* opCtx,
                                                 const Timestamp& oplogApplicationStartPoint,
                                                 const Timestamp& topOfOplog) {
    // Both ends of the replay window must be real timestamps.
    invariant(!oplogApplicationStartPoint.isNull());
    invariant(!topOfOplog.isNull());

    // Decide whether there are unapplied entries at all. This must happen after the
    // ragged end of the oplog has been truncated.
    if (oplogApplicationStartPoint == topOfOplog) {
        log() << "No oplog entries to apply for recovery. Start point is at the top of the oplog.";
        return;  // Everything in the oplog has already been applied.
    }
    // A start point beyond the newest oplog entry means the oplog is missing data.
    if (oplogApplicationStartPoint > topOfOplog) {
        severe() << "Applied op " << oplogApplicationStartPoint.toBSON()
                 << " not found. Top of oplog is " << topOfOplog.toBSON() << '.';
        fassertFailedNoTrace(40313);
    }

    log() << "Replaying stored operations from " << oplogApplicationStartPoint.toBSON()
          << " (exclusive) to " << topOfOplog.toBSON() << " (inclusive).";

    // Feed the local oplog, starting at the recovery point, into an oplog applier.
    OplogBufferLocalOplog oplogBuffer(oplogApplicationStartPoint);
    oplogBuffer.startup(opCtx);

    RecoveryOplogApplierStats applierStats;
    auto writerPool = OplogApplier::makeWriterPool();
    OplogApplier::Options applierOptions;
    applierOptions.allowNamespaceNotFoundErrorsOnCrudOps = true;
    // The entries being replayed are already in the oplog; do not write them again.
    applierOptions.skipWritesToOplog = true;
    // stableTimestampForRecovery is the stable timestamp we replay the oplog from:
    // for startup recovery it is the recovery timestamp the storage engine recovered
    // to; for rollback recovery it is the last stable timestamp returned by
    // recoverToStableTimestamp. It is tracked for prepared transactions so that a
    // commitTransaction entry can be compared against the stable timestamp to decide
    // whether its operations are already reflected in the data.
    applierOptions.stableTimestampForRecovery = oplogApplicationStartPoint;

    OplogApplierImpl oplogApplier(nullptr,
                                  &oplogBuffer,
                                  &applierStats,
                                  nullptr,
                                  _consistencyMarkers,
                                  _storageInterface,
                                  applierOptions,
                                  writerPool.get());

    OplogApplier::BatchLimits batchLimits;
    batchLimits.bytes = OplogApplier::calculateBatchLimitBytes(opCtx, _storageInterface);
    batchLimits.ops = OplogApplier::getBatchLimitOperations();

    // Drain the buffer batch by batch, remembering the optime of the last applied op.
    OpTime applyThroughOpTime;
    for (;;) {
        OplogApplier::Operations operationBatch =
            fassert(50763, oplogApplier.getNextApplierBatch(opCtx, batchLimits));
        if (operationBatch.empty()) {
            break;
        }
        applyThroughOpTime = uassertStatusOK(oplogApplier.multiApply(opCtx, std::move(operationBatch)));
    }
    applierStats.complete(applyThroughOpTime);

    // Replay must have consumed the entire buffer and reached the top of the oplog.
    invariant(oplogBuffer.isEmpty(),
              str::stream() << "Oplog buffer not empty after applying operations. Last operation "
                               "applied with optime: "
                            << applyThroughOpTime.toBSON());
    invariant(applyThroughOpTime.getTimestamp() == topOfOplog,
              str::stream() << "Did not apply to top of oplog. Applied through: "
                            << applyThroughOpTime.toString()
                            << ". Top of oplog: "
                            << topOfOplog.toString());
    oplogBuffer.shutdown(opCtx);

    // A crash before setting appliedThrough is safe: with a stable checkpoint we recover
    // to a replication-consistent point and can reapply the oplog; without one we must be
    // in startup recovery from an unstable checkpoint, which applies only a single batch
    // that is safe to replay from any point.
    _consistencyMarkers->setAppliedThrough(opCtx, applyThroughOpTime);
}