// takes an entry that was written by _logTransactionOps
// and applies them to collections
//
// TODO: possibly improve performance of this. We create and destroy a
// context for each operation. Find a way to amortize it out if necessary
//
void applyTransactionFromOplog(BSONObj entry) {
    bool transactionAlreadyApplied = entry["a"].Bool();
    if (!transactionAlreadyApplied) {
        Client::Transaction transaction(DB_SERIALIZABLE);
        if (entry.hasElement("ref")) {
            applyRefOp(entry);
        } else if (entry.hasElement("ops")) {
            applyOps(entry["ops"].Array());
        } else {
            verify(0);
        }
        // set the applied bool to true, to let the oplog know that
        // this entry has been applied to collections
        BSONElementManipulator(entry["a"]).setBool(true);
        {
            LOCK_REASON(lockReason, "repl: setting oplog entry's applied bit");
            Lock::DBRead lk1("local", lockReason);
            writeEntryToOplog(entry, false);
        }
        // If this code fails, it is impossible to recover from
        // because we don't know if the transaction successfully committed
        // so we might as well crash
        // There is currently no known way this code can throw an exception
        try {
            // we are operating as a secondary. We don't have to fsync
            transaction.commit(DB_TXN_NOSYNC);
        }
        catch (std::exception &e) {
            log() << "exception during commit of applyTransactionFromOplog, aborting system: " << e.what() << endl;
            printStackTrace();
            logflush();
            ::abort();
        }
    }
}
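// Illustrative only (inferred from the fields read above, not taken verbatim from the
// source): entries handled by applyTransactionFromOplog come in two shapes. A transaction
// whose operations fit inline carries them in an "ops" array; a larger transaction is
// spilled to the oplog.refs collection and referenced by OID:
//
//   { ..., "a" : false, "ops" : [ <op>, <op>, ... ] }   // inline ops, applied via applyOps()
//   { ..., "a" : false, "ref" : ObjectId("...") }       // spilled ops, applied via applyRefOp()
//
// The "a" (applied) flag is flipped to true and the entry rewritten to the oplog once the
// operations have been applied, so a replay after a crash can skip it.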
// find all oplog entries for a given OID in the oplog.refs collection and apply them
// TODO this should be a range query on oplog.refs where _id.oid == oid and applyOps to
// each entry found. The locking of the query interleaved with the locking in the applyOps
// did not work, so it is a sequence of point queries.
// TODO verify that the query plan is an indexed lookup.
// TODO verify that the query plan does not fetch too many docs and then only process one of them.
void applyRefOp(BSONObj entry) {
    OID oid = entry["ref"].OID();
    LOG(3) << "apply ref " << entry << " oid " << oid << endl;
    long long seq = 0;  // note that 0 is smaller than any of the seq numbers
    while (1) {
        BSONObj entry;
        {
            LOCK_REASON(lockReason, "repl: finding oplog.refs entry to apply");
            Client::ReadContext ctx(rsOplogRefs, lockReason);
            // TODO: Should this be using rsOplogRefsDetails, verifying non-null?
            Collection *cl = getCollection(rsOplogRefs);
            if (cl == NULL ||
                !cl->findOne(BSON("_id" << BSON("$gt" << BSON("oid" << oid << "seq" << seq))),
                             entry,
                             true)) {
                break;
            }
        }
        BSONElement e = entry.getFieldDotted("_id.seq");
        seq = e.Long();
        BSONElement eOID = entry.getFieldDotted("_id.oid");
        if (oid != eOID.OID()) {
            break;
        }
        LOG(3) << "apply " << entry << " seq=" << seq << endl;
        applyOps(entry["ops"].Array());
    }
}
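// Sketch of the key layout implied by the queries above (assumed from the code, not
// verified against the schema): each oplog.refs document has a compound _id of the form
// { oid: <transaction OID>, seq: <long long> }, so the loop issues point queries
//   { _id: { $gt: { oid: <oid>, seq: 0 } } }, { _id: { $gt: { oid: <oid>, seq: <last seq> } } }, ...
// and stops as soon as a returned document belongs to a different oid. The range-query form
// described in the TODO would instead scan every document with _id.oid == <oid> in one cursor.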
Status applyCommitTransaction(OperationContext* opCtx,
                              const repl::OplogEntry& entry,
                              repl::OplogApplication::Mode mode) {
    // Return error if run via applyOps command.
    uassert(50987,
            "commitTransaction is only used internally by secondaries.",
            mode != repl::OplogApplication::Mode::kApplyOpsCmd);

    IDLParserErrorContext ctx("commitTransaction");
    auto commitCommand = CommitTransactionOplogObject::parse(ctx, entry.getObject());

    if (mode == repl::OplogApplication::Mode::kRecovering) {
        const auto replCoord = repl::ReplicationCoordinator::get(opCtx);
        const auto recoveryTimestamp = replCoord->getRecoveryTimestamp();
        invariant(recoveryTimestamp);

        // If the commitTimestamp is before the recoveryTimestamp, then the data already
        // reflects the operations from the transaction.
        const auto& commitTimestamp = commitCommand.getCommitTimestamp();
        if (recoveryTimestamp.get() > commitTimestamp) {
            return Status::OK();
        }

        // Get the corresponding prepareTransaction oplog entry.
        TransactionHistoryIterator iter(entry.getOpTime());
        invariant(iter.hasNext());
        const auto commitOplogEntry = iter.next(opCtx);
        invariant(iter.hasNext());
        const auto prepareOplogEntry = iter.next(opCtx);

        // Transform the prepare command into a normal applyOps command.
        const auto prepareCmd = prepareOplogEntry.getOperationToApply().removeField("prepare");

        BSONObjBuilder resultWeDontCareAbout;
        return applyOps(
            opCtx, entry.getNss().db().toString(), prepareCmd, mode, &resultWeDontCareAbout);
    }

    // TODO: SERVER-36492 Only run on secondary until we support initial sync.
    invariant(mode == repl::OplogApplication::Mode::kSecondary);

    // Transaction operations are in their own batch, so we can modify their opCtx.
    invariant(entry.getSessionId());
    invariant(entry.getTxnNumber());
    opCtx->setLogicalSessionId(*entry.getSessionId());
    opCtx->setTxnNumber(*entry.getTxnNumber());

    // The write on the transaction table may be applied concurrently, so refreshing state
    // from disk may read that write, which would start a new transaction on an existing
    // txnNumber. Thus, we start a new transaction without refreshing state from disk.
    MongoDOperationContextSessionWithoutRefresh sessionCheckout(opCtx);

    auto transaction = TransactionParticipant::get(opCtx);
    invariant(transaction);
    transaction->unstashTransactionResources(opCtx, "commitTransaction");
    transaction->commitPreparedTransaction(opCtx, commitCommand.getCommitTimestamp());
    return Status::OK();
}
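// Illustrative only: the prepareTransaction entry located above carries the transaction's
// operations as an applyOps command tagged with a prepare flag, shaped roughly like
//   { "applyOps" : [ <op>, <op>, ... ], "prepare" : true }
// so stripping the "prepare" field (removeField above) leaves an ordinary applyOps command
// that the recovery path can apply directly.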
// Doles out all the work to the writer pool threads and waits for them to complete
void SyncTail::multiApply(std::deque<BSONObj>& ops, MultiSyncApplyFunc applyFunc) {
    // Use a ThreadPool to prefetch all the operations in a batch.
    prefetchOps(ops);

    std::vector< std::vector<BSONObj> > writerVectors(theReplSet->replWriterThreadCount);
    fillWriterVectors(ops, &writerVectors);
    LOG(2) << "replication batch size is " << ops.size() << endl;

    // We must grab this because we're going to grab write locks later.
    // We hold this mutex the entire time we're writing; it doesn't matter
    // because all readers are blocked anyway.
    SimpleMutex::scoped_lock fsynclk(filesLockedFsync);

    // stop all readers until we're done
    Lock::ParallelBatchWriterMode pbwm;

    applyOps(writerVectors, applyFunc);
}
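// The real fillWriterVectors is not shown here; the following is a minimal, self-contained
// sketch (assumed behavior, not the actual implementation) of the idea it relies on:
// partition a batch across writer threads by hashing each op's namespace, so that all
// writes to a given collection land on the same thread and stay in oplog order relative
// to each other. FakeOp and fillWriterVectorsSketch are hypothetical names.
#include <functional>
#include <string>
#include <vector>

struct FakeOp {          // hypothetical stand-in for a BSONObj oplog entry
    std::string ns;      // target namespace, e.g. "test.coll"
};

void fillWriterVectorsSketch(const std::vector<FakeOp>& ops,
                             std::vector<std::vector<FakeOp>>* writerVectors) {
    std::hash<std::string> hasher;
    for (const FakeOp& op : ops) {
        // same namespace -> same hash -> same writer thread
        size_t idx = hasher(op.ns) % writerVectors->size();
        (*writerVectors)[idx].push_back(op);
    }
}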
// Applies a batch of oplog entries by using a set of threads to apply the operations, then
// writes the oplog entries to the local oplog.
OpTime SyncTail::multiApply(OperationContext* txn, const OpQueue& ops) {
    invariant(_applyFunc);

    if (getGlobalServiceContext()->getGlobalStorageEngine()->isMmapV1()) {
        // Use a ThreadPool to prefetch all the operations in a batch.
        prefetchOps(ops.getDeque(), &_prefetcherPool);
    }

    std::vector<std::vector<BSONObj>> writerVectors(replWriterThreadCount);

    fillWriterVectors(txn, ops.getDeque(), &writerVectors);
    LOG(2) << "replication batch size is " << ops.getDeque().size() << endl;

    // We must grab this because we're going to grab write locks later.
    // We hold this mutex the entire time we're writing; it doesn't matter
    // because all readers are blocked anyway.
    stdx::lock_guard<SimpleMutex> fsynclk(filesLockedFsync);

    // stop all readers until we're done
    Lock::ParallelBatchWriterMode pbwm(txn->lockState());

    ReplicationCoordinator* replCoord = getGlobalReplicationCoordinator();
    if (replCoord->getMemberState().primary() && !replCoord->isWaitingForApplierToDrain()) {
        severe() << "attempting to replicate ops while primary";
        fassertFailed(28527);
    }

    applyOps(writerVectors, &_writerPool, _applyFunc, this);

    OpTime lastOpTime;
    {
        ON_BLOCK_EXIT([&] { _writerPool.join(); });
        std::vector<BSONObj> raws;
        raws.reserve(ops.getDeque().size());
        for (auto&& op : ops.getDeque()) {
            raws.emplace_back(op.raw);
        }
        lastOpTime = writeOpsToOplog(txn, raws);
        if (inShutdown()) {
            return OpTime();
        }
    }
    // We have now written all database writes and updated the oplog to match.
    return lastOpTime;
}
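// ON_BLOCK_EXIT above guarantees the writer pool is joined before the enclosing block
// exits, whether writeOpsToOplog returns normally or throws. A minimal sketch of such a
// scope guard (illustrative; not MongoDB's actual ON_BLOCK_EXIT implementation):
#include <utility>

template <typename F>
class ScopeGuard {
public:
    explicit ScopeGuard(F f) : _f(std::move(f)) {}
    ~ScopeGuard() { _f(); }  // runs the callable when the guard leaves scope
    ScopeGuard(const ScopeGuard&) = delete;
    ScopeGuard& operator=(const ScopeGuard&) = delete;

private:
    F _f;
};

// Usage sketch: ScopeGuard joinGuard([&] { pool.join(); });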
// find all oplog entries for a given OID in the oplog.refs collection and apply them
// TODO this should be a range query on oplog.refs where _id.oid == oid and applyOps to
// each entry found. The locking of the query interleaved with the locking in the applyOps
// did not work, so it is a sequence of point queries.
// TODO verify that the query plan is an indexed lookup.
// TODO verify that the query plan does not fetch too many docs and then only process one of them.
void applyRefOp(BSONObj entry) {
    OID oid = entry["ref"].OID();
    LOG(3) << "apply ref " << entry << " oid " << oid << endl;
    long long seq = 0;  // note that 0 is smaller than any of the seq numbers
    while (1) {
        BSONObj entry;
        {
            Client::ReadContext ctx(rsOplogRefs);
            // TODO: Should this be using rsOplogRefsDetails, verifying non-null?
            NamespaceDetails *d = nsdetails(rsOplogRefs);
            if (d == NULL ||
                !d->findOne(BSON("_id" << BSON("$gt" << BSON("oid" << oid << "seq" << seq))),
                            entry,
                            true)) {
                break;
            }
        }
        BSONElement e = entry.getFieldDotted("_id.seq");
        seq = e.Long();
        BSONElement eOID = entry.getFieldDotted("_id.oid");
        if (oid != eOID.OID()) {
            break;
        }
        LOG(3) << "apply " << entry << " seq=" << seq << endl;
        applyOps(entry["ops"].Array());
    }
}