/**
 * Applies an 'abortTransaction' oplog entry.
 *
 * Throws (uassert 50972) if called in kApplyOpsCmd mode. In kRecovering and kInitialSync modes
 * this is a no-op that returns Status::OK(). Every other mode must be kSecondary, in which case
 * the active transaction identified by the entry's session id and transaction number is aborted.
 */
Status applyAbortTransaction(OperationContext* opCtx,
                             const OplogEntry& entry,
                             repl::OplogApplication::Mode mode) {
    using ApplicationMode = repl::OplogApplication::Mode;

    // This entry type must never arrive through the applyOps command.
    uassert(50972,
            "abortTransaction is only used internally by secondaries.",
            mode != ApplicationMode::kApplyOpsCmd);

    // During recovery and initial sync, transactions are only put into the prepared state once
    // those phases finish, so at this point there is nothing to abort.
    const bool nothingToAbort =
        (mode == ApplicationMode::kRecovering) || (mode == ApplicationMode::kInitialSync);
    if (nothingToAbort) {
        return Status::OK();
    }

    invariant(mode == ApplicationMode::kSecondary);

    // A transaction operation is applied in a batch of its own, which makes it safe to rebind
    // the operation context to the transaction's session and transaction number.
    const auto& sessionId = entry.getSessionId();
    invariant(sessionId);
    const auto& txnNumber = entry.getTxnNumber();
    invariant(txnNumber);
    opCtx->setLogicalSessionId(*sessionId);
    opCtx->setTxnNumber(*txnNumber);

    // The transaction table write may be applied concurrently. Refreshing from disk could
    // observe that write and then try to start a new transaction on an already existing
    // txnNumber, so the session is checked out without a refresh.
    MongoDOperationContextSessionWithoutRefresh sessionCheckout(opCtx);

    auto txnParticipant = TransactionParticipant::get(opCtx);
    txnParticipant.unstashTransactionResources(opCtx, "abortTransaction");
    txnParticipant.abortActiveTransaction(opCtx);

    return Status::OK();
}
/**
 * Checks that the first document of 'queryResponse' is an oplog entry whose optime equals
 * '_requiredOpTime'.
 *
 * Returns NoMatchingDocument when the response holds no documents, BadValue when the optime or
 * its term differs from the required one, and Status::OK() otherwise.
 */
Status SyncSourceResolver::_compareRequiredOpTimeWithQueryResponse(
    const Fetcher::QueryResponse& queryResponse) {
    const auto& remoteDocs = queryResponse.documents;
    if (remoteDocs.empty()) {
        return Status(
            ErrorCodes::NoMatchingDocument,
            "remote oplog does not contain entry with optime matching our required optime");
    }

    // Only the first returned document is inspected.
    const OplogEntry remoteEntry(remoteDocs.front());
    const auto remoteOpTime = remoteEntry.getOpTime();

    if (_requiredOpTime != remoteOpTime) {
        return Status(ErrorCodes::BadValue,
                      str::stream() << "remote oplog contain entry with matching timestamp "
                                    << remoteOpTime.getTimestamp().toString() << " but optime "
                                    << remoteOpTime.toString()
                                    << " does not match our required optime");
    }

    // NOTE(review): if OpTime inequality already takes the term into account, this branch can
    // never be taken once the check above has passed -- confirm against OpTime's operators.
    if (_requiredOpTime.getTerm() != remoteOpTime.getTerm()) {
        return Status(ErrorCodes::BadValue,
                      str::stream() << "remote oplog contain entry with term "
                                    << remoteOpTime.getTerm()
                                    << " that does not match the term in our required optime");
    }

    return Status::OK();
}
// prefetch for an oplog operation void prefetchPagesForReplicatedOp(OperationContext* opCtx, Database* db, const OplogEntry& oplogEntry) { invariant(db); const ReplSettings::IndexPrefetchConfig prefetchConfig = ReplicationCoordinator::get(opCtx)->getIndexPrefetchConfig(); // Prefetch ignores non-CRUD operations. if (!oplogEntry.isCrudOpType()) { return; } // This will have to change for engines other than MMAP V1, because they might not have // means for directly prefetching pages from the collection. For this purpose, acquire S // lock on the database, instead of optimizing with IS. const auto& nss = oplogEntry.getNamespace(); Lock::CollectionLock collLock(opCtx->lockState(), nss.ns(), MODE_S); Collection* collection = db->getCollection(opCtx, nss); if (!collection) { return; } auto opType = oplogEntry.getOpType(); LOG(4) << "index prefetch for op " << OpType_serializer(opType); // should we prefetch index pages on updates? if the update is in-place and doesn't change // indexed values, it is actually slower - a lot slower if there are a dozen indexes or // lots of multikeys. possible variations (not all mutually exclusive): // 1) current behavior: full prefetch // 2) don't do it for updates // 3) don't do multikey indexes for updates // 4) don't prefetchIndexPages on some heuristic; e.g., if it's an $inc. // 5) if not prefetching index pages (#2), we should do it if we are upsertings and it // will be an insert. to do that we could do the prefetchRecordPage first and if DNE // then we do #1. // // note that on deletes 'obj' does not have all the keys we would want to prefetch on. // a way to achieve that would be to prefetch the record first, and then afterwards do // this part. 
// auto obj = oplogEntry.getOperationToApply(); invariant(!obj.isEmpty()); prefetchIndexPages(opCtx, collection, prefetchConfig, obj); // do not prefetch the data for inserts; it doesn't exist yet // // we should consider doing the record prefetch for the delete op case as we hit the record // when we delete. note if done we only want to touch the first page. // // update: do record prefetch. if ((opType == OpTypeEnum::kUpdate) && // do not prefetch the data for capped collections because // they typically do not have an _id index for findById() to use. !collection->isCapped()) { prefetchRecordPages(opCtx, db, nss.ns().c_str(), obj); } }
/**
 * Collects, in chronological order, the operations of the transaction that ends with the
 * 'commitOrPrepare' oplog entry.
 *
 * 'cachedOps' holds the entries of this transaction that belong to the same oplog application
 * batch as 'commitOrPrepare' and therefore may not have been written to the oplog yet; they are
 * ordered by increasing timestamp. Earlier entries are read back from the oplog through the
 * transaction's previous-write chain.
 */
repl::MultiApplier::Operations readTransactionOperationsFromOplogChain(
    OperationContext* opCtx,
    const OplogEntry& commitOrPrepare,
    const std::vector<OplogEntry*> cachedOps) {
    repl::MultiApplier::Operations ops;

    auto currentOpTime = commitOrPrepare.getOpTime();

    // The newest entry of this transaction expected to already be in the oplog is the one that
    // precedes the first cached op or, when nothing is cached, the one that precedes the commit
    // or prepare entry itself.
    const OplogEntry& oldestUnwrittenEntry =
        cachedOps.empty() ? commitOrPrepare : *cachedOps.front();
    const auto lastEntryOpTime = oldestUnwrittenEntry.getPrevWriteOpTimeInTransaction();
    invariant(lastEntryOpTime < currentOpTime);

    TransactionHistoryIterator historyIter(lastEntryOpTime.get());

    // A prepare may be empty, a commit may not.
    invariant(commitOrPrepare.getCommandType() != OplogEntry::CommandType::kCommitTransaction ||
              !cachedOps.empty() || historyIter.hasNext());

    auto commitOrPrepareObj = commitOrPrepare.toBSON();

    // Phase 1: entries already in the oplog. The chain is walked from newest to oldest.
    while (historyIter.hasNext()) {
        const auto& partialEntry = historyIter.next(opCtx);
        invariant(partialEntry.isPartialTransaction());

        const auto firstNewOpIndex = ops.size();
        _reconstructPartialTxnEntryAtGivenTime(partialEntry, commitOrPrepareObj, &ops);

        // Neither the size of a BSONArray nor the number of oplog entries in a transaction can
        // be known without iterating. Flipping the ops of each applyOps here and flipping the
        // whole vector once after the loop is therefore about the best available way to end up
        // in chronological order; reversing a vector of BSON objects is just pointer copies.
        std::reverse(ops.begin() + firstNewOpIndex, ops.end());
    }
    std::reverse(ops.begin(), ops.end());

    // Phase 2: entries from the current batch, which are already in increasing timestamp order.
    for (const OplogEntry* batchedEntry : cachedOps) {
        invariant(batchedEntry->isPartialTransaction());
        _reconstructPartialTxnEntryAtGivenTime(*batchedEntry, commitOrPrepareObj, &ops);
    }

    return ops;
}
bool FieldUpdateSubscription::matchesWithEntry(const OplogEntry& entry) const { if (!BasicOperationSubscription::matchesWithEntry(entry)) { return false; } mongo::BSONObj updateDoc = entry.getUpdateDocument(); for (std::vector<MongoUpdateLogOperation>::const_iterator iter = subscribedOperations.begin(); iter != subscribedOperations.end() ; iter++) { std::string operationString = stringForUpdateOperation(*iter); if (!operationString.empty()) { if (updateDoc.hasField(operationString)) { mongo::BSONObj operationObj = updateDoc.getObjectField(operationString.c_str()); if (operationObj.hasField(fieldName)) { return true; } } } else if (updateDoc.hasField(fieldName)) { return true; } } return false; }
/**
 * Applies a 'commitTransaction' oplog entry.
 *
 * Throws (uassert 50987) if called in kApplyOpsCmd mode. Entries whose 'prepared' field is
 * explicitly false are ignored. In kRecovering and kInitialSync modes the transaction is applied
 * from its oplog chain; every other mode must be kSecondary, in which case the prepared
 * transaction identified by the entry's session id and transaction number is committed.
 */
Status applyCommitTransaction(OperationContext* opCtx,
                              const OplogEntry& entry,
                              repl::OplogApplication::Mode mode) {
    using ApplicationMode = repl::OplogApplication::Mode;

    // This entry type must never arrive through the applyOps command.
    uassert(50987,
            "commitTransaction is only used internally by secondaries.",
            mode != ApplicationMode::kApplyOpsCmd);

    IDLParserErrorContext parserCtx("commitTransaction");
    auto commitOpTime = entry.getOpTime();
    auto commitCommand = CommitTransactionOplogObject::parse(parserCtx, entry.getObject());

    // A missing 'prepared' field is treated the same as 'prepared: true'.
    const bool isPrepared = !commitCommand.getPrepared() || *commitCommand.getPrepared();
    if (!isPrepared) {
        return Status::OK();
    }

    const auto& commitTimestamp = commitCommand.getCommitTimestamp();
    invariant(commitTimestamp);

    const bool replayFromOplogChain =
        (mode == ApplicationMode::kRecovering) || (mode == ApplicationMode::kInitialSync);
    if (replayFromOplogChain) {
        return _applyTransactionFromOplogChain(
            opCtx, entry, mode, *commitTimestamp, commitOpTime.getTimestamp());
    }

    invariant(mode == ApplicationMode::kSecondary);

    // A transaction operation is applied in a batch of its own, which makes it safe to rebind
    // the operation context to the transaction's session and transaction number.
    const auto& sessionId = entry.getSessionId();
    invariant(sessionId);
    const auto& txnNumber = entry.getTxnNumber();
    invariant(txnNumber);
    opCtx->setLogicalSessionId(*sessionId);
    opCtx->setTxnNumber(*txnNumber);

    // The transaction table write may be applied concurrently. Refreshing from disk could
    // observe that write and then try to start a new transaction on an already existing
    // txnNumber, so the session is checked out without a refresh.
    MongoDOperationContextSessionWithoutRefresh sessionCheckout(opCtx);

    auto txnParticipant = TransactionParticipant::get(opCtx);
    invariant(txnParticipant);
    txnParticipant.unstashTransactionResources(opCtx, "commitTransaction");
    txnParticipant.commitPreparedTransaction(opCtx, *commitTimestamp, commitOpTime);

    return Status::OK();
}
/**
 * Returns true when the operation reported by 'entry' equals the operation this subscription
 * was set up for.
 */
bool BasicOperationSubscription::matchesWithEntry(const OplogEntry& entry) const {
    const bool isSubscribedOperation = (entry.getOperation() == operation);
    return isSubscribedOperation;
}
StatusWith<std::set<NamespaceString>> RollbackImpl::_namespacesForOp(const OplogEntry& oplogEntry) { NamespaceString opNss = oplogEntry.getNamespace(); OpTypeEnum opType = oplogEntry.getOpType(); std::set<NamespaceString> namespaces; // No namespaces for a no-op. if (opType == OpTypeEnum::kNoop) { return std::set<NamespaceString>(); } // CRUD ops have the proper namespace in the operation 'ns' field. if (opType == OpTypeEnum::kInsert || opType == OpTypeEnum::kUpdate || opType == OpTypeEnum::kDelete) { return std::set<NamespaceString>({opNss}); } // If the operation is a command, then we need to extract the appropriate namespaces from the // command object, as opposed to just using the 'ns' field of the oplog entry itself. if (opType == OpTypeEnum::kCommand) { auto obj = oplogEntry.getObject(); auto firstElem = obj.firstElement(); // Does not handle 'applyOps' entries. invariant(oplogEntry.getCommandType() != OplogEntry::CommandType::kApplyOps, "_namespacesForOp does not handle 'applyOps' oplog entries."); switch (oplogEntry.getCommandType()) { case OplogEntry::CommandType::kRenameCollection: { // Add both the 'from' and 'to' namespaces. namespaces.insert(NamespaceString(firstElem.valuestrsafe())); namespaces.insert(NamespaceString(obj.getStringField("to"))); break; } case OplogEntry::CommandType::kDropDatabase: { // There is no specific namespace to save for a drop database operation. break; } case OplogEntry::CommandType::kDbCheck: case OplogEntry::CommandType::kConvertToCapped: case OplogEntry::CommandType::kEmptyCapped: { // These commands do not need to be supported by rollback. 'convertToCapped' should // always be converted to lower level DDL operations, and 'emptycapped' is a // testing-only command. 
std::string message = str::stream() << "Encountered unsupported command type '" << firstElem.fieldName() << "' during rollback."; return Status(ErrorCodes::UnrecoverableRollbackError, message); } case OplogEntry::CommandType::kCreate: case OplogEntry::CommandType::kDrop: case OplogEntry::CommandType::kCreateIndexes: case OplogEntry::CommandType::kDropIndexes: case OplogEntry::CommandType::kCollMod: { // For all other command types, we should be able to parse the collection name from // the first command argument. try { auto cmdNss = CommandHelpers::parseNsCollectionRequired(opNss.db(), obj); namespaces.insert(cmdNss); } catch (const DBException& ex) { return ex.toStatus(); } break; } case OplogEntry::CommandType::kApplyOps: default: // Every possible command type should be handled above. MONGO_UNREACHABLE } }