/*
 * ExtendSUBTRANS
 *
 * Guarantee that SUBTRANS has a page available for a freshly assigned XID.
 *
 * NB: the caller holds XidGenLock, so the common path has to be cheap.  Even
 * the slow path performs no I/O unless a dirty subtrans page must be written
 * out to make room in shared memory.
 */
void
ExtendSUBTRANS(TransactionId newestXact)
{
	/*
	 * Caller must have already taken mirrored lock shared.
	 */

	/*
	 * Only the first XID that maps onto a page has any work to do.  Right
	 * after wraparound, the first XID on page zero is
	 * FirstNormalTransactionId rather than entry zero.
	 */
	if (TransactionIdToEntry(newestXact) == 0 ||
		TransactionIdEquals(newestXact, FirstNormalTransactionId))
	{
		int			newpage = TransactionIdToPage(newestXact);

		LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);

		/* Start the new page out as all zeroes */
		ZeroSUBTRANSPage(newpage);

		LWLockRelease(SubtransControlLock);
	}
}
/*
 * XactLockTableWait
 *
 * Block until the given transaction has committed or aborted.
 *
 * This is done by requesting ShareLock on the transaction's lock tag and
 * dropping it again as soon as it is granted.
 */
void
XactLockTableWait(TransactionId xid)
{
	TransactionId waiter = GetCurrentTransactionId();
	LOCKTAG		locktag;

	Assert(!TransactionIdEquals(xid, waiter));

	/* Build the lock tag identifying the target transaction */
	MemSet(&locktag, 0, sizeof(locktag));
	locktag.relId = XactLockTableId;
	locktag.dbId = InvalidOid;
	locktag.objId.xid = xid;

	if (!LockAcquire(LockTableId, &locktag, waiter, ShareLock, false))
		elog(ERROR, "LockAcquire failed");

	LockRelease(LockTableId, &locktag, waiter, ShareLock);

	/*
	 * The transaction committed, aborted or crashed.  If pg_clog still shows
	 * it as running (neither committed nor aborted), mark it aborted.
	 */
	if (!(TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)))
		TransactionIdAbort(xid);
}
/*
 * TransactionIdGetCommitLSN
 *
 * Return an LSN late enough that flushing WAL up to it guarantees the
 * transaction's commit record is on disk.
 *
 * The value is not necessarily the exact LSN of the commit record.  For
 * long-past transactions (clog pages already migrated to disk) the result is
 * InvalidXLogRecPtr, and because transactions on the same clog page are
 * grouped to save storage, the LSN of a later transaction in the same group
 * may be returned instead.
 */
XLogRecPtr
TransactionIdGetCommitLSN(TransactionId xid)
{
	XLogRecPtr	commitLSN;

	if (TransactionIdEquals(xid, cachedFetchXid))
	{
		/*
		 * Callers normally ask about an XID that TransactionLogFetch just
		 * reported as committed, so its one-entry cache usually answers and
		 * saves a trip to shared memory.
		 */
		commitLSN = cachedCommitLSN;
	}
	else if (!TransactionIdIsNormal(xid))
	{
		/* Special XIDs are always known committed */
		commitLSN = InvalidXLogRecPtr;
	}
	else
	{
		/* Look the transaction up; only the LSN output is wanted here */
		(void) TransactionIdGetStatus(xid, &commitLSN);
	}

	return commitLSN;
}
/*
 * ExtendCommitTs
 *
 * Guarantee that CommitTs has a page available for a freshly assigned XID.
 *
 * NB: the caller holds XidGenLock, so the common path has to be cheap.  Even
 * the slow path performs no I/O unless a dirty CommitTs or xlog page must be
 * written out to make room in shared memory.
 *
 * NB: the current implementation relies on track_commit_timestamp being
 * PGC_POSTMASTER.
 */
void
ExtendCommitTs(TransactionId newestXact)
{
	/*
	 * The activity flag is read without a lock.  That is acceptable because
	 * this routine is only reached from GetNewTransactionId, which is never
	 * called in a standby.  If the module is disabled there is nothing to do.
	 */
	Assert(!InRecovery);
	if (!commitTsShared->commitTsActive)
		return;

	/*
	 * Only the first XID that maps onto a page has any work to do.  Right
	 * after wraparound, the first XID on page zero is
	 * FirstNormalTransactionId rather than entry zero.
	 */
	if (TransactionIdToCTsEntry(newestXact) == 0 ||
		TransactionIdEquals(newestXact, FirstNormalTransactionId))
	{
		int			newpage = TransactionIdToCTsPage(newestXact);

		LWLockAcquire(CommitTsControlLock, LW_EXCLUSIVE);

		/* Zero the new page, emitting an XLOG entry for it */
		ZeroCommitTsPage(newpage, !InRecovery);

		LWLockRelease(CommitTsControlLock);
	}
}
/*
 * TransactionIdIsInProgress
 *
 * Report whether some backend currently has the given XID as its running
 * transaction.  The backend slots of the shared invalidation segment are
 * scanned under a shared SInvalLock.
 */
bool
TransactionIdIsInProgress(TransactionId xid)
{
	SISeg	   *seg = shmInvalBuffer;
	ProcState  *slots = seg->procState;
	bool		found = false;
	int			slot;

	LWLockAcquire(SInvalLock, LW_SHARED);

	/* Stop as soon as a match is seen or every backend slot is examined */
	for (slot = 0; !found && slot < seg->lastBackend; slot++)
	{
		SHMEM_OFFSET procOffset = slots[slot].procStruct;
		PGPROC	   *backend;
		TransactionId backendXid;

		/* Slot without an attached PGPROC: nothing to compare */
		if (procOffset == INVALID_OFFSET)
			continue;

		backend = (PGPROC *) MAKE_PTR(procOffset);

		/* Fetch xid just once - see GetNewTransactionId */
		backendXid = backend->xid;

		if (TransactionIdEquals(backendXid, xid))
			found = true;
	}

	LWLockRelease(SInvalLock);

	return found;
}
#ifdef PGXC
/*
 * SubtransPageAlreadyExtended
 *
 * PGXC-only test for "nothing to do": true when the given page is not ahead
 * of latest_page_number and the distance between the two is small enough
 * (at most SUBTRANS_WRAP_CHECK_DELTA) that no wraparound has happened.
 *
 * The shared value is read as-is; whether the result is stable depends on
 * whether the caller holds SubtransControlLock.
 */
static bool
SubtransPageAlreadyExtended(int pageno)
{
	return (SubTransCtl->shared->latest_page_number - pageno <= SUBTRANS_WRAP_CHECK_DELTA &&
			pageno <= SubTransCtl->shared->latest_page_number);
}
#endif

/*
 * ExtendSUBTRANS
 *
 * Guarantee that SUBTRANS has a page available for a freshly assigned XID.
 *
 * NB: the caller holds XidGenLock, so the common path has to be cheap.  Even
 * the slow path performs no I/O unless a dirty subtrans page must be written
 * out to make room in shared memory.
 */
void
ExtendSUBTRANS(TransactionId newestXact)
{
	int			targetPage;

#ifdef PGXC						/* PGXC_COORD || PGXC_DATANODE */

	/*
	 * In PGXC a node may not be involved in a transaction and is then
	 * skipped, so the "first XID of a page" test cannot be used.  The
	 * decision is made from latest_page_number instead, which also has to
	 * cope with transaction wraparound.
	 */
	targetPage = TransactionIdToPage(newestXact);

	/*
	 * Unlocked pre-check, done this way to avoid lock contention.  The
	 * shared value may change underneath us, so the same test is repeated
	 * below once the lock is held.
	 */
	if (SubtransPageAlreadyExtended(targetPage))
		return;
#else

	/*
	 * No work except at first XID of a page.  But beware: just after
	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
	 */
	if (TransactionIdToEntry(newestXact) != 0 &&
		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
		return;

	targetPage = TransactionIdToPage(newestXact);
#endif

	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);

#ifdef PGXC

	/*
	 * Re-check under the lock: another process may already have set up the
	 * page and advanced latest_page_number while we were waiting.
	 */
	if (SubtransPageAlreadyExtended(targetPage))
	{
		LWLockRelease(SubtransControlLock);
		return;
	}
#endif

	/* Zero the page */
	ZeroSUBTRANSPage(targetPage);

	LWLockRelease(SubtransControlLock);
}
/*
 * xideq
 *
 * SQL-callable equality test for two transaction IDs: returns a boolean
 * datum that is true when argument 0 and argument 1 are the same XID.
 */
Datum
xideq(PG_FUNCTION_ARGS)
{
	TransactionId lhs = PG_GETARG_TRANSACTIONID(0);
	TransactionId rhs = PG_GETARG_TRANSACTIONID(1);

	PG_RETURN_BOOL(TransactionIdEquals(lhs, rhs));
}
/*
 * TransactionLogFetch
 *
 * Return the commit status of the given transaction ID, consulting (and
 * maintaining) a one-entry backend-local cache in front of the commit log.
 */
static XidStatus
TransactionLogFetch(TransactionId transactionId)
{
	XidStatus	status;
	XLogRecPtr	commitLSN;

	/* The same XID was looked up a moment ago: answer from the cache */
	if (TransactionIdEquals(transactionId, cachedFetchXid))
		return cachedFetchXidStatus;

	/*
	 * Permanent XIDs are never looked up in the commit log: the bootstrap
	 * and frozen XIDs count as committed, every other one as aborted.
	 */
	if (!TransactionIdIsNormal(transactionId))
	{
		if (TransactionIdEquals(transactionId, BootstrapTransactionId) ||
			TransactionIdEquals(transactionId, FrozenTransactionId))
			return TRANSACTION_STATUS_COMMITTED;

		return TRANSACTION_STATUS_ABORTED;
	}

	/* Ask the commit log manager */
	status = TransactionIdGetStatus(transactionId, &commitLSN);

	/*
	 * Remember the answer only when it can no longer change.  In-progress
	 * and sub-committed states are NOT final and must not be cached.
	 */
	switch (status)
	{
		case TRANSACTION_STATUS_IN_PROGRESS:
		case TRANSACTION_STATUS_SUB_COMMITTED:
			break;

		default:
			cachedFetchXid = transactionId;
			cachedFetchXidStatus = status;
			cachedCommitLSN = commitLSN;
			break;
	}

	return status;
}
/*
 * TransactionIdIsKnownCompleted
 *		True iff the transaction is currently known, from local information
 *		alone, to have either committed or aborted.
 *
 * pg_clog is NOT consulted; only the backend-local single-entry cache is
 * probed (hence the name is not TransactionIdDidComplete).  The purpose is
 * to short-circuit TransactionIdIsInProgress calls during repeated tqual.c
 * checks of the same XID, so this has to stay extremely cheap to be of any
 * use.
 *
 * Note:
 *		Assumes transaction identifier is valid.
 */
bool
TransactionIdIsKnownCompleted(TransactionId transactionId)
{
	/* Anything present in the cache is, by construction, completed */
	return TransactionIdEquals(transactionId, cachedFetchXid) ? true : false;
}
/*
 * TransactionIdIsCurrentTransactionId
 *
 * Report whether the given XID is the current transaction's XID.
 *
 * In bootstrap mode the answer is deliberately "no" even though it is our
 * XID.  Combined with transam.c treating the bootstrap XID as already
 * committed, this makes the tqual.c routines see previously inserted tuples
 * as committed, which is what bootstrap needs.
 */
bool
TransactionIdIsCurrentTransactionId(TransactionId xid)
{
	if (AMI_OVERRIDE)
	{
		/* Bootstrap cheat: only the bootstrap XID should show up here */
		Assert(xid == BootstrapTransactionId);
		return false;
	}

	return TransactionIdEquals(xid, CurrentTransactionState->transactionIdData);
}
/*
 * TransactionIdIsKnownCompleted
 *		True iff the transaction is currently known, from local information
 *		alone, to have either committed or aborted.
 *
 * pg_clog is NOT consulted; only the backend-local single-entry cache is
 * probed (hence the name is not TransactionIdDidComplete).  The purpose is
 * to short-circuit TransactionIdIsInProgress calls during repeated tqual.c
 * checks of the same XID, so this has to stay extremely cheap to be of any
 * use.
 *
 * Note:
 *		Assumes transaction identifier is valid.
 */
bool
TransactionIdIsKnownCompleted(TransactionId transactionId)
{
	/* Not the cached XID: nothing is known locally */
	if (!TransactionIdEquals(transactionId, cachedFetchXid))
		return false;

#ifdef PGXC
	/* PGXC builds additionally hand the XID to syncGXID_GTM on a cache hit */
	syncGXID_GTM((GlobalTransactionId) transactionId);
#endif

	/* Anything present in the cache is, by construction, completed */
	return true;
}
/*
 * XactLockTableWait
 *
 * Block until the given transaction has committed or aborted.  When 'oper'
 * is anything other than XLTW_None, an error context callback describing
 * the wait (relation, tuple id, operation) is installed for its duration;
 * with XLTW_None no callback is set up.
 *
 * Subtransactions are handled correctly: waiting on a subtransaction ends
 * as soon as it aborts or its top-level parent commits.  That needs extra
 * work because, to save shared memory, a subtransaction's XID lock is
 * released when it ends, successfully or not.  So after getting the lock we
 * check whether the XID is "still running" and, if so, wait for its parent.
 */
void
XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid,
				  XLTW_Oper oper)
{
	LOCKTAG		tag;
	XactLockTableWaitInfo info;
	ErrorContextCallback callback;
	bool		want_context = (oper != XLTW_None);
	bool		still_running;

	/* Push the verbose error context callback, if one was asked for */
	if (want_context)
	{
		Assert(RelationIsValid(rel));
		Assert(ItemPointerIsValid(ctid));

		info.rel = rel;
		info.ctid = ctid;
		info.oper = oper;

		callback.callback = XactLockTableWaitErrorCb;
		callback.arg = &info;
		callback.previous = error_context_stack;
		error_context_stack = &callback;
	}

	do
	{
		Assert(TransactionIdIsValid(xid));
		Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));

		/* Acquire and immediately drop ShareLock on the XID */
		SET_LOCKTAG_TRANSACTION(tag, xid);

		(void) LockAcquire(&tag, ShareLock, false, false);

		LockRelease(&tag, ShareLock, false);

		/* Still in progress means it was a subxact: climb to its parent */
		still_running = TransactionIdIsInProgress(xid);
		if (still_running)
			xid = SubTransGetParent(xid);
	} while (still_running);

	/* Pop the error context callback again */
	if (want_context)
		error_context_stack = callback.previous;
}
/*
 * XLogIsOwnerOfTuple
 *
 * Determine whether the heap tuple at 'iptr' in relation 'hnode' was
 * inserted by the given transaction/command pair.  Returns
 *
 *	-1	the tuple exists but its xmin/cmin do not match
 *	 0	there is no tuple at all (relation or buffer not available, page
 *		new, offset beyond the page's max offset, or item unused/deleted)
 *	 1	the tuple was inserted by (xid, cid)
 */
int
XLogIsOwnerOfTuple(RelFileNode hnode, ItemPointer iptr,
				   TransactionId xid, CommandId cid)
{
	Relation	rel;
	Buffer		buf;
	Page		pg;
	int			verdict = 0;

	rel = XLogOpenRelation(false, RM_HEAP_ID, hnode);
	if (!RelationIsValid(rel))
		return (0);

	buf = ReadBuffer(rel, ItemPointerGetBlockNumber(iptr));
	if (!BufferIsValid(buf))
		return (0);

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	pg = (Page) BufferGetPage(buf);

	/* From here on every path leaves through the single unlock below */
	if (!PageIsNew((PageHeader) pg) &&
		ItemPointerGetOffsetNumber(iptr) <= PageGetMaxOffsetNumber(pg))
	{
		ItemId		itemid = PageGetItemId(pg, ItemPointerGetOffsetNumber(iptr));

		if (ItemIdIsUsed(itemid) && !ItemIdDeleted(itemid))
		{
			HeapTupleHeader tuphdr = (HeapTupleHeader) PageGetItem(pg, itemid);

			Assert(PageGetSUI(pg) == ThisStartUpID);

			if (TransactionIdEquals(HeapTupleHeaderGetXmin(tuphdr), xid) &&
				HeapTupleHeaderGetCmin(tuphdr) == cid)
				verdict = 1;
			else
				verdict = -1;
		}
	}

	UnlockAndReleaseBuffer(buf);
	return (verdict);
}
/*
 * XactLockTableWait
 *
 * Block until the given transaction has committed or aborted.
 *
 * Subtransactions are handled correctly: waiting on a subtransaction ends
 * as soon as it aborts or its top-level parent commits.  That needs extra
 * work because, to save shared memory, a subtransaction's XID lock is
 * released when it ends, successfully or not.  So after getting the lock we
 * check whether the XID is "still running" and, if so, wait for its parent.
 */
void
XactLockTableWait(TransactionId xid)
{
	LOCKTAG		tag;
	bool		still_running;

	do
	{
		Assert(TransactionIdIsValid(xid));
		Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));

		/* Acquire and immediately drop ShareLock on the XID */
		SET_LOCKTAG_TRANSACTION(tag, xid);

		(void) LockAcquire(&tag, ShareLock, false, false);

		LockRelease(&tag, ShareLock, false);

		/* Still in progress means it was a subxact: climb to its parent */
		still_running = TransactionIdIsInProgress(xid);
		if (still_running)
			xid = SubTransGetParent(xid);
	} while (still_running);
}
/*
 * ConditionalXactLockTableWait
 *
 * Non-blocking flavor of XactLockTableWait: the XID lock is only taken if
 * that is possible without waiting.  Returns TRUE if the lock was acquired
 * (for the XID and for every parent that had to be checked), FALSE as soon
 * as one acquisition would have had to block.
 */
bool
ConditionalXactLockTableWait(TransactionId xid)
{
	LOCKTAG		tag;
	bool		still_running;

	do
	{
		Assert(TransactionIdIsValid(xid));
		Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));

		SET_LOCKTAG_TRANSACTION(tag, xid);

		/* Give up right away instead of queueing behind the holder */
		if (LockAcquire(&tag, ShareLock, false, true) == LOCKACQUIRE_NOT_AVAIL)
			return false;

		LockRelease(&tag, ShareLock, false);

		/* Still in progress means it was a subxact: climb to its parent */
		still_running = TransactionIdIsInProgress(xid);
		if (still_running)
			xid = SubTransGetParent(xid);
	} while (still_running);

	return true;
}
/*
 * Make sure that DistributedLog has room for a newly-allocated XID.
 *
 * NB: this is called while holding XidGenLock.  We want it to be very fast
 * most of the time; even when it's not so fast, no actual I/O need happen
 * unless we're forced to write out a dirty DistributedLog or xlog page
 * to make room in shared memory.
 *
 * newestXact is the XID that was just allocated.  When it is the first XID
 * that maps onto a page, that page is zeroed (with an XLOG entry) under the
 * mirrored lock and an exclusive DistributedLogControlLock.  Progress is
 * logged at LOG level when Debug_print_full_dtm is set, else at DEBUG5.
 */
void
DistributedLog_Extend(TransactionId newestXact)
{
	MIRRORED_LOCK_DECLARE;

	int			page;

	/*
	 * No work except at first XID of a page.  But beware: just after
	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
	 */
	if (TransactionIdToEntry(newestXact) != 0 &&
		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
		return;

	page = TransactionIdToPage(newestXact);

	elog((Debug_print_full_dtm ? LOG : DEBUG5),
		 "DistributedLog_Extend page %d",
		 page);

	MIRRORED_LOCK;

	LWLockAcquire(DistributedLogControlLock, LW_EXCLUSIVE);

	/* Zero the page and make an XLOG entry about it */
	DistributedLog_ZeroPage(page, true);

	LWLockRelease(DistributedLogControlLock);

	MIRRORED_UNLOCK;

	/*
	 * TransactionId is an unsigned 32-bit value, so it must be printed with
	 * %u; with %d, XIDs above 2^31 would show up as negative numbers.
	 */
	elog((Debug_print_full_dtm ? LOG : DEBUG5),
		 "DistributedLog_Extend with newest local xid = %u to page = %d",
		 newestXact, page);
}
/* * PrescanPreparedTransactions * * Scan the pg_twophase directory and determine the range of valid XIDs * present. This is run during database startup, after we have completed * reading WAL. ShmemVariableCache->nextXid has been set to one more than * the highest XID for which evidence exists in WAL. * * We throw away any prepared xacts with main XID beyond nextXid --- if any * are present, it suggests that the DBA has done a PITR recovery to an * earlier point in time without cleaning out pg_twophase. We dare not * try to recover such prepared xacts since they likely depend on database * state that doesn't exist now. * * However, we will advance nextXid beyond any subxact XIDs belonging to * valid prepared xacts. We need to do this since subxact commit doesn't * write a WAL entry, and so there might be no evidence in WAL of those * subxact XIDs. * * Our other responsibility is to determine and return the oldest valid XID * among the prepared xacts (if none, return ShmemVariableCache->nextXid). * This is needed to synchronize pg_subtrans startup properly. */ TransactionId PrescanPreparedTransactions(void) { TransactionId origNextXid = ShmemVariableCache->nextXid; TransactionId result = origNextXid; DIR *cldir; struct dirent *clde; cldir = AllocateDir(TWOPHASE_DIR); while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) { if (strlen(clde->d_name) == 8 && strspn(clde->d_name, "0123456789ABCDEF") == 8) { TransactionId xid; char *buf; TwoPhaseFileHeader *hdr; TransactionId *subxids; int i; xid = (TransactionId) strtoul(clde->d_name, NULL, 16); /* Reject XID if too new */ if (TransactionIdFollowsOrEquals(xid, origNextXid)) { ereport(WARNING, (errmsg("removing future two-phase state file \"%s\"", clde->d_name))); RemoveTwoPhaseFile(xid, true); continue; } /* * Note: we can't check if already processed because clog * subsystem isn't up yet. 
*/ /* Read and validate file */ buf = ReadTwoPhaseFile(xid); if (buf == NULL) { ereport(WARNING, (errmsg("removing corrupt two-phase state file \"%s\"", clde->d_name))); RemoveTwoPhaseFile(xid, true); continue; } /* Deconstruct header */ hdr = (TwoPhaseFileHeader *) buf; if (!TransactionIdEquals(hdr->xid, xid)) { ereport(WARNING, (errmsg("removing corrupt two-phase state file \"%s\"", clde->d_name))); RemoveTwoPhaseFile(xid, true); pfree(buf); continue; } /* * OK, we think this file is valid. Incorporate xid into the * running-minimum result. */ if (TransactionIdPrecedes(xid, result)) result = xid; /* * Examine subtransaction XIDs ... they should all follow main * XID, and they may force us to advance nextXid. */ subxids = (TransactionId *) (buf + MAXALIGN(sizeof(TwoPhaseFileHeader))); for (i = 0; i < hdr->nsubxacts; i++) { TransactionId subxid = subxids[i]; Assert(TransactionIdFollows(subxid, xid)); if (TransactionIdFollowsOrEquals(subxid, ShmemVariableCache->nextXid)) { ShmemVariableCache->nextXid = subxid; TransactionIdAdvance(ShmemVariableCache->nextXid); } } pfree(buf); } } FreeDir(cldir); return result; }
/*
 * StrategyReplaceBuffer
 *
 *	Called by the buffer manager to inform us that he flushed a buffer
 *	and is now about to replace the content. Prior to this call,
 *	the cache algorithm still reports the buffer as in the cache. After
 *	this call we report the new block, even if IO might still need to
 *	be done to bring in the new content.
 *
 *	cdb_found_index and cdb_replace_index must be the auxiliary values
 *	returned by previous calls to StrategyBufferLookup and StrategyGetBuffer.
 *
 *	buf					buffer descriptor being reused; its buf_id and tag are
 *						read, nothing in it is modified here
 *	newTag				tag of the block that will occupy the buffer
 *	cdb_found_index		>= 0 if the new block already has a CDB (ghost hit);
 *						negative for a complete cache miss
 *	cdb_replace_index	>= 0 if the buffer was evicted from a T list and so
 *						still has a CDB; negative if the buffer was unused
 *
 *	The function updates the CDB lists, the unused-CDB list and the buffer
 *	hash table (BufTableInsert/BufTableDelete); it returns nothing.
 */
void
StrategyReplaceBuffer(BufferDesc *buf, BufferTag *newTag,
					  int cdb_found_index, int cdb_replace_index)
{
	BufferStrategyCDB *cdb_found;
	BufferStrategyCDB *cdb_replace;

	if (cdb_found_index >= 0)
	{
		/* This must have been a ghost buffer cache hit (B1 list) */
		cdb_found = &StrategyCDB[cdb_found_index];

		/* Assert that the buffer remembered in cdb_found is the one */
		/* the buffer manager is currently faulting in */
		Assert(BUFFERTAGS_EQUAL(cdb_found->buf_tag, *newTag));

		if (cdb_replace_index >= 0)
		{
			/* We are satisfying it with an evicted T buffer */
			cdb_replace = &StrategyCDB[cdb_replace_index];

			/* Assert that the buffer remembered in cdb_replace is */
			/* the one the buffer manager has just evicted */
			Assert(cdb_replace->list == STRAT_LIST_T1 ||
				   cdb_replace->list == STRAT_LIST_T2);
			Assert(cdb_replace->buf_id == buf->buf_id);
			Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));

			/*
			 * Under normal circumstances we move evicted T1 list entries
			 * to the B1 list.  However, T1 entries that exist only because
			 * of VACUUM are just thrown into the unused list instead,
			 * since it's unlikely they'll be touched again soon.  Similarly,
			 * evicted T2 entries are thrown away; the LRU T2 entry cannot
			 * have been touched recently.
			 */
			if (cdb_replace->t1_vacuum ||
				cdb_replace->list == STRAT_LIST_T2)
			{
				/* Discard: drop the hash entry and push onto the unused list */
				BufTableDelete(&(cdb_replace->buf_tag));
				STRAT_LIST_REMOVE(cdb_replace);
				cdb_replace->next = StrategyControl->listUnusedCDB;
				StrategyControl->listUnusedCDB = cdb_replace_index;
			}
			else
			{
				/* Keep as a ghost entry at the MRU end of B1 */
				STRAT_LIST_REMOVE(cdb_replace);
				STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
			}
			/* And clear its block reference */
			cdb_replace->buf_id = -1;
		}
		else
		{
			/* We are satisfying it with an unused buffer */
		}

		/* Now the found B1 CDB gets the buffer and is moved to T2 */
		cdb_found->buf_id = buf->buf_id;
		STRAT_LIST_REMOVE(cdb_found);
		STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T2);
	}
	else
	{
		/*
		 * This was a complete cache miss, so we need to create a new CDB.
		 * We use a free one if available, else reclaim the tail end of B1.
		 */
		if (StrategyControl->listUnusedCDB >= 0)
		{
			/* Pop the head of the unused-CDB list */
			cdb_found = &StrategyCDB[StrategyControl->listUnusedCDB];
			StrategyControl->listUnusedCDB = cdb_found->next;
		}
		else
		{
			/* Can't fail because we have more CDBs than buffers... */
			if (B1_LENGTH == 0)
				elog(PANIC, "StrategyReplaceBuffer: out of CDBs");
			/* Recycle the entry at the head of the B1 list */
			cdb_found = &StrategyCDB[StrategyControl->listHead[STRAT_LIST_B1]];

			BufTableDelete(&(cdb_found->buf_tag));
			STRAT_LIST_REMOVE(cdb_found);
		}

		/* Set the CDB's buf_tag and insert it into the hash table */
		cdb_found->buf_tag = *newTag;
		/* The hash table value is the CDB's index within StrategyCDB */
		BufTableInsert(&(cdb_found->buf_tag), (cdb_found - StrategyCDB));

		if (cdb_replace_index >= 0)
		{
			/*
			 * The buffer was formerly in a T list, move its CDB to the
			 * appropriate list: B1 if T1, else discard it, as above
			 *
			 * NOTE(review): unlike the ghost-hit path above, this test does
			 * not look at cdb_replace->t1_vacuum, so a T1 entry that was
			 * loaded for VACUUM goes to B1 here rather than being
			 * discarded.  Confirm whether that asymmetry is intended.
			 */
			cdb_replace = &StrategyCDB[cdb_replace_index];

			Assert(cdb_replace->list == STRAT_LIST_T1 ||
				   cdb_replace->list == STRAT_LIST_T2);
			Assert(cdb_replace->buf_id == buf->buf_id);
			Assert(BUFFERTAGS_EQUAL(cdb_replace->buf_tag, buf->tag));

			if (cdb_replace->list == STRAT_LIST_T1)
			{
				STRAT_LIST_REMOVE(cdb_replace);
				STRAT_MRU_INSERT(cdb_replace, STRAT_LIST_B1);
			}
			else
			{
				BufTableDelete(&(cdb_replace->buf_tag));
				STRAT_LIST_REMOVE(cdb_replace);
				cdb_replace->next = StrategyControl->listUnusedCDB;
				StrategyControl->listUnusedCDB = cdb_replace_index;
			}
			/* And clear its block reference */
			cdb_replace->buf_id = -1;
		}
		else
		{
			/* We are satisfying it with an unused buffer */
		}

		/* Assign the buffer id to the new CDB */
		cdb_found->buf_id = buf->buf_id;

		/*
		 * Specialized VACUUM optimization. If this complete cache miss
		 * happened because vacuum needed the page, we place it at the LRU
		 * position of T1; normally it goes at the MRU position.
		 */
		if (strategy_hint_vacuum)
		{
			/* The hint only counts for the transaction that set it */
			if (TransactionIdEquals(strategy_vacuum_xid,
									GetTopTransactionId()))
				STRAT_LRU_INSERT(cdb_found, STRAT_LIST_T1);
			else
			{
				/* VACUUM must have been aborted by error, reset flag */
				strategy_hint_vacuum = false;
				STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);
			}
		}
		else
			STRAT_MRU_INSERT(cdb_found, STRAT_LIST_T1);

		/*
		 * Remember the Xid when this buffer went onto T1 to avoid a
		 * single UPDATE promoting a newcomer straight into T2. Also
		 * remember if it was loaded for VACUUM.
		 */
		cdb_found->t1_xid = GetTopTransactionId();
		cdb_found->t1_vacuum = strategy_hint_vacuum;
	}
}
/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * We zero out all unused entries.
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer. Once pin is released, a tuple might be pruned
 * and reused by a completely unrelated tuple.
 *
 * A chain link whose offset lies outside the page's line pointer array is
 * treated as a broken chain and ends the walk, so that neither the line
 * pointer array nor root_offsets[] is ever indexed out of range.
 */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
	OffsetNumber offnum,
				maxoff;

	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));

	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
	{
		ItemId		lp = PageGetItemId(page, offnum);
		HeapTupleHeader htup;
		OffsetNumber nextoffnum;
		TransactionId priorXmax;

		/* skip unused and dead items */
		if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
			continue;

		if (ItemIdIsNormal(lp))
		{
			htup = (HeapTupleHeader) PageGetItem(page, lp);

			/*
			 * Check if this tuple is part of a HOT-chain rooted at some other
			 * tuple. If so, skip it for now; we'll process it when we find
			 * its root.
			 */
			if (HeapTupleHeaderIsHeapOnly(htup))
				continue;

			/*
			 * This is either a plain tuple or the root of a HOT-chain.
			 * Remember it in the mapping.
			 */
			root_offsets[offnum - 1] = offnum;

			/* If it's not the start of a HOT-chain, we're done with it */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				continue;

			/* Set up to scan the HOT-chain */
			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetXmax(htup);
		}
		else
		{
			/* Must be a redirect item. We do not set its root_offsets entry */
			Assert(ItemIdIsRedirected(lp));
			/* Set up to scan the HOT-chain */
			nextoffnum = ItemIdGetRedirect(lp);
			priorXmax = InvalidTransactionId;
		}

		/*
		 * Now follow the HOT-chain and collect other tuples in the chain.
		 *
		 * Note: Even though this is a nested loop, the complexity of the
		 * function is O(N) because a tuple in the page should be visited not
		 * more than twice, once in the outer loop and once in HOT-chain
		 * chases.
		 */
		for (;;)
		{
			/*
			 * Sanity check: the next offset comes from tuple or redirect
			 * data.  If it does not address an existing line pointer, the
			 * chain is broken; stop before indexing anything with it.
			 */
			if (nextoffnum < FirstOffsetNumber || nextoffnum > maxoff)
				break;

			lp = PageGetItemId(page, nextoffnum);

			/* Check for broken chains */
			if (!ItemIdIsNormal(lp))
				break;

			htup = (HeapTupleHeader) PageGetItem(page, lp);

			/* The member's xmin must match the previous member's xmax */
			if (TransactionIdIsValid(priorXmax) &&
				!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
				break;

			/* Remember the root line pointer for this item */
			root_offsets[nextoffnum - 1] = offnum;

			/* Advance to next chain member, if any */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				break;

			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetXmax(htup);
		}
	}
}
/*
 * XidInMVCCSnapshot
 *		Does the snapshot consider the given XID to be still in progress?
 *
 * Note: GetSnapshotData never stores our own backend's top XID or subxids
 * in a snapshot, so this function will not report them as "running".  That
 * is fine for current uses, since it is only applied to known-committed
 * XIDs.
 */
static bool
XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
{
	/*
	 * Cheap range test first; it eliminates most XIDs without touching the
	 * arrays.  It stays valid even if a subxact XID is mapped to its parent
	 * below: a subxact with XID < xmin surely has a parent with XID < xmin
	 * too, and one with XID >= xmax must belong to a parent that had not yet
	 * committed when the snapshot was taken.
	 */

	/* Any xid < xmin is not in-progress */
	if (TransactionIdPrecedes(xid, snapshot->xmin))
		return false;
	/* Any xid >= xmax is in-progress */
	if (TransactionIdFollowsOrEquals(xid, snapshot->xmax))
		return true;

	/*
	 * Step 1: if the subxact data overflowed, the subxip array cannot be
	 * relied on to contain every subxact, so map the XID to its top-level
	 * transaction via pg_subtrans.  This applies to normal and recovery
	 * snapshots alike.
	 */
	if (snapshot->suboverflowed)
	{
		xid = SubTransGetTopmostTransaction(xid);

		/*
		 * If xid really was a subxact, the parent may now be < xmin, so
		 * recheck and skip the array scans.  Rechecking xmax is pointless.
		 */
		if (TransactionIdPrecedes(xid, snapshot->xmin))
			return false;
	}

	/*
	 * Step 2: scan subxip.  A normal snapshot does this only when it has
	 * full subxact data.  A snapshot taken during recovery always does,
	 * because there all XIDs are kept in the subxact array (it is by far
	 * the bigger one and top-level and subxact XIDs are mostly not told
	 * apart), leaving xip empty.
	 */
	if (snapshot->takenDuringRecovery || !snapshot->suboverflowed)
	{
		int32		sub;

		for (sub = 0; sub < snapshot->subxcnt; sub++)
		{
			if (TransactionIdEquals(xid, snapshot->subxip[sub]))
				return true;
		}
	}

	/*
	 * Step 3: scan the top-level xip array; only snapshots not taken during
	 * recovery keep anything in it.
	 */
	if (!snapshot->takenDuringRecovery)
	{
		uint32		top;

		for (top = 0; top < snapshot->xcnt; top++)
		{
			if (TransactionIdEquals(xid, snapshot->xip[top]))
				return true;
		}
	}

	return false;
}
Datum _Slony_I_createEvent(PG_FUNCTION_ARGS) { TransactionId newXid = GetTopTransactionId(); Slony_I_ClusterStatus *cs; char *ev_type_c; Datum argv[9]; char nulls[10]; char *buf; size_t buf_size; int rc; int i; int64 retval; bool isnull; #ifdef HAVE_GETACTIVESNAPSHOT if (GetActiveSnapshot() == NULL) elog(ERROR, "Slony-I: ActiveSnapshot is NULL in createEvent()"); #else if (SerializableSnapshot == NULL) elog(ERROR, "Slony-I: SerializableSnapshot is NULL in createEvent()"); #endif if ((rc = SPI_connect()) < 0) elog(ERROR, "Slony-I: SPI_connect() failed in createEvent()"); /* * Get or create the cluster status information and make sure it has the * SPI plans that we need here. */ cs = getClusterStatus(PG_GETARG_NAME(0), PLAN_INSERT_EVENT); buf_size = 8192; buf = palloc(buf_size); /* * Do the following only once per transaction. */ if (!TransactionIdEquals(cs->currentXid, newXid)) { cs->currentXid = newXid; } /* * Call the saved INSERT plan */ for (i = 1; i < 10; i++) { if (i >= PG_NARGS() || PG_ARGISNULL(i)) { argv[i - 1] = (Datum) 0; nulls[i - 1] = 'n'; } else { argv[i - 1] = PG_GETARG_DATUM(i); nulls[i - 1] = ' '; } } nulls[9] = '\0'; if ((rc = SPI_execp(cs->plan_insert_event, argv, nulls, 0)) < 0) elog(ERROR, "Slony-I: SPI_execp() failed for \"INSERT INTO sl_event ...\""); /* * The INSERT plan also contains a SELECT currval('sl_event_seq'), use the * new sequence number as return value. */ if (SPI_processed != 1) elog(ERROR, "Slony-I: INSERT plan did not return 1 result row"); retval = DatumGetInt64(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull)); /* * For SYNC and ENABLE_SUBSCRIPTION events, we also remember all current * sequence values. 
*/ if (PG_NARGS() > 1 && !PG_ARGISNULL(1)) { ev_type_c = DatumGetPointer(DirectFunctionCall1( textout, PG_GETARG_DATUM(1))); if (strcmp(ev_type_c, "SYNC") == 0 || strcmp(ev_type_c, "ENABLE_SUBSCRIPTION") == 0) { /*@-nullpass@*/ if ((rc = SPI_execp(cs->plan_record_sequences, NULL, NULL, 0)) < 0) elog(ERROR, "Slony-I: SPI_execp() failed for \"INSERT INTO sl_seqlog ...\""); /*@+nullpass@*/ } } (void) SPI_finish(); /*@-mustfreefresh@*/ PG_RETURN_INT64(retval); }
Datum _Slony_I_logTrigger(PG_FUNCTION_ARGS) { TransactionId newXid = GetTopTransactionId(); Slony_I_ClusterStatus *cs; TriggerData *tg; Datum argv[4]; text *cmdtype = NULL; int rc; Name cluster_name; int32 tab_id; char *attkind; int attkind_idx; int cmddata_need; /* * Don't do any logging if the current session role isn't Origin. */ if (SessionReplicationRole != SESSION_REPLICATION_ROLE_ORIGIN) return PointerGetDatum(NULL); /* * Get the trigger call context */ if (!CALLED_AS_TRIGGER(fcinfo)) elog(ERROR, "Slony-I: logTrigger() not called as trigger"); tg = (TriggerData *) (fcinfo->context); /* * Check all logTrigger() calling conventions */ if (!TRIGGER_FIRED_AFTER(tg->tg_event)) elog(ERROR, "Slony-I: logTrigger() must be fired AFTER"); if (!TRIGGER_FIRED_FOR_ROW(tg->tg_event)) elog(ERROR, "Slony-I: logTrigger() must be fired FOR EACH ROW"); if (tg->tg_trigger->tgnargs != 3) elog(ERROR, "Slony-I: logTrigger() must be defined with 3 args"); /* * Connect to the SPI manager */ if ((rc = SPI_connect()) < 0) elog(ERROR, "Slony-I: SPI_connect() failed in createEvent()"); /* * Get all the trigger arguments */ cluster_name = DatumGetName(DirectFunctionCall1(namein, CStringGetDatum(tg->tg_trigger->tgargs[0]))); tab_id = strtol(tg->tg_trigger->tgargs[1], NULL, 10); attkind = tg->tg_trigger->tgargs[2]; /* * Get or create the cluster status information and make sure it has the * SPI plans that we need here. */ cs = getClusterStatus(cluster_name, PLAN_INSERT_LOG); /* * Do the following only once per transaction. 
*/ if (!TransactionIdEquals(cs->currentXid, newXid)) { int32 log_status; bool isnull; /* * Determine the currently active log table */ if (SPI_execp(cs->plan_get_logstatus, NULL, NULL, 0) < 0) elog(ERROR, "Slony-I: cannot determine log status"); if (SPI_processed != 1) elog(ERROR, "Slony-I: cannot determine log status"); log_status = DatumGetInt32(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull)); SPI_freetuptable(SPI_tuptable); switch (log_status) { case 0: case 2: cs->plan_active_log = cs->plan_insert_log_1; break; case 1: case 3: cs->plan_active_log = cs->plan_insert_log_2; break; default: elog(ERROR, "Slony-I: illegal log status %d", log_status); break; } cs->currentXid = newXid; } /* * Determine cmdtype and cmddata depending on the command type */ if (TRIGGER_FIRED_BY_INSERT(tg->tg_event)) { HeapTuple new_row = tg->tg_trigtuple; TupleDesc tupdesc = tg->tg_relation->rd_att; char *col_ident; char *col_value; int len_ident; int len_value; int i; int need_comma = false; char *OldDateStyle; char *cp = VARDATA(cs->cmddata_buf); /* * INSERT * * cmdtype = 'I' cmddata = ("col" [, ...]) values ('value' [, ...]) */ cmdtype = cs->cmdtype_I; /* * Specify all the columns */ *cp++ = '('; for (i = 0; i < tg->tg_relation->rd_att->natts; i++) { /* * Skip dropped columns */ if (tupdesc->attrs[i]->attisdropped) continue; col_ident = (char *) slon_quote_identifier(SPI_fname(tupdesc, i + 1)); cmddata_need = (cp - (char *) (cs->cmddata_buf)) + 16 + (len_ident = strlen(col_ident)); if (cs->cmddata_size < cmddata_need) { int have = (cp - (char *) (cs->cmddata_buf)); while (cs->cmddata_size < cmddata_need) cs->cmddata_size *= 2; cs->cmddata_buf = realloc(cs->cmddata_buf, cs->cmddata_size); cp = (char *) (cs->cmddata_buf) + have; } if (need_comma) *cp++ = ','; else need_comma = true; memcpy(cp, col_ident, len_ident); cp += len_ident; } /* * Append the string ") values (" */ *cp++ = ')'; *cp++ = ' '; *cp++ = 'v'; *cp++ = 'a'; *cp++ = 'l'; *cp++ = 'u'; *cp++ = 'e'; 
*cp++ = 's'; *cp++ = ' '; *cp++ = '('; /* * Append the values */ need_comma = false; OldDateStyle = GetConfigOptionByName("DateStyle", NULL); if (!strstr(OldDateStyle, "ISO")) set_config_option("DateStyle", "ISO", PGC_USERSET, PGC_S_SESSION, true, true); for (i = 0; i < tg->tg_relation->rd_att->natts; i++) { /* * Skip dropped columns */ if (tupdesc->attrs[i]->attisdropped) continue; if ((col_value = SPI_getvalue(new_row, tupdesc, i + 1)) == NULL) { col_value = "NULL"; } else { col_value = slon_quote_literal(col_value); } cmddata_need = (cp - (char *) (cs->cmddata_buf)) + 16 + (len_value = strlen(col_value)); if (cs->cmddata_size < cmddata_need) { int have = (cp - (char *) (cs->cmddata_buf)); while (cs->cmddata_size < cmddata_need) cs->cmddata_size *= 2; cs->cmddata_buf = realloc(cs->cmddata_buf, cs->cmddata_size); cp = (char *) (cs->cmddata_buf) + have; } if (need_comma) *cp++ = ','; else need_comma = true; memcpy(cp, col_value, len_value); cp += len_value; } if (!strstr(OldDateStyle, "ISO")) set_config_option("DateStyle", OldDateStyle, PGC_USERSET, PGC_S_SESSION, true, true); /* * Terminate and done */ *cp++ = ')'; *cp = '\0'; SET_VARSIZE(cs->cmddata_buf, VARHDRSZ + (cp - VARDATA(cs->cmddata_buf))); } else if (TRIGGER_FIRED_BY_UPDATE(tg->tg_event)) { HeapTuple old_row = tg->tg_trigtuple; HeapTuple new_row = tg->tg_newtuple; TupleDesc tupdesc = tg->tg_relation->rd_att; Datum old_value; Datum new_value; bool old_isnull; bool new_isnull; char *col_ident; char *col_value; int len_ident; int len_value; int i; int need_comma = false; int need_and = false; char *OldDateStyle; char *cp = VARDATA(cs->cmddata_buf); /* * UPDATE * * cmdtype = 'U' cmddata = "col_ident"='value' [, ...] where * "pk_ident" = 'value' [ and ...] 
*/ cmdtype = cs->cmdtype_U; for (i = 0; i < tg->tg_relation->rd_att->natts; i++) { /* * Ignore dropped columns */ if (tupdesc->attrs[i]->attisdropped) continue; old_value = SPI_getbinval(old_row, tupdesc, i + 1, &old_isnull); new_value = SPI_getbinval(new_row, tupdesc, i + 1, &new_isnull); /* * If old and new value are NULL, the column is unchanged */ if (old_isnull && new_isnull) continue; /* * If both are NOT NULL, we need to compare the values and skip * setting the column if equal */ if (!old_isnull && !new_isnull) { Oid opr_oid; FmgrInfo *opr_finfo_p; /* * Lookup the equal operators function call info using the * typecache if available */ #ifdef HAVE_TYPCACHE TypeCacheEntry *type_cache; type_cache = lookup_type_cache( SPI_gettypeid(tupdesc, i + 1), TYPECACHE_EQ_OPR | TYPECACHE_EQ_OPR_FINFO); opr_oid = type_cache->eq_opr; if (opr_oid == ARRAY_EQ_OP) opr_oid = InvalidOid; else opr_finfo_p = &(type_cache->eq_opr_finfo); #else FmgrInfo opr_finfo; opr_oid = compatible_oper_funcid(makeList1(makeString("=")), SPI_gettypeid(tupdesc, i + 1), SPI_gettypeid(tupdesc, i + 1), true); if (OidIsValid(opr_oid)) { fmgr_info(opr_oid, &opr_finfo); opr_finfo_p = &opr_finfo; } #endif /* * If we have an equal operator, use that to do binary * comparision. Else get the string representation of both * attributes and do string comparision. 
*/ if (OidIsValid(opr_oid)) { if (DatumGetBool(FunctionCall2(opr_finfo_p, old_value, new_value))) continue; } else { char *old_strval = SPI_getvalue(old_row, tupdesc, i + 1); char *new_strval = SPI_getvalue(new_row, tupdesc, i + 1); if (strcmp(old_strval, new_strval) == 0) continue; } } if (need_comma) *cp++ = ','; else need_comma = true; col_ident = (char *) slon_quote_identifier(SPI_fname(tupdesc, i + 1)); if (new_isnull) col_value = "NULL"; else { OldDateStyle = GetConfigOptionByName("DateStyle", NULL); if (!strstr(OldDateStyle, "ISO")) set_config_option("DateStyle", "ISO", PGC_USERSET, PGC_S_SESSION, true, true); col_value = slon_quote_literal(SPI_getvalue(new_row, tupdesc, i + 1)); if (!strstr(OldDateStyle, "ISO")) set_config_option("DateStyle", OldDateStyle, PGC_USERSET, PGC_S_SESSION, true, true); } cmddata_need = (cp - (char *) (cs->cmddata_buf)) + 16 + (len_ident = strlen(col_ident)) + (len_value = strlen(col_value)); if (cs->cmddata_size < cmddata_need) { int have = (cp - (char *) (cs->cmddata_buf)); while (cs->cmddata_size < cmddata_need) cs->cmddata_size *= 2; cs->cmddata_buf = realloc(cs->cmddata_buf, cs->cmddata_size); cp = (char *) (cs->cmddata_buf) + have; } memcpy(cp, col_ident, len_ident); cp += len_ident; *cp++ = '='; memcpy(cp, col_value, len_value); cp += len_value; } /* * It can happen that the only UPDATE an application does is to set a * column to the same value again. In that case, we'd end up here with * no columns in the SET clause yet. We add the first key column here * with it's old value to simulate the same for the replication * engine. 
*/ if (!need_comma) { for (i = 0, attkind_idx = -1; i < tg->tg_relation->rd_att->natts; i++) { if (tupdesc->attrs[i]->attisdropped) continue; attkind_idx++; if (!attkind[attkind_idx]) elog(ERROR, "Slony-I: no key columns found in logTrigger() attkind parameter"); if (attkind[attkind_idx] == 'k') break; } col_ident = (char *) slon_quote_identifier(SPI_fname(tupdesc, i + 1)); col_value = slon_quote_literal(SPI_getvalue(old_row, tupdesc, i + 1)); cmddata_need = (cp - (char *) (cs->cmddata_buf)) + 16 + (len_ident = strlen(col_ident)) + (len_value = strlen(col_value)); if (cs->cmddata_size < cmddata_need) { int have = (cp - (char *) (cs->cmddata_buf)); while (cs->cmddata_size < cmddata_need) cs->cmddata_size *= 2; cs->cmddata_buf = realloc(cs->cmddata_buf, cs->cmddata_size); cp = (char *) (cs->cmddata_buf) + have; } memcpy(cp, col_ident, len_ident); cp += len_ident; *cp++ = '='; memcpy(cp, col_value, len_value); cp += len_value; } *cp++ = ' '; *cp++ = 'w'; *cp++ = 'h'; *cp++ = 'e'; *cp++ = 'r'; *cp++ = 'e'; *cp++ = ' '; for (i = 0, attkind_idx = -1; i < tg->tg_relation->rd_att->natts; i++) { /* * Ignore dropped columns */ if (tupdesc->attrs[i]->attisdropped) continue; attkind_idx++; if (!attkind[attkind_idx]) break; if (attkind[attkind_idx] != 'k') continue; col_ident = (char *) slon_quote_identifier(SPI_fname(tupdesc, i + 1)); col_value = slon_quote_literal(SPI_getvalue(old_row, tupdesc, i + 1)); if (col_value == NULL) elog(ERROR, "Slony-I: old key column %s.%s IS NULL on UPDATE", NameStr(tg->tg_relation->rd_rel->relname), col_ident); cmddata_need = (cp - (char *) (cs->cmddata_buf)) + 16 + (len_ident = strlen(col_ident)) + (len_value = strlen(col_value)); if (cs->cmddata_size < cmddata_need) { int have = (cp - (char *) (cs->cmddata_buf)); while (cs->cmddata_size < cmddata_need) cs->cmddata_size *= 2; cs->cmddata_buf = realloc(cs->cmddata_buf, cs->cmddata_size); cp = (char *) (cs->cmddata_buf) + have; } if (need_and) { *cp++ = ' '; *cp++ = 'a'; *cp++ = 'n'; *cp++ = 'd'; 
*cp++ = ' '; } else need_and = true; memcpy(cp, col_ident, len_ident); cp += len_ident; *cp++ = '='; memcpy(cp, col_value, len_value); cp += len_value; } *cp = '\0'; SET_VARSIZE(cs->cmddata_buf, VARHDRSZ + (cp - VARDATA(cs->cmddata_buf))); } else if (TRIGGER_FIRED_BY_DELETE(tg->tg_event)) { HeapTuple old_row = tg->tg_trigtuple; TupleDesc tupdesc = tg->tg_relation->rd_att; char *col_ident; char *col_value; int len_ident; int len_value; int i; int need_and = false; char *cp = VARDATA(cs->cmddata_buf); /* * DELETE * * cmdtype = 'D' cmddata = "pk_ident"='value' [and ...] */ cmdtype = cs->cmdtype_D; for (i = 0, attkind_idx = -1; i < tg->tg_relation->rd_att->natts; i++) { if (tupdesc->attrs[i]->attisdropped) continue; attkind_idx++; if (!attkind[attkind_idx]) break; if (attkind[attkind_idx] != 'k') continue; col_ident = (char *) slon_quote_identifier(SPI_fname(tupdesc, i + 1)); col_value = slon_quote_literal(SPI_getvalue(old_row, tupdesc, i + 1)); if (col_value == NULL) elog(ERROR, "Slony-I: old key column %s.%s IS NULL on DELETE", NameStr(tg->tg_relation->rd_rel->relname), col_ident); cmddata_need = (cp - (char *) (cs->cmddata_buf)) + 16 + (len_ident = strlen(col_ident)) + (len_value = strlen(col_value)); if (cs->cmddata_size < cmddata_need) { int have = (cp - (char *) (cs->cmddata_buf)); while (cs->cmddata_size < cmddata_need) cs->cmddata_size *= 2; cs->cmddata_buf = realloc(cs->cmddata_buf, cs->cmddata_size); cp = (char *) (cs->cmddata_buf) + have; } if (need_and) { *cp++ = ' '; *cp++ = 'a'; *cp++ = 'n'; *cp++ = 'd'; *cp++ = ' '; } else need_and = true; memcpy(cp, col_ident, len_ident); cp += len_ident; *cp++ = '='; memcpy(cp, col_value, len_value); cp += len_value; } *cp = '\0'; SET_VARSIZE(cs->cmddata_buf, VARHDRSZ + (cp - VARDATA(cs->cmddata_buf))); } else elog(ERROR, "Slony-I: logTrigger() fired for unhandled event"); /* * Construct the parameter array and insert the log row. 
*/ argv[0] = Int32GetDatum(tab_id); argv[1] = PointerGetDatum(cmdtype); argv[2] = PointerGetDatum(cs->cmddata_buf); SPI_execp(cs->plan_active_log, argv, NULL, 0); SPI_finish(); return PointerGetDatum(NULL); }
/* ----------------
 *		index_getnext - fetch the next heap tuple of an index scan
 *
 * Returns the next heap tuple that matches the scan keys and is visible
 * under the scan's snapshot, or NULL once the index has no more entries.
 * When a tuple is returned, the buffer holding it stays pinned; that pin
 * is released by the following index_getnext or index_endscan call.
 *
 * Rechecking of the scan keys is the caller's job: it has to look at
 * scan->xs_recheck itself, because this routine lacks the information to
 * do the recheck efficiently in the general case.
 * ----------------
 */
HeapTuple
index_getnext(IndexScanDesc scan, ScanDirection direction)
{
	HeapTuple	tup = &scan->xs_ctup;
	ItemPointer tup_tid = &tup->t_self;
	FmgrInfo   *procedure;		/* presumably assigned by GET_SCAN_PROCEDURE */

	SCAN_CHECKS;
	GET_SCAN_PROCEDURE(amgettuple);

	Assert(TransactionIdIsValid(RecentGlobalXmin));

	/*
	 * Either the scan is just starting or the previous call handed back a
	 * visible tuple; in both situations the prior index entry must not be
	 * killed, so the flag is cleared unconditionally.
	 */
	scan->xs_hot_dead = false;

	for (;;)
	{
		OffsetNumber cur_off;
		bool		chain_head;
		Page		page;

		if (scan->xs_next_hot == InvalidOffsetNumber)
		{
			bool		got_entry;
			Buffer		old_buf;

			/*
			 * A fully scanned HOT chain that held only dead tuples lets us
			 * ask the index AM to kill its entry for that TID.  This is
			 * skipped in recovery since it could violate MVCC; see the
			 * comments in RelationGetIndexScan().
			 */
			if (!scan->xactStartedInRecovery)
				scan->kill_prior_tuple = scan->xs_hot_dead;

			/*
			 * Ask the AM for the next matching index entry.  It stores the
			 * TID in xs_ctup.t_self (that is, *tup_tid) and should also set
			 * scan->xs_recheck, which is not examined here.
			 */
			got_entry = DatumGetBool(FunctionCall2(procedure,
												   PointerGetDatum(scan),
												   Int32GetDatum(direction)));

			/* Clear the kill request right away, for safety */
			scan->kill_prior_tuple = false;

			/* Index exhausted: leave the outer loop */
			if (!got_entry)
				break;

			pgstat_count_index_tuples(scan->indexRelation, 1);

			/* Move to the heap page the TID points at, if not there yet */
			old_buf = scan->xs_cbuf;
			scan->xs_cbuf = ReleaseAndReadBuffer(old_buf,
												 scan->heapRelation,
										 ItemPointerGetBlockNumber(tup_tid));

			/* Prune only when we actually arrived on a different page */
			if (old_buf != scan->xs_cbuf)
				heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
									RecentGlobalXmin);

			/* The chain walk begins at the offset named by the index */
			cur_off = ItemPointerGetOffsetNumber(tup_tid);
			chain_head = true;

			/* Nothing is known yet about the first tuple's xmin */
			scan->xs_prev_xmax = InvalidTransactionId;

			/* Assume all members dead until one proves otherwise */
			scan->xs_hot_dead = true;
		}
		else
		{
			/*
			 * Continue a HOT chain from which an earlier member was already
			 * returned.  The pin on the current heap page must still be
			 * held.
			 */
			Assert(BufferIsValid(scan->xs_cbuf));
			Assert(ItemPointerGetBlockNumber(tup_tid) ==
				   BufferGetBlockNumber(scan->xs_cbuf));
			Assert(TransactionIdIsValid(scan->xs_prev_xmax));

			cur_off = scan->xs_next_hot;
			chain_head = false;
			scan->xs_next_hot = InvalidOffsetNumber;
		}

		/* Visibility checks need a share lock on the buffer */
		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);

		page = (Page) BufferGetPage(scan->xs_cbuf);

		/* Walk the (possibly multi-member) HOT chain */
		for (;;)
		{
			ItemId		item;
			ItemPointer next_tid;
			bool		visible;

			/* Offset outside the page: bogus TID */
			if (cur_off < FirstOffsetNumber ||
				cur_off > PageGetMaxOffsetNumber(page))
				break;

			item = PageGetItemId(page, cur_off);

			/* Unused, dead, or redirected line pointer */
			if (!ItemIdIsNormal(item))
			{
				/* Only a redirect at the chain head may be followed */
				if (!(ItemIdIsRedirected(item) && chain_head))
					break;		/* anything else ends the chain */

				cur_off = ItemIdGetRedirect(item);
				chain_head = false;
				continue;
			}

			/*
			 * Fill in every field of *tup (scan->xs_ctup), because on
			 * success it is what the executor receives.
			 */
			tup->t_data = (HeapTupleHeader) PageGetItem(page, item);
			tup->t_len = ItemIdGetLength(item);
			ItemPointerSetOffsetNumber(tup_tid, cur_off);
			tup->t_tableOid = RelationGetRelid(scan->heapRelation);
			next_tid = &tup->t_data->t_ctid;

			/*
			 * A HEAP_ONLY tuple must not sit at the chain head.  (The test
			 * ought to be redundant, as the chain root cannot go away while
			 * the index entry is pinned, but it is made regardless.)
			 */
			if (chain_head && HeapTupleIsHeapOnly(tup))
				break;

			/*
			 * A mismatch between this xmin and the previous member's xmax
			 * means the chain is broken.  This test is mandatory: it covers
			 * the case where the prior member's xmax aborted after we
			 * looked at it.
			 */
			if (TransactionIdIsValid(scan->xs_prev_xmax) &&
				!TransactionIdEquals(scan->xs_prev_xmax,
								  HeapTupleHeaderGetXmin(tup->t_data)))
				break;

			/* A tuple visible under the snapshot has to be returned */
			visible = HeapTupleSatisfiesVisibility(tup, scan->xs_snapshot,
												   scan->xs_cbuf);

			CheckForSerializableConflictOut(visible, scan->heapRelation,
											tup, scan->xs_cbuf);

			if (visible)
			{
				/*
				 * An MVCC snapshot accepts at most one chain member, so no
				 * further members need examining in that case.  For any
				 * other situation, remember where the chain continues so
				 * the next call can pick it up.
				 */
				if ((!IsMVCCSnapshot(scan->xs_snapshot) ||
					 IsolationIsSerializable()) &&
					HeapTupleIsHotUpdated(tup))
				{
					Assert(ItemPointerGetBlockNumber(next_tid) ==
						   ItemPointerGetBlockNumber(tup_tid));
					scan->xs_next_hot = ItemPointerGetOffsetNumber(next_tid);
					scan->xs_prev_xmax = HeapTupleHeaderGetXmax(tup->t_data);
				}
				else
					scan->xs_next_hot = InvalidOffsetNumber;

				PredicateLockTuple(scan->heapRelation, tup);

				LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

				pgstat_count_heap_fetch(scan->indexRelation);

				return tup;
			}

			/*
			 * Invisible to us; see whether it is dead to everybody.  If
			 * every member of the HOT chain turns out dead, the index AM
			 * is told not to return this TID in future index scans.
			 */
			if (scan->xs_hot_dead &&
				HeapTupleSatisfiesVacuum(tup->t_data, RecentGlobalXmin,
										 scan->xs_cbuf) != HEAPTUPLE_DEAD)
				scan->xs_hot_dead = false;

			/* No HOT successor: end of chain */
			if (!HeapTupleIsHotUpdated(tup))
				break;

			/*
			 * Step to the successor.  Its offset is not stored in
			 * xs_next_hot, but xs_prev_xmax has to be recorded.
			 */
			Assert(ItemPointerGetBlockNumber(next_tid) ==
				   ItemPointerGetBlockNumber(tup_tid));
			cur_off = ItemPointerGetOffsetNumber(next_tid);
			chain_head = false;
			scan->xs_prev_xmax = HeapTupleHeaderGetXmax(tup->t_data);
		}						/* end of walk over one HOT chain */

		LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

		/* Go back and request another TID from the index AM */
		scan->xs_next_hot = InvalidOffsetNumber;
	}

	/* Drop whatever heap page pin is still held */
	if (BufferIsValid(scan->xs_cbuf))
	{
		ReleaseBuffer(scan->xs_cbuf);
		scan->xs_cbuf = InvalidBuffer;
	}

	return NULL;				/* no more tuples */
}
/* * Prune specified item pointer or a HOT chain originating at that item. * * If the item is an index-referenced tuple (i.e. not a heap-only tuple), * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really * DEAD, the OldestXmin test is just too coarse to detect it. * * The root line pointer is redirected to the tuple immediately after the * latest DEAD tuple. If all tuples in the chain are DEAD, the root line * pointer is marked LP_DEAD. (This includes the case of a DEAD simple * tuple, which we treat as a chain of length 1.) * * OldestXmin is the cutoff XID used to identify dead tuples. * * We don't actually change the page here, except perhaps for hint-bit updates * caused by HeapTupleSatisfiesVacuum. We just add entries to the arrays in * prstate showing the changes to be made. Items to be redirected are added * to the redirected[] array (two entries per redirection); items to be set to * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED * state are added to nowunused[]. * * If redirect_move is true, we intend to get rid of redirecting line pointers, * not just make redirection entries. * * Returns the number of tuples (to be) deleted from the page. */ static int heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, TransactionId OldestXmin, PruneState *prstate, bool redirect_move) { int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); TransactionId priorXmax = InvalidTransactionId; ItemId rootlp; HeapTupleHeader htup; OffsetNumber latestdead = InvalidOffsetNumber, redirect_target = InvalidOffsetNumber, maxoff = PageGetMaxOffsetNumber(dp), offnum; OffsetNumber chainitems[MaxHeapTuplesPerPage]; int nchain = 0, i; rootlp = PageGetItemId(dp, rootoffnum); /* * If it's a heap-only tuple, then it is not the start of a HOT chain. 
*/ if (ItemIdIsNormal(rootlp)) { htup = (HeapTupleHeader) PageGetItem(dp, rootlp); if (HeapTupleHeaderIsHeapOnly(htup)) { /* * If the tuple is DEAD and doesn't chain to anything else, mark * it unused immediately. (If it does chain, we can only remove * it as part of pruning its chain.) * * We need this primarily to handle aborted HOT updates, that is, * XMIN_INVALID heap-only tuples. Those might not be linked to by * any chain, since the parent tuple might be re-updated before * any pruning occurs. So we have to be able to reap them * separately from chain-pruning. (Note that * HeapTupleHeaderIsHotUpdated will never return true for an * XMIN_INVALID tuple, so this code will work even when there were * sequential updates within the aborted transaction.) * * Note that we might first arrive at a dead heap-only tuple * either here or while following a chain below. Whichever path * gets there first will mark the tuple unused. */ if (HeapTupleSatisfiesVacuum(relation, htup, OldestXmin, buffer) == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); ndeleted++; } /* Nothing more to do */ return ndeleted; } } /* Start from the root tuple */ offnum = rootoffnum; /* while not end of the chain */ for (;;) { ItemId lp; bool tupdead, recent_dead; /* Some sanity checks */ if (offnum < FirstOffsetNumber || offnum > maxoff) break; /* If item is already processed, stop --- it must not be same chain */ if (prstate->marked[offnum]) break; lp = PageGetItemId(dp, offnum); /* Unused item obviously isn't part of the chain */ if (!ItemIdIsUsed(lp)) break; /* * If we are looking at the redirected root line pointer, jump to the * first normal tuple in the chain. If we find a redirect somewhere * else, stop --- it must not be same chain. 
*/ if (ItemIdIsRedirected(lp)) { if (nchain > 0) break; /* not at start of chain */ chainitems[nchain++] = offnum; offnum = ItemIdGetRedirect(rootlp); continue; } /* * Likewise, a dead item pointer can't be part of the chain. (We * already eliminated the case of dead root tuple outside this * function.) */ if (ItemIdIsDead(lp)) break; Assert(ItemIdIsNormal(lp)); htup = (HeapTupleHeader) PageGetItem(dp, lp); /* * Check the tuple XMIN against prior XMAX, if any */ if (TransactionIdIsValid(priorXmax) && !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) break; /* * OK, this tuple is indeed a member of the chain. */ chainitems[nchain++] = offnum; /* * Check tuple's visibility status. */ tupdead = recent_dead = false; switch (HeapTupleSatisfiesVacuum(relation, htup, OldestXmin, buffer)) { case HEAPTUPLE_DEAD: tupdead = true; break; case HEAPTUPLE_RECENTLY_DEAD: recent_dead = true; /* * This tuple may soon become DEAD. Update the hint field so * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, HeapTupleHeaderGetXmax(htup)); break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* * This tuple may soon become DEAD. Update the hint field so * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, HeapTupleHeaderGetXmax(htup)); break; case HEAPTUPLE_LIVE: case HEAPTUPLE_INSERT_IN_PROGRESS: /* * If we wanted to optimize for aborts, we might consider * marking the page prunable when we see INSERT_IN_PROGRESS. * But we don't. See related decisions about when to mark the * page prunable in heapam.c. */ break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } /* * Remember the last DEAD tuple seen. We will advance past * RECENTLY_DEAD tuples just in case there's a DEAD one after them; * but we can't advance past anything else. (XXX is it really worth * continuing to scan beyond RECENTLY_DEAD? The case where we will * find another DEAD tuple is a fairly unusual corner case.) 
*/ if (tupdead) latestdead = offnum; else if (!recent_dead) break; /* * If the tuple is not HOT-updated, then we are at the end of this * HOT-update chain. */ if (!HeapTupleHeaderIsHotUpdated(htup)) break; /* * Advance to next chain member. */ Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetXmax(htup); } /* * If we found a DEAD tuple in the chain, adjust the HOT chain so that all * the DEAD tuples at the start of the chain are removed and the root line * pointer is appropriately redirected. */ if (OffsetNumberIsValid(latestdead)) { /* * Mark as unused each intermediate item that we are able to remove * from the chain. * * When the previous item is the last dead tuple seen, we are at the * right candidate for redirection. */ for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) { heap_prune_record_unused(prstate, chainitems[i]); ndeleted++; } /* * If the root entry had been a normal tuple, we are deleting it, so * count it in the result. But changing a redirect (even to DEAD * state) doesn't count. */ if (ItemIdIsNormal(rootlp)) ndeleted++; /* * If the DEAD tuple is at the end of the chain, the entire chain is * dead and the root line pointer can be marked dead. Otherwise just * redirect the root to the correct chain member. */ if (i >= nchain) heap_prune_record_dead(prstate, rootoffnum); else { heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); /* If the redirection will be a move, need more processing */ if (redirect_move) redirect_target = chainitems[i]; } } else if (nchain < 2 && ItemIdIsRedirected(rootlp)) { /* * We found a redirect item that doesn't point to a valid follow-on * item. This can happen if the loop in heap_page_prune caused us to * visit the dead successor of a redirect item before visiting the * redirect item. We can clean up by setting the redirect item to * DEAD state. 
*/ heap_prune_record_dead(prstate, rootoffnum); } else if (redirect_move && ItemIdIsRedirected(rootlp)) { /* * If we desire to eliminate LP_REDIRECT items by moving tuples, make * a redirection entry for each redirected root item; this will cause * heap_page_prune_execute to actually do the move. (We get here only * when there are no DEAD tuples in the chain; otherwise the * redirection entry was made above.) */ heap_prune_record_redirect(prstate, rootoffnum, chainitems[1]); redirect_target = chainitems[1]; } /* * If we are going to implement a redirect by moving tuples, we have to * issue a cache invalidation against the redirection target tuple, * because its CTID will be effectively changed by the move. Note that * CacheInvalidateHeapTuple only queues the request, it doesn't send it; * if we fail before reaching EndNonTransactionalInvalidation, nothing * happens and no harm is done. */ if (OffsetNumberIsValid(redirect_target)) { ItemId firstlp = PageGetItemId(dp, redirect_target); HeapTupleData firsttup; Assert(ItemIdIsNormal(firstlp)); /* Set up firsttup to reference the tuple at its existing CTID */ firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp); firsttup.t_len = ItemIdGetLength(firstlp); ItemPointerSet(&firsttup.t_self, BufferGetBlockNumber(buffer), redirect_target); CacheInvalidateHeapTuple(relation, &firsttup); } return ndeleted; }
/*
 * FinishPreparedTransaction
 *		Carry out COMMIT PREPARED (isCommit true) or ROLLBACK PREPARED
 *		(isCommit false) for the prepared transaction identified by gid.
 */
void
FinishPreparedTransaction(const char *gid, bool isCommit)
{
	GlobalTransaction gx;
	TransactionId prepXid;
	TransactionId newestXid;
	char	   *statebuf;
	char	   *cursor;
	TwoPhaseFileHeader *header;
	TransactionId *subxids;
	RelFileNode *rels_on_commit;
	RelFileNode *rels_on_abort;
	RelFileNode *droprels;
	int			ndroprels;
	int			relidx;

	/*
	 * Check the GID and lock the GXACT, so that two backends cannot finish
	 * the same GID concurrently.
	 */
	gx = LockGXact(gid, GetUserId());
	prepXid = gx->proc.xid;

	/*
	 * Load the state file and verify it
	 */
	statebuf = ReadTwoPhaseFile(prepXid);
	if (statebuf == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("two-phase state file for transaction %u is corrupt",
						prepXid)));

	/*
	 * Take the header area apart: the header is followed, each at a
	 * MAXALIGN'd offset, by the subtransaction XIDs, the commit-time rels
	 * and the abort-time rels.
	 */
	header = (TwoPhaseFileHeader *) statebuf;
	Assert(TransactionIdEquals(header->xid, prepXid));

	cursor = statebuf + MAXALIGN(sizeof(TwoPhaseFileHeader));

	subxids = (TransactionId *) cursor;
	cursor += MAXALIGN(header->nsubxacts * sizeof(TransactionId));

	rels_on_commit = (RelFileNode *) cursor;
	cursor += MAXALIGN(header->ncommitrels * sizeof(RelFileNode));

	rels_on_abort = (RelFileNode *) cursor;
	cursor += MAXALIGN(header->nabortrels * sizeof(RelFileNode));

	/* newest XID among the top-level xact and all its children */
	newestXid = TransactionIdLatest(prepXid, header->nsubxacts, subxids);

	/*
	 * The sequence of steps below is critical.  First the XLOG entry for
	 * commit or abort is made, then the transaction is marked committed or
	 * aborted in pg_clog, then its PGPROC is taken out of the global
	 * ProcArray (after which TransactionIdIsInProgress no longer reports
	 * the prepared xact as in progress), and finally the post-commit or
	 * post-abort callbacks run.  Those callbacks release the locks the
	 * transaction held.
	 */
	if (!isCommit)
		RecordTransactionAbortPrepared(prepXid,
									   header->nsubxacts, subxids,
									   header->nabortrels, rels_on_abort);
	else
		RecordTransactionCommitPrepared(prepXid,
										header->nsubxacts, subxids,
										header->ncommitrels, rels_on_commit);

	ProcArrayRemove(&gx->proc, newestXid);

	/*
	 * Should the callbacks fail, the gxact must already be marked invalid
	 * so that nobody else attempts to commit or roll it back and so that
	 * it can be recycled properly later.  Our XID still locks it, so it
	 * will not vanish yet.
	 *
	 * (Doing this without TwoPhaseStateLock is assumed to be safe.)
	 */
	gx->valid = false;

	/*
	 * Files scheduled for dropping have to be removed.  To stay consistent
	 * with the regular xact.c code paths this must precede lock release,
	 * hence it is done ahead of the callbacks.
	 *
	 * NB: this code knows that we couldn't be dropping any temp rels ...
	 */
	droprels = isCommit ? rels_on_commit : rels_on_abort;
	ndroprels = isCommit ? header->ncommitrels : header->nabortrels;

	for (relidx = 0; relidx < ndroprels; relidx++)
	{
		SMgrRelation reln = smgropen(droprels[relidx]);
		ForkNumber	forknum;

		/* Unlink every fork of the relation that exists */
		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
		{
			if (!smgrexists(reln, forknum))
				continue;
			smgrdounlink(reln, forknum, false, false);
		}
		smgrclose(reln);
	}

	/* Run the callbacks; cursor now points just past the header area */
	ProcessRecords(cursor, prepXid,
				   isCommit ? twophase_postcommit_callbacks
				   : twophase_postabort_callbacks);

	/* Account for the prepared xact as committed or aborted */
	AtEOXact_PgStat(isCommit);

	/*
	 * Finally, tidy up after ourselves.
	 */
	RemoveTwoPhaseFile(prepXid, true);
	RemoveGXact(gx);

	pfree(statebuf);
}
/*
 * XidInMVCCSnapshot
 *		Report whether the snapshot regards the given XID as still running.
 *
 * Note: GetSnapshotData never puts our own backend's top xid or subxids
 * into a snapshot, so this function will not call those xids "running".
 * Current callers are fine with that, since the function is only applied
 * to XIDs already known to have committed.
 */
static bool
XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
{
	uint32		topidx;

	/*
	 * A cheap range test settles most XIDs without touching the xip arrays.
	 * It remains valid even when a subxact XID is mapped to its parent
	 * further down: a subxact with XID < xmin certainly has a parent with
	 * XID < xmin, and one with XID >= xmax must belong to a parent that had
	 * not committed when the snapshot was taken.
	 */

	/* Below xmin: not in progress */
	if (TransactionIdPrecedes(xid, snapshot->xmin))
		return false;
	/* At or beyond xmax: in progress */
	if (TransactionIdFollowsOrEquals(xid, snapshot->xmax))
		return true;

	/*
	 * When the snapshot overflowed (negative subxcnt), pg_subtrans is used
	 * to map a subxact XID to its parent, after which only top-level XIDs
	 * need checking.  Otherwise the snapshot holds complete subxact data
	 * and the quickest approach is to compare the XID against the subxact
	 * XIDs as well as the top-level XIDs.
	 */
	if (snapshot->subxcnt < 0)
	{
		/* overflowed: switch to the topmost transaction's XID */
		xid = SubTransGetTopmostTransaction(xid);

		/*
		 * If the XID really was a subxact, the parent may now lie below
		 * xmin; retest that to skip the array scan.  Retesting xmax would
		 * be pointless.
		 */
		if (TransactionIdPrecedes(xid, snapshot->xmin))
			return false;
	}
	else
	{
		/* complete data: look through subxip first */
		int32		subidx = 0;

		while (subidx < snapshot->subxcnt)
		{
			if (TransactionIdEquals(xid, snapshot->subxip[subidx]))
				return true;
			subidx++;
		}
		/* no match there; continue with the xip[] scan below */
	}

	for (topidx = 0; topidx < snapshot->xcnt; topidx++)
	{
		if (TransactionIdEquals(xid, snapshot->xip[topidx]))
			return true;
	}

	return false;
}
/*
 * HeapTupleSatisfiesVacuum
 *
 *	Determine the status of tuples for VACUUM purposes.  Here, what
 *	we mainly want to know is if a tuple is potentially visible to *any*
 *	running transaction.  If so, it can't be removed yet by VACUUM.
 *
 * OldestXmin is a cutoff XID (obtained from GetOldestXmin()).  Tuples
 * deleted by XIDs >= OldestXmin are deemed "recently dead"; they might
 * still be visible to some open transaction, so we can't remove them,
 * even if we see that the deleting transaction has committed.
 *
 * Parameters:
 *	tuple		- header of the tuple to classify.  Its t_infomask is
 *				  re-read at each decision point, and hint bits are
 *				  recorded through SetHintBits as transaction outcomes
 *				  are learned.
 *	OldestXmin	- cutoff XID; only compared against the tuple's xmax in
 *				  the final "recently dead" test.
 *	buffer		- only forwarded to SetHintBits by this function
 *				  (presumably the buffer containing the tuple -- confirm
 *				  against callers).
 *
 * Returns:
 *	HEAPTUPLE_DEAD					inserter did not commit, or deleter
 *									committed and precedes OldestXmin (or
 *									the tuple was inserted and deleted by
 *									the same XID and is not HEAP_UPDATED)
 *	HEAPTUPLE_LIVE					inserter committed and there is no
 *									committed deleter
 *	HEAPTUPLE_RECENTLY_DEAD			deleter committed but does not precede
 *									OldestXmin
 *	HEAPTUPLE_INSERT_IN_PROGRESS	inserting (or moving-in) xact is still
 *									running
 *	HEAPTUPLE_DELETE_IN_PROGRESS	deleting (or moving-off) xact is still
 *									running
 */
HTSV_Result
HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin,
						 Buffer buffer)
{
	/*
	 * Has inserting transaction committed?
	 *
	 * If the inserting transaction aborted, then the tuple was never visible
	 * to any other transaction, so we can delete it immediately.
	 */
	if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED))
	{
		if (tuple->t_infomask & HEAP_XMIN_INVALID)
			return HEAPTUPLE_DEAD;
		else if (tuple->t_infomask & HEAP_MOVED_OFF)
		{
			/*
			 * Tuple is flagged as moved off by the transaction whose XID is
			 * stored in xvac.  If that transaction committed, this copy is
			 * treated as never inserted; otherwise it counts as a validly
			 * inserted tuple.
			 */
			TransactionId xvac = HeapTupleHeaderGetXvac(tuple);

			if (TransactionIdIsCurrentTransactionId(xvac))
				return HEAPTUPLE_DELETE_IN_PROGRESS;
			if (TransactionIdIsInProgress(xvac))
				return HEAPTUPLE_DELETE_IN_PROGRESS;
			if (TransactionIdDidCommit(xvac))
			{
				SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
							InvalidTransactionId);
				return HEAPTUPLE_DEAD;
			}
			/* xvac neither running nor committed: keep the tuple */
			SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
						InvalidTransactionId);
		}
		else if (tuple->t_infomask & HEAP_MOVED_IN)
		{
			/*
			 * Mirror image of the MOVED_OFF case: the tuple is good only if
			 * the moving transaction (xvac) committed.
			 */
			TransactionId xvac = HeapTupleHeaderGetXvac(tuple);

			if (TransactionIdIsCurrentTransactionId(xvac))
				return HEAPTUPLE_INSERT_IN_PROGRESS;
			if (TransactionIdIsInProgress(xvac))
				return HEAPTUPLE_INSERT_IN_PROGRESS;
			if (TransactionIdDidCommit(xvac))
				SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
							InvalidTransactionId);
			else
			{
				SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
							InvalidTransactionId);
				return HEAPTUPLE_DEAD;
			}
		}
		else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
		{
			/* Inserter still running; distinguish insert from delete */
			if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */
				return HEAPTUPLE_INSERT_IN_PROGRESS;
			if (tuple->t_infomask & HEAP_IS_LOCKED)
				return HEAPTUPLE_INSERT_IN_PROGRESS;
			/* inserted and then deleted by same xact */
			return HEAPTUPLE_DELETE_IN_PROGRESS;
		}
		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
			SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED,
						HeapTupleHeaderGetXmin(tuple));
		else
		{
			/*
			 * Not in Progress, Not Committed, so either Aborted or crashed
			 */
			SetHintBits(tuple, buffer, HEAP_XMIN_INVALID,
						InvalidTransactionId);
			return HEAPTUPLE_DEAD;
		}

		/*
		 * At this point the xmin is known committed, but we might not have
		 * been able to set the hint bit yet; so we can no longer Assert that
		 * it's set.
		 */
	}

	/*
	 * Okay, the inserter committed, so it was good at some point.  Now what
	 * about the deleting transaction?
	 */
	if (tuple->t_infomask & HEAP_XMAX_INVALID)
		return HEAPTUPLE_LIVE;

	if (tuple->t_infomask & HEAP_IS_LOCKED)
	{
		/*
		 * "Deleting" xact really only locked it, so the tuple is live in any
		 * case.  However, we should make sure that either XMAX_COMMITTED or
		 * XMAX_INVALID gets set once the xact is gone, to reduce the costs of
		 * examining the tuple for future xacts.  Also, marking dead
		 * MultiXacts as invalid here provides defense against MultiXactId
		 * wraparound (see also comments in heap_freeze_tuple()).
		 */
		if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
		{
			/* xmax is either a MultiXactId or a plain XID; test accordingly */
			if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
			{
				if (MultiXactIdIsRunning(HeapTupleHeaderGetXmax(tuple)))
					return HEAPTUPLE_LIVE;
			}
			else
			{
				if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
					return HEAPTUPLE_LIVE;
			}

			/*
			 * We don't really care whether xmax did commit, abort or crash.
			 * We know that xmax did lock the tuple, but it did not and will
			 * never actually update it.
			 */
			SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
						InvalidTransactionId);
		}
		return HEAPTUPLE_LIVE;
	}

	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
	{
		/*
		 * MultiXacts are currently only allowed to lock tuples.
		 *
		 * NOTE(review): the HEAP_IS_LOCKED test just above has already
		 * returned when that bit was set, so reaching this Assert means the
		 * bit was clear at that test; it therefore acts as a trap for an
		 * unexpected flag combination.  When the Assert does not fire, the
		 * tuple is simply reported as live.
		 */
		Assert(tuple->t_infomask & HEAP_IS_LOCKED);
		return HEAPTUPLE_LIVE;
	}

	if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
	{
		if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
			return HEAPTUPLE_DELETE_IN_PROGRESS;
		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
			SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
						HeapTupleHeaderGetXmax(tuple));
		else
		{
			/*
			 * Not in Progress, Not Committed, so either Aborted or crashed
			 */
			SetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
						InvalidTransactionId);
			return HEAPTUPLE_LIVE;
		}

		/*
		 * At this point the xmax is known committed, but we might not have
		 * been able to set the hint bit yet; so we can no longer Assert that
		 * it's set.
		 */
	}

	/*
	 * Deleter committed, but check special cases.
	 */

	if (TransactionIdEquals(HeapTupleHeaderGetXmin(tuple),
							HeapTupleHeaderGetXmax(tuple)))
	{
		/*
		 * Inserter also deleted it, so it was never visible to anyone else.
		 * However, we can only remove it early if it's not an updated tuple;
		 * else its parent tuple is linking to it via t_ctid, and this tuple
		 * mustn't go away before the parent does.
		 */
		if (!(tuple->t_infomask & HEAP_UPDATED))
			return HEAPTUPLE_DEAD;
	}

	if (!TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin))
	{
		/* deleting xact is too recent, tuple could still be visible */
		return HEAPTUPLE_RECENTLY_DEAD;
	}

	/* Otherwise, it's dead and removable */
	return HEAPTUPLE_DEAD;
}
/*
 * HeapTupleSatisfiesSnapshot
 *		True iff heap tuple is valid for the given snapshot.
 *
 *	Here, we consider the effects of:
 *		all transactions committed as of the time of the given snapshot
 *		previous commands of this transaction
 *
 *	Does _not_ include:
 *		transactions shown as in-progress by the snapshot
 *		transactions started after the snapshot was taken
 *		changes made by the current command
 *
 * This is the same as HeapTupleSatisfiesNow, except that transactions that
 * were in progress or as yet unstarted when the snapshot was taken will
 * be treated as uncommitted, even if they have committed by now.
 *
 * (Notice, however, that the tuple status hint bits will be updated on the
 * basis of the true state of the transaction, even if we then pretend we
 * can't see it.)
 *
 * Parameters:
 *	tuple		- header of the tuple to test.  Hint bits are OR'ed directly
 *				  into tuple->t_infomask by this function.
 *	snapshot	- snapshot to test against; the fields read here are curcid,
 *				  xmin, xmax, xcnt and xip[].
 *
 * Returns true if the tuple is visible under the snapshot, false otherwise.
 */
bool
HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot)
{
	/* Step 1: settle the fate of the inserting transaction (xmin) */
	if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED))
	{
		if (tuple->t_infomask & HEAP_XMIN_INVALID)
			return false;

		if (tuple->t_infomask & HEAP_MOVED_OFF)
		{
			/*
			 * Tuple is flagged as moved off by transaction xvac.  Note that
			 * when xvac belongs to another, still-running transaction no
			 * hint bit is set and control falls through to the snapshot
			 * checks below.
			 */
			TransactionId xvac = HeapTupleHeaderGetXvac(tuple);

			if (TransactionIdIsCurrentTransactionId(xvac))
				return false;
			if (!TransactionIdIsInProgress(xvac))
			{
				if (TransactionIdDidCommit(xvac))
				{
					tuple->t_infomask |= HEAP_XMIN_INVALID;
					return false;
				}
				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
			}
		}
		else if (tuple->t_infomask & HEAP_MOVED_IN)
		{
			/*
			 * Tuple is flagged as moved in by transaction xvac.  If xvac is
			 * our own transaction nothing is decided here and control falls
			 * through to the snapshot checks below.
			 */
			TransactionId xvac = HeapTupleHeaderGetXvac(tuple);

			if (!TransactionIdIsCurrentTransactionId(xvac))
			{
				if (TransactionIdIsInProgress(xvac))
					return false;
				if (TransactionIdDidCommit(xvac))
					tuple->t_infomask |= HEAP_XMIN_COMMITTED;
				else
				{
					tuple->t_infomask |= HEAP_XMIN_INVALID;
					return false;
				}
			}
		}
		else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple)))
		{
			/*
			 * Inserted by our own transaction: visibility is decided purely
			 * by comparing command IDs against the snapshot's curcid.
			 */
			if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid)
				return false;	/* inserted after scan started */

			if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */
				return true;

			/* a valid xmax on our own uncommitted tuple must be ours too */
			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));

			if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
				return true;

			if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid)
				return true;	/* deleted after scan started */
			else
				return false;	/* deleted before scan started */
		}
		else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
			return false;
		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
		else
		{
			/* it must have aborted or crashed */
			tuple->t_infomask |= HEAP_XMIN_INVALID;
			return false;
		}
	}

	/*
	 * By here, the inserting transaction has committed - have to check
	 * when...
	 *
	 * An xmin at or beyond snapshot->xmax, or one listed in snapshot->xip[],
	 * is treated as not yet committed from the snapshot's point of view.
	 */
	if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmin(tuple),
									 snapshot->xmin))
	{
		uint32 i;

		if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmin(tuple),
										 snapshot->xmax))
			return false;

		for (i = 0; i < snapshot->xcnt; i++)
		{
			if (TransactionIdEquals(HeapTupleHeaderGetXmin(tuple),
									snapshot->xip[i]))
				return false;
		}
	}

	/* Step 2: the insert is visible; now look at the deleter (xmax) */
	if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid or aborted */
		return true;

	/* xmax only marked the tuple for update; it does not hide the tuple */
	if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
		return true;

	if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
	{
		if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
		{
			if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid)
				return true;	/* deleted after scan started */
			else
				return false;	/* deleted before scan started */
		}

		if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
			return true;

		if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
		{
			/* it must have aborted or crashed */
			tuple->t_infomask |= HEAP_XMAX_INVALID;
			return true;
		}

		/* xmax transaction committed */
		tuple->t_infomask |= HEAP_XMAX_COMMITTED;
	}

	/*
	 * OK, the deleting transaction committed too ... but when?
	 *
	 * Same test as for xmin, with the opposite outcome: a deleter the
	 * snapshot regards as uncommitted leaves the tuple visible.
	 */
	if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmax(tuple),
									 snapshot->xmin))
	{
		uint32 i;

		if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmax(tuple),
										 snapshot->xmax))
			return true;

		for (i = 0; i < snapshot->xcnt; i++)
		{
			if (TransactionIdEquals(HeapTupleHeaderGetXmax(tuple),
									snapshot->xip[i]))
				return true;
		}
	}

	/* deleter committed and is visible to the snapshot: tuple is gone */
	return false;
}
/* * RecoverPreparedTransactions * * Scan the pg_twophase directory and reload shared-memory state for each * prepared transaction (reacquire locks, etc). This is run during database * startup. */ void RecoverPreparedTransactions(void) { char dir[MAXPGPATH]; DIR *cldir; struct dirent *clde; snprintf(dir, MAXPGPATH, "%s", TWOPHASE_DIR); cldir = AllocateDir(dir); while ((clde = ReadDir(cldir, dir)) != NULL) { if (strlen(clde->d_name) == 8 && strspn(clde->d_name, "0123456789ABCDEF") == 8) { TransactionId xid; char *buf; char *bufptr; TwoPhaseFileHeader *hdr; TransactionId *subxids; GlobalTransaction gxact; int i; xid = (TransactionId) strtoul(clde->d_name, NULL, 16); /* Already processed? */ if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) { ereport(WARNING, (errmsg("removing stale two-phase state file \"%s\"", clde->d_name))); RemoveTwoPhaseFile(xid, true); continue; } /* Read and validate file */ buf = ReadTwoPhaseFile(xid); if (buf == NULL) { ereport(WARNING, (errmsg("removing corrupt two-phase state file \"%s\"", clde->d_name))); RemoveTwoPhaseFile(xid, true); continue; } ereport(LOG, (errmsg("recovering prepared transaction %u", xid))); /* Deconstruct header */ hdr = (TwoPhaseFileHeader *) buf; Assert(TransactionIdEquals(hdr->xid, xid)); bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader)); subxids = (TransactionId *) bufptr; bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId)); bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode)); bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode)); /* * Reconstruct subtrans state for the transaction --- needed * because pg_subtrans is not preserved over a restart. Note that * we are linking all the subtransactions directly to the * top-level XID; there may originally have been a more complex * hierarchy, but there's no need to restore that exactly. 
*/ for (i = 0; i < hdr->nsubxacts; i++) SubTransSetParent(subxids[i], xid); /* * Recreate its GXACT and dummy PGPROC * * Note: since we don't have the PREPARE record's WAL location at * hand, we leave prepare_lsn zeroes. This means the GXACT will * be fsync'd on every future checkpoint. We assume this * situation is infrequent enough that the performance cost is * negligible (especially since we know the state file has already * been fsynced). */ gxact = MarkAsPreparing(xid, hdr->gid, hdr->prepared_at, hdr->owner, hdr->database); GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids); MarkAsPrepared(gxact); /* * Recover other state (notably locks) using resource managers */ ProcessRecords(bufptr, xid, twophase_recover_callbacks); pfree(buf); } } FreeDir(cldir); }
/*
 * HeapTupleSatisfiesVacuum
 *
 *	Determine the status of tuples for VACUUM purposes.  Here, what
 *	we mainly want to know is if a tuple is potentially visible to *any*
 *	running transaction.  If so, it can't be removed yet by VACUUM.
 *
 * OldestXmin is a cutoff XID (obtained from GetOldestXmin()).  Tuples
 * deleted by XIDs >= OldestXmin are deemed "recently dead"; they might
 * still be visible to some open transaction, so we can't remove them,
 * even if we see that the deleting transaction has committed.
 *
 * Parameters:
 *	tuple		- header of the tuple to classify.  Hint bits are OR'ed
 *				  directly into tuple->t_infomask as transaction outcomes
 *				  are learned.
 *	OldestXmin	- cutoff XID; only compared against the tuple's xmax in
 *				  the final "recently dead" test.
 *
 * Returns:
 *	HEAPTUPLE_DEAD					inserter did not commit, or deleter
 *									committed and precedes OldestXmin (or
 *									the tuple was inserted and deleted by
 *									the same XID and is not HEAP_UPDATED)
 *	HEAPTUPLE_LIVE					inserter committed and there is no
 *									committed deleter
 *	HEAPTUPLE_RECENTLY_DEAD			deleter committed but does not precede
 *									OldestXmin
 *	HEAPTUPLE_INSERT_IN_PROGRESS	inserting (or moving-in) xact is still
 *									running
 *	HEAPTUPLE_DELETE_IN_PROGRESS	deleting (or moving-off) xact is still
 *									running
 */
HTSV_Result
HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin)
{
	/*
	 * Has inserting transaction committed?
	 *
	 * If the inserting transaction aborted, then the tuple was never visible
	 * to any other transaction, so we can delete it immediately.
	 */
	if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED))
	{
		if (tuple->t_infomask & HEAP_XMIN_INVALID)
			return HEAPTUPLE_DEAD;
		else if (tuple->t_infomask & HEAP_MOVED_OFF)
		{
			/*
			 * Tuple is flagged as moved off by the transaction whose XID is
			 * stored in xvac.  If that transaction committed, this copy is
			 * treated as never inserted; otherwise it counts as a validly
			 * inserted tuple.
			 */
			TransactionId xvac = HeapTupleHeaderGetXvac(tuple);

			if (TransactionIdIsCurrentTransactionId(xvac))
				return HEAPTUPLE_DELETE_IN_PROGRESS;
			if (TransactionIdIsInProgress(xvac))
				return HEAPTUPLE_DELETE_IN_PROGRESS;
			if (TransactionIdDidCommit(xvac))
			{
				tuple->t_infomask |= HEAP_XMIN_INVALID;
				return HEAPTUPLE_DEAD;
			}
			/* xvac neither running nor committed: keep the tuple */
			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
		}
		else if (tuple->t_infomask & HEAP_MOVED_IN)
		{
			/*
			 * Mirror image of the MOVED_OFF case: the tuple is good only if
			 * the moving transaction (xvac) committed.
			 */
			TransactionId xvac = HeapTupleHeaderGetXvac(tuple);

			if (TransactionIdIsCurrentTransactionId(xvac))
				return HEAPTUPLE_INSERT_IN_PROGRESS;
			if (TransactionIdIsInProgress(xvac))
				return HEAPTUPLE_INSERT_IN_PROGRESS;
			if (TransactionIdDidCommit(xvac))
				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
			else
			{
				tuple->t_infomask |= HEAP_XMIN_INVALID;
				return HEAPTUPLE_DEAD;
			}
		}
		else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
		{
			/* Inserter still running; distinguish insert from delete */
			if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */
				return HEAPTUPLE_INSERT_IN_PROGRESS;
			/* a valid xmax on a not-yet-committed tuple must be the inserter */
			Assert(HeapTupleHeaderGetXmin(tuple) ==
				   HeapTupleHeaderGetXmax(tuple));
			if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
				return HEAPTUPLE_INSERT_IN_PROGRESS;
			/* inserted and then deleted by same xact */
			return HEAPTUPLE_DELETE_IN_PROGRESS;
		}
		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
		else
		{
			/*
			 * Not in Progress, Not Committed, so either Aborted or
			 * crashed
			 */
			tuple->t_infomask |= HEAP_XMIN_INVALID;
			return HEAPTUPLE_DEAD;
		}
		/* Should only get here if we set XMIN_COMMITTED */
		Assert(tuple->t_infomask & HEAP_XMIN_COMMITTED);
	}

	/*
	 * Okay, the inserter committed, so it was good at some point.  Now
	 * what about the deleting transaction?
	 */
	if (tuple->t_infomask & HEAP_XMAX_INVALID)
		return HEAPTUPLE_LIVE;

	if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
	{
		/*
		 * "Deleting" xact really only marked it for update, so the tuple
		 * is live in any case.  However, we must make sure that either
		 * XMAX_COMMITTED or XMAX_INVALID gets set once the xact is gone;
		 * otherwise it is unsafe to recycle CLOG status after vacuuming.
		 */
		if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
		{
			if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
				return HEAPTUPLE_LIVE;

			/*
			 * We don't really care whether xmax did commit, abort or
			 * crash. We know that xmax did mark the tuple for update, but
			 * it did not and will never actually update it.
			 */
			tuple->t_infomask |= HEAP_XMAX_INVALID;
		}
		return HEAPTUPLE_LIVE;
	}

	if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
	{
		if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
			return HEAPTUPLE_DELETE_IN_PROGRESS;
		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
			tuple->t_infomask |= HEAP_XMAX_COMMITTED;
		else
		{
			/*
			 * Not in Progress, Not Committed, so either Aborted or
			 * crashed
			 */
			tuple->t_infomask |= HEAP_XMAX_INVALID;
			return HEAPTUPLE_LIVE;
		}
		/* Should only get here if we set XMAX_COMMITTED */
		Assert(tuple->t_infomask & HEAP_XMAX_COMMITTED);
	}

	/*
	 * Deleter committed, but check special cases.
	 */

	if (TransactionIdEquals(HeapTupleHeaderGetXmin(tuple),
							HeapTupleHeaderGetXmax(tuple)))
	{
		/*
		 * Inserter also deleted it, so it was never visible to anyone
		 * else.  However, we can only remove it early if it's not an
		 * updated tuple; else its parent tuple is linking to it via t_ctid,
		 * and this tuple mustn't go away before the parent does.
		 */
		if (!(tuple->t_infomask & HEAP_UPDATED))
			return HEAPTUPLE_DEAD;
	}

	if (!TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin))
	{
		/* deleting xact is too recent, tuple could still be visible */
		return HEAPTUPLE_RECENTLY_DEAD;
	}

	/* Otherwise, it's dead and removable */
	return HEAPTUPLE_DEAD;
}