/*
 * SubTransGetParent
 *
 * Look up the parent XID stored for "xid" in the subtrans log.
 *
 * Returns InvalidTransactionId for XIDs that are not normal (bootstrap and
 * frozen XIDs have no parent); otherwise returns whatever value the
 * subtrans page holds for this XID.
 */
TransactionId
SubTransGetParent(TransactionId xid)
{
	TransactionId *page;
	TransactionId result;
	int			slot;

	/* Can't ask about stuff that might not be around anymore */
	Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));

	/* Bootstrap and frozen XIDs have no parent */
	if (!TransactionIdIsNormal(xid))
		return InvalidTransactionId;

	/*
	 * SimpleLruReadPage_ReadOnly acquires the control lock for us; we are
	 * responsible for releasing it once the entry has been copied out.
	 */
	slot = SimpleLruReadPage_ReadOnly(SubTransCtl, TransactionIdToPage(xid), xid);
	page = (TransactionId *) SubTransCtl->shared->page_buffer[slot];
	result = page[TransactionIdToEntry(xid)];

	LWLockRelease(SubtransControlLock);

	return result;
}
/*
 * SubTransGetTopmostTransaction
 *
 * Follow the pg_subtrans parent links upward from "xid" and return the last
 * valid XID reached.
 *
 * The walk stops as soon as it reaches an XID older than TransactionXmin,
 * because nothing before that horizon can be looked up.  The result may
 * therefore be an intermediate subtransaction ID rather than the true
 * topmost parent.  That is acceptable: in practice callers only care whether
 * the topmost parent is still running or is listed in a snapshot, and for
 * that purpose any XID before TransactionXmin is as good as any other.
 *
 * Raises ERROR if a parent link does not precede its child, since such an
 * entry indicates corruption and could otherwise make the walk loop forever.
 */
TransactionId
SubTransGetTopmostTransaction(TransactionId xid)
{
	TransactionId cursor = xid;
	TransactionId lastSeen = xid;

	/* Can't ask about stuff that might not be around anymore */
	Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));

	for (;;)
	{
		/* Ran off the top of the chain: lastSeen is the answer */
		if (!TransactionIdIsValid(cursor))
			break;

		lastSeen = cursor;

		/* Cannot look any further back than TransactionXmin */
		if (TransactionIdPrecedes(cursor, TransactionXmin))
			break;

		cursor = SubTransGetParent(cursor);

		/*
		 * By convention the parent xid gets allocated first, so it should
		 * always precede the child xid.  Anything else points to a corrupted
		 * data structure that could lead to an infinite loop, so bail out.
		 */
		if (!TransactionIdPrecedes(cursor, lastSeen))
			elog(ERROR, "pg_subtrans contains invalid entry: xid %u points to parent xid %u",
				 lastSeen, cursor);
	}

	Assert(TransactionIdIsValid(lastSeen));

	return lastSeen;
}
/* * Get new XID. For global transaction is it previsly set by dtm_begin_transaction or dtm_join_transaction. * Local transactions are using range of local Xids obtains from DTM. */ static TransactionId DtmGetNextXid() { TransactionId xid; LWLockAcquire(dtm->xidLock, LW_EXCLUSIVE); if (TransactionIdIsValid(DtmNextXid)) { XTM_INFO("Use global XID %d\n", DtmNextXid); xid = DtmNextXid; if (TransactionIdPrecedesOrEquals(ShmemVariableCache->nextXid, xid)) { /* Advance ShmemVariableCache->nextXid formward until new Xid */ while (TransactionIdPrecedes(ShmemVariableCache->nextXid, xid)) { XTM_INFO("Extend CLOG for global transaction to %d\n", ShmemVariableCache->nextXid); ExtendCLOG(ShmemVariableCache->nextXid); ExtendCommitTs(ShmemVariableCache->nextXid); ExtendSUBTRANS(ShmemVariableCache->nextXid); TransactionIdAdvance(ShmemVariableCache->nextXid); } dtm->nReservedXids = 0; } } else { if (dtm->nReservedXids == 0) { XTM_INFO("%d: reserve new XID range\n", getpid()); dtm->nReservedXids = ArbiterReserve(ShmemVariableCache->nextXid, DtmLocalXidReserve, &dtm->nextXid); Assert(dtm->nReservedXids > 0); Assert(TransactionIdFollowsOrEquals(dtm->nextXid, ShmemVariableCache->nextXid)); /* Advance ShmemVariableCache->nextXid formward until new Xid */ while (TransactionIdPrecedes(ShmemVariableCache->nextXid, dtm->nextXid)) { XTM_INFO("Extend CLOG for local transaction to %d\n", ShmemVariableCache->nextXid); ExtendCLOG(ShmemVariableCache->nextXid); ExtendCommitTs(ShmemVariableCache->nextXid); ExtendSUBTRANS(ShmemVariableCache->nextXid); TransactionIdAdvance(ShmemVariableCache->nextXid); } } Assert(ShmemVariableCache->nextXid == dtm->nextXid); xid = dtm->nextXid++; dtm->nReservedXids -= 1; XTM_INFO("Obtain new local XID %d\n", xid); } LWLockRelease(dtm->xidLock); return xid; }
/*
 * SubTransGetData
 *
 * Copy the subtrans log entry for "xid" (its parent and its topmost parent)
 * into *subData.
 *
 * For XIDs that are not normal (bootstrap and frozen), no page is read: the
 * parent is reported as InvalidTransactionId and the XID is reported as its
 * own topmost parent.  Likewise, when the stored topmost parent is
 * InvalidTransactionId, "xid" itself is reported as the topmost parent.
 */
static void
SubTransGetData(TransactionId xid, SubTransData* subData)
{
	MIRRORED_LOCK_DECLARE;

	SubTransData *entry;
	TransactionId storedTop;
	int			slot;

	/* Can't ask about stuff that might not be around anymore */
	Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));

	/* Bootstrap and frozen XIDs have no parent and are their own topmost */
	if (!TransactionIdIsNormal(xid))
	{
		subData->parent = InvalidTransactionId;
		subData->topMostParent = xid;
		return;
	}

	MIRRORED_LOCK;

	/* SimpleLruReadPage_ReadOnly acquires the control lock for us */
	slot = SimpleLruReadPage_ReadOnly(SubTransCtl, TransactionIdToPage(xid), xid, NULL);
	entry = ((SubTransData *) SubTransCtl->shared->page_buffer[slot]) + TransactionIdToEntry(xid);

	subData->parent = entry->parent;
	storedTop = entry->topMostParent;

	/*
	 * No topmost parent recorded means the parent is the main XID, so the
	 * XID is reported as its own topmost parent.
	 */
	if (storedTop == InvalidTransactionId)
		subData->topMostParent = xid;
	else
		subData->topMostParent = storedTop;

	LWLockRelease(SubtransControlLock);

	MIRRORED_UNLOCK;
}
/*
 * SubTransGetTopmostTransaction
 *
 * Returns the topmost transaction of the given transaction id.
 *
 * Because we cannot look back further than TransactionXmin, it is possible
 * that this function will lie and return an intermediate subtransaction ID
 * instead of the true topmost parent ID.  This is OK, because in practice
 * we only care about detecting whether the topmost parent is still running
 * or is part of a current snapshot's list of still-running transactions.
 * Therefore, any XID before TransactionXmin is as good as any other.
 *
 * Raises ERROR if a pg_subtrans entry names a parent that does not precede
 * its child; without that check a corrupted entry (for example a cycle, or
 * an XID listed as its own parent) would make the loop below spin forever.
 */
TransactionId
SubTransGetTopmostTransaction(TransactionId xid)
{
	TransactionId parentXid = xid,
				previousXid = xid;

	/* Can't ask about stuff that might not be around anymore */
	Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));

	while (TransactionIdIsValid(parentXid))
	{
		previousXid = parentXid;

		/* Cannot look any further back than TransactionXmin */
		if (TransactionIdPrecedes(parentXid, TransactionXmin))
			break;

		parentXid = SubTransGetParent(parentXid);

		/*
		 * By convention the parent xid gets allocated first, so it should
		 * always precede the child xid.  Anything else points to a corrupted
		 * data structure that could lead to an infinite loop, so exit.
		 */
		if (!TransactionIdPrecedes(parentXid, previousXid))
			elog(ERROR, "pg_subtrans contains invalid entry: xid %u points to parent xid %u",
				 previousXid, parentXid);
	}

	Assert(TransactionIdIsValid(previousXid));

	return previousXid;
}
/*
 * HeapTupleSatisfiesHistoricMVCC
 *
 * See the comments for HeapTupleSatisfiesMVCC for the semantics this function
 * obeys.
 *
 * Only usable on tuples from catalog tables!
 *
 * Arguments:
 *	htup	 - the tuple to test; its t_self and t_tableOid must be valid.
 *	snapshot - the snapshot to test against.  As used below, subxip[] is
 *			   searched for "our own" txids (the toplevel xid is in there
 *			   too) and xip[] is searched for transactions in [xmin, xmax)
 *			   that are treated as committed.
 *	buffer	 - passed through to ResolveCminCmaxDuringDecoding.
 *
 * Returns true if the tuple is visible, false otherwise.  Raises ERROR if the
 * cmin/cmax of one of our own tuples cannot be resolved.
 *
 * We don't need to support HEAP_MOVED_(IN|OFF) for now because we only support
 * reading catalog pages which couldn't have been created in an older version.
 *
 * We don't set any hint bits in here as it seems unlikely to be beneficial as
 * those should already be set by normal access and it seems to be too
 * dangerous to do so as the semantics of doing so during timetravel are more
 * complicated than when dealing "only" with the present.
 */
bool
HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot,
							   Buffer buffer)
{
	HeapTupleHeader tuple = htup->t_data;
	TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
	TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple);

	Assert(ItemPointerIsValid(&htup->t_self));
	Assert(htup->t_tableOid != InvalidOid);

	/*
	 * Step 1: decide whether the inserting transaction (xmin) is visible.
	 * Every branch either returns false or falls through to step 2.
	 */

	/* inserting transaction aborted */
	if (HeapTupleHeaderXminInvalid(tuple))
	{
		Assert(!TransactionIdDidCommit(xmin));
		return false;
	}
	/* check if it's one of our txids, toplevel is also in there */
	else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt))
	{
		bool		resolved;
		CommandId	cmin = HeapTupleHeaderGetRawCommandId(tuple);
		CommandId	cmax = InvalidCommandId;

		/*
		 * another transaction might have (tried to) delete this tuple or
		 * cmin/cmax was stored in a combocid. So we need to lookup the actual
		 * values externally.
		 */
		resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot,
												 htup, buffer,
												 &cmin, &cmax);

		if (!resolved)
			elog(ERROR, "could not resolve cmin/cmax of catalog tuple");

		Assert(cmin != InvalidCommandId);

		if (cmin >= snapshot->curcid)
			return false;		/* inserted after scan started */
		/* fall through */
	}
	/* committed before our xmin horizon. Do a normal visibility check. */
	else if (TransactionIdPrecedes(xmin, snapshot->xmin))
	{
		/* a set "committed" hint bit must agree with the commit log */
		Assert(!(HeapTupleHeaderXminCommitted(tuple) &&
				 !TransactionIdDidCommit(xmin)));

		/* check for hint bit first, consult clog afterwards */
		if (!HeapTupleHeaderXminCommitted(tuple) &&
			!TransactionIdDidCommit(xmin))
			return false;
		/* fall through */
	}
	/* beyond our xmax horizon, i.e. invisible */
	else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax))
	{
		return false;
	}
	/* check if it's a committed transaction in [xmin, xmax) */
	else if (TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt))
	{
		/* fall through */
	}

	/*
	 * none of the above, i.e. between [xmin, xmax) but hasn't committed. I.e.
	 * invisible.
	 */
	else
	{
		return false;
	}

	/*
	 * Step 2: at this point we know xmin is visible, go on to check xmax,
	 * i.e. whether a deletion or update of the tuple is visible as well.
	 */

	/* xid invalid or aborted */
	if (tuple->t_infomask & HEAP_XMAX_INVALID)
		return true;
	/* locked tuples are always visible */
	else if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
		return true;

	/*
	 * We can see multis here if we're looking at user tables or if somebody
	 * SELECT ... FOR SHARE/UPDATE a system table.  In that case the raw xmax
	 * fetched above is replaced by the result of HeapTupleGetUpdateXid.
	 */
	else if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
	{
		xmax = HeapTupleGetUpdateXid(tuple);
	}

	/* check if it's one of our txids, toplevel is also in there */
	if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt))
	{
		bool		resolved;
		CommandId	cmin;
		CommandId	cmax = HeapTupleHeaderGetRawCommandId(tuple);

		/* Lookup actual cmin/cmax values */
		resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot,
												 htup, buffer,
												 &cmin, &cmax);

		if (!resolved)
			elog(ERROR, "could not resolve combocid to cmax");

		Assert(cmax != InvalidCommandId);

		if (cmax >= snapshot->curcid)
			return true;		/* deleted after scan started */
		else
			return false;		/* deleted before scan started */
	}
	/* below xmin horizon, normal transaction state is valid */
	else if (TransactionIdPrecedes(xmax, snapshot->xmin))
	{
		/* a set "committed" hint bit must agree with the commit log */
		Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED &&
				 !TransactionIdDidCommit(xmax)));

		/* check hint bit first */
		if (tuple->t_infomask & HEAP_XMAX_COMMITTED)
			return false;

		/* check clog: the tuple stays visible unless the deleter committed */
		return !TransactionIdDidCommit(xmax);
	}
	/* above xmax horizon, we cannot possibly see the deleting transaction */
	else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax))
		return true;
	/* xmax is between [xmin, xmax), check known committed array */
	else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt))
		return false;
	/* xmax is between [xmin, xmax), but known not to have committed yet */
	else
		return true;
}
/*
 * XidInMVCCSnapshot
 *		Does the snapshot regard the given XID as still in progress?
 *
 * Note: GetSnapshotData never stores either the top xid or the subxids of
 * our own backend into a snapshot, so those xids are not reported as
 * "running" by this function.  This is OK for current uses, because we only
 * apply this to known-committed XIDs.
 */
static bool
XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
{
	uint32		topIdx;
	int32		subIdx;

	/*
	 * Cheap range test first, so that most XIDs never touch the arrays.
	 * This stays valid even if a subxact XID is mapped to its parent further
	 * down: a subxact with XID < xmin surely has a parent with XID < xmin as
	 * well, while one with XID >= xmax must belong to a parent that had not
	 * yet committed when the snapshot was taken.
	 */
	if (TransactionIdPrecedes(xid, snapshot->xmin))
		return false;			/* any xid < xmin is not in-progress */
	if (TransactionIdFollowsOrEquals(xid, snapshot->xmax))
		return true;			/* any xid >= xmax is in-progress */

	/*
	 * Snapshots taken during recovery are laid out differently: all xids
	 * live in the subxact array (by far the bigger one, and we mostly can't
	 * tell top-level xids from subxacts), while the xip array is empty.
	 */
	if (snapshot->takenDuringRecovery)
	{
		/* If we overflowed, start by consulting subtrans */
		if (snapshot->suboverflowed)
		{
			/* convert xid to top-level */
			xid = SubTransGetTopmostTransaction(xid);

			/*
			 * If xid was indeed a subxact, we might now have an xid < xmin,
			 * so recheck to avoid an array scan.  No point in rechecking
			 * xmax.
			 */
			if (TransactionIdPrecedes(xid, snapshot->xmin))
				return false;
		}

		/*
		 * We now have either a top-level xid higher than xmin or an
		 * indeterminate xid.  Whether it is top level or subxact does not
		 * matter: if it is present in the array, it counts as running.
		 */
		for (subIdx = 0; subIdx < snapshot->subxcnt; subIdx++)
		{
			if (TransactionIdEquals(xid, snapshot->subxip[subIdx]))
				return true;
		}

		return false;
	}

	/*
	 * Regular snapshot.  With full subxact data the fastest approach is to
	 * compare the XID against both the subxact and the top-level arrays.  If
	 * the snapshot overflowed, pg_subtrans must be used to map a subxact XID
	 * to its parent, after which only the top-level array needs a look.
	 */
	if (snapshot->suboverflowed)
	{
		/* overflowed, so convert xid to top-level */
		xid = SubTransGetTopmostTransaction(xid);

		/*
		 * If xid was indeed a subxact, we might now have an xid < xmin, so
		 * recheck to avoid an array scan.  No point in rechecking xmax.
		 */
		if (TransactionIdPrecedes(xid, snapshot->xmin))
			return false;
	}
	else
	{
		/* full data, so search subxip */
		for (subIdx = 0; subIdx < snapshot->subxcnt; subIdx++)
		{
			if (TransactionIdEquals(xid, snapshot->subxip[subIdx]))
				return true;
		}
		/* not there, go on to search xip[] */
	}

	for (topIdx = 0; topIdx < snapshot->xcnt; topIdx++)
	{
		if (TransactionIdEquals(xid, snapshot->xip[topIdx]))
			return true;
	}

	return false;
}
/* * PrescanPreparedTransactions * * Scan the pg_twophase directory and determine the range of valid XIDs * present. This is run during database startup, after we have completed * reading WAL. ShmemVariableCache->nextXid has been set to one more than * the highest XID for which evidence exists in WAL. * * We throw away any prepared xacts with main XID beyond nextXid --- if any * are present, it suggests that the DBA has done a PITR recovery to an * earlier point in time without cleaning out pg_twophase. We dare not * try to recover such prepared xacts since they likely depend on database * state that doesn't exist now. * * However, we will advance nextXid beyond any subxact XIDs belonging to * valid prepared xacts. We need to do this since subxact commit doesn't * write a WAL entry, and so there might be no evidence in WAL of those * subxact XIDs. * * Our other responsibility is to determine and return the oldest valid XID * among the prepared xacts (if none, return ShmemVariableCache->nextXid). * This is needed to synchronize pg_subtrans startup properly. */ TransactionId PrescanPreparedTransactions(void) { TransactionId origNextXid = ShmemVariableCache->nextXid; TransactionId result = origNextXid; DIR *cldir; struct dirent *clde; cldir = AllocateDir(TWOPHASE_DIR); while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL) { if (strlen(clde->d_name) == 8 && strspn(clde->d_name, "0123456789ABCDEF") == 8) { TransactionId xid; char *buf; TwoPhaseFileHeader *hdr; TransactionId *subxids; int i; xid = (TransactionId) strtoul(clde->d_name, NULL, 16); /* Reject XID if too new */ if (TransactionIdFollowsOrEquals(xid, origNextXid)) { ereport(WARNING, (errmsg("removing future two-phase state file \"%s\"", clde->d_name))); RemoveTwoPhaseFile(xid, true); continue; } /* * Note: we can't check if already processed because clog * subsystem isn't up yet. 
*/ /* Read and validate file */ buf = ReadTwoPhaseFile(xid); if (buf == NULL) { ereport(WARNING, (errmsg("removing corrupt two-phase state file \"%s\"", clde->d_name))); RemoveTwoPhaseFile(xid, true); continue; } /* Deconstruct header */ hdr = (TwoPhaseFileHeader *) buf; if (!TransactionIdEquals(hdr->xid, xid)) { ereport(WARNING, (errmsg("removing corrupt two-phase state file \"%s\"", clde->d_name))); RemoveTwoPhaseFile(xid, true); pfree(buf); continue; } /* * OK, we think this file is valid. Incorporate xid into the * running-minimum result. */ if (TransactionIdPrecedes(xid, result)) result = xid; /* * Examine subtransaction XIDs ... they should all follow main * XID, and they may force us to advance nextXid. */ subxids = (TransactionId *) (buf + MAXALIGN(sizeof(TwoPhaseFileHeader))); for (i = 0; i < hdr->nsubxacts; i++) { TransactionId subxid = subxids[i]; Assert(TransactionIdFollows(subxid, xid)); if (TransactionIdFollowsOrEquals(subxid, ShmemVariableCache->nextXid)) { ShmemVariableCache->nextXid = subxid; TransactionIdAdvance(ShmemVariableCache->nextXid); } } pfree(buf); } } FreeDir(cldir); return result; }
/*
 * XidInMVCCSnapshot
 *		Does the snapshot regard the given XID as still in progress?
 *
 * Note: GetSnapshotData never stores either the top xid or the subxids of
 * our own backend into a snapshot, so those xids are not reported as
 * "running" by this function.  This is OK for current uses, because we only
 * apply this to known-committed XIDs.
 */
static bool
XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
{
	uint32		topIdx;

	/*
	 * Cheap range test first, so that most XIDs never touch the arrays.
	 * This stays valid even if a subxact XID is mapped to its parent further
	 * down: a subxact with XID < xmin surely has a parent with XID < xmin as
	 * well, while one with XID >= xmax must belong to a parent that had not
	 * yet committed when the snapshot was taken.
	 */
	if (TransactionIdPrecedes(xid, snapshot->xmin))
		return false;			/* any xid < xmin is not in-progress */
	if (TransactionIdFollowsOrEquals(xid, snapshot->xmax))
		return true;			/* any xid >= xmax is in-progress */

	/*
	 * A negative subxcnt takes the "overflowed" path.  In that case
	 * pg_subtrans is used to map a subxact XID to its parent, after which
	 * only the top-level array needs a look.  Otherwise the snapshot holds
	 * full subxact data and the fastest approach is to compare the XID
	 * against both the subxact and the top-level arrays.
	 */
	if (snapshot->subxcnt < 0)
	{
		/* overflowed, so convert xid to top-level */
		xid = SubTransGetTopmostTransaction(xid);

		/*
		 * If xid was indeed a subxact, we might now have an xid < xmin, so
		 * recheck to avoid an array scan.  No point in rechecking xmax.
		 */
		if (TransactionIdPrecedes(xid, snapshot->xmin))
			return false;
	}
	else
	{
		int32		subIdx;

		/* full data, so search subxip */
		for (subIdx = 0; subIdx < snapshot->subxcnt; subIdx++)
		{
			if (TransactionIdEquals(xid, snapshot->subxip[subIdx]))
				return true;
		}
		/* not there, go on to search xip[] */
	}

	for (topIdx = 0; topIdx < snapshot->xcnt; topIdx++)
	{
		if (TransactionIdEquals(xid, snapshot->xip[topIdx]))
			return true;
	}

	return false;
}
/*
 * Vacuum a regular (non-root) leaf page
 *
 * We must delete tuples that are targeted for deletion by the VACUUM,
 * but not move any tuples that are referenced by outside links; we assume
 * those are the ones that are heads of chains.
 *
 * If we find a REDIRECT that was made by a concurrently-running transaction,
 * we must add its target TID to pendingList.  (We don't try to visit the
 * target immediately, first because we don't want VACUUM locking more than
 * one buffer at a time, and second because the duplicate-filtering logic
 * in spgAddPendingTID is useful to ensure we can't get caught in an infinite
 * loop in the face of continuous concurrent insertions.)
 *
 * If forPending is true, we are examining the page as a consequence of
 * chasing a redirect link, not as part of the normal sequential scan.
 * We still vacuum the page normally, but we don't increment the stats
 * about live tuples; else we'd double-count those tuples, since the page
 * has been or will be visited in the sequential scan as well.
 *
 * Arguments:
 *	bds		   - bulk-delete state: supplies the deletion callback, the stats
 *				 counters, myXmin and the SP-GiST state.
 *	index	   - the index relation (used for WAL and error reporting).
 *	buffer	   - buffer holding the leaf page to vacuum.
 *	forPending - see above.
 *
 * Raises ERROR on inconsistent chain links, on a non-LIVE tuple inside a
 * chain, or if the planned operations do not account for every deletable
 * tuple.
 */
static void
vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
			   bool forPending)
{
	Page		page = BufferGetPage(buffer);
	spgxlogVacuumLeaf xlrec;
	XLogRecData rdata[8];
	OffsetNumber toDead[MaxIndexTuplesPerPage];
	OffsetNumber toPlaceholder[MaxIndexTuplesPerPage];
	OffsetNumber moveSrc[MaxIndexTuplesPerPage];
	OffsetNumber moveDest[MaxIndexTuplesPerPage];
	OffsetNumber chainSrc[MaxIndexTuplesPerPage];
	OffsetNumber chainDest[MaxIndexTuplesPerPage];
	OffsetNumber predecessor[MaxIndexTuplesPerPage + 1];
	bool		deletable[MaxIndexTuplesPerPage + 1];
	int			nDeletable;
	OffsetNumber i,
				max = PageGetMaxOffsetNumber(page);

	/* Both maps are indexed directly by offset number, hence the "+ 1" */
	memset(predecessor, 0, sizeof(predecessor));
	memset(deletable, 0, sizeof(deletable));
	nDeletable = 0;

	/* Scan page, identify tuples to delete, accumulate stats */
	for (i = FirstOffsetNumber; i <= max; i++)
	{
		SpGistLeafTuple leaf;

		leaf = (SpGistLeafTuple) PageGetItem(page,
											 PageGetItemId(page, i));
		if (leaf->tupstate == SPGIST_LIVE)
		{
			Assert(ItemPointerIsValid(&leaf->heapPtr));

			/* Ask the bulk-delete callback whether this heap TID is dead */
			if (bds->callback(&leaf->heapPtr, bds->callback_state))
			{
				bds->stats->tuples_removed += 1;
				deletable[i] = true;
				nDeletable++;
			}
			else
			{
				if (!forPending)
					bds->stats->num_index_tuples += 1;
			}

			/* Form predecessor map, too */
			if (leaf->nextOffset != InvalidOffsetNumber)
			{
				/* paranoia about corrupted chain links */
				if (leaf->nextOffset < FirstOffsetNumber ||
					leaf->nextOffset > max ||
					predecessor[leaf->nextOffset] != InvalidOffsetNumber)
					elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"",
						 BufferGetBlockNumber(buffer),
						 RelationGetRelationName(index));
				predecessor[leaf->nextOffset] = i;
			}
		}
		else if (leaf->tupstate == SPGIST_REDIRECT)
		{
			SpGistDeadTuple dt = (SpGistDeadTuple) leaf;

			Assert(dt->nextOffset == InvalidOffsetNumber);
			Assert(ItemPointerIsValid(&dt->pointer));

			/*
			 * Add target TID to pending list if the redirection could have
			 * happened since VACUUM started.
			 *
			 * Note: we could make a tighter test by seeing if the xid is
			 * "running" according to the active snapshot; but tqual.c doesn't
			 * currently export a suitable API, and it's not entirely clear
			 * that a tighter test is worth the cycles anyway.
			 */
			if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
				spgAddPendingTID(bds, &dt->pointer);
		}
		else
		{
			Assert(leaf->nextOffset == InvalidOffsetNumber);
		}
	}

	if (nDeletable == 0)
		return;					/* nothing more to do */

	/*----------
	 * Figure out exactly what we have to do.  We do this separately from
	 * actually modifying the page, mainly so that we have a representation
	 * that can be dumped into WAL and then the replay code can do exactly
	 * the same thing.  The output of this step consists of six arrays
	 * describing four kinds of operations, to be performed in this order:
	 *
	 * toDead[]: tuple numbers to be replaced with DEAD tuples
	 * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples
	 * moveSrc[]: tuple numbers that need to be relocated to another offset
	 * (replacing the tuple there) and then replaced with PLACEHOLDER tuples
	 * moveDest[]: new locations for moveSrc tuples
	 * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates
	 * chainDest[]: new values of nextOffset for chainSrc members
	 *
	 * It's easiest to figure out what we have to do by processing tuple
	 * chains, so we iterate over all the tuples (not just the deletable
	 * ones!) to identify chain heads, then chase down each chain and make
	 * work item entries for deletable tuples within the chain.
	 *----------
	 */
	xlrec.nDead = xlrec.nPlaceholder = xlrec.nMove = xlrec.nChain = 0;

	for (i = FirstOffsetNumber; i <= max; i++)
	{
		SpGistLeafTuple head;
		bool		interveningDeletable;
		OffsetNumber prevLive;
		OffsetNumber j;

		head = (SpGistLeafTuple) PageGetItem(page,
											 PageGetItemId(page, i));
		if (head->tupstate != SPGIST_LIVE)
			continue;			/* can't be a chain member */
		if (predecessor[i] != 0)
			continue;			/* not a chain head */

		/* initialize ... */
		interveningDeletable = false;
		prevLive = deletable[i] ? InvalidOffsetNumber : i;

		/* scan down the chain ... */
		j = head->nextOffset;
		while (j != InvalidOffsetNumber)
		{
			SpGistLeafTuple member;

			member = (SpGistLeafTuple) PageGetItem(page,
												   PageGetItemId(page, j));
			if (member->tupstate != SPGIST_LIVE)
			{
				/* all tuples in chain should be live */
				elog(ERROR, "unexpected SPGiST tuple state: %d",
					 member->tupstate);
			}

			if (deletable[j])
			{
				/* This tuple should be replaced by a placeholder */
				toPlaceholder[xlrec.nPlaceholder] = j;
				xlrec.nPlaceholder++;
				/* previous live tuple's chain link will need an update */
				interveningDeletable = true;
			}
			else if (prevLive == InvalidOffsetNumber)
			{
				/*
				 * This is the first live tuple in the chain.  It has to move
				 * to the head position.
				 */
				moveSrc[xlrec.nMove] = j;
				moveDest[xlrec.nMove] = i;
				xlrec.nMove++;
				/* Chain updates will be applied after the move */
				prevLive = i;
				interveningDeletable = false;
			}
			else
			{
				/*
				 * Second or later live tuple.  Arrange to re-chain it to the
				 * previous live one, if there was a gap.
				 */
				if (interveningDeletable)
				{
					chainSrc[xlrec.nChain] = prevLive;
					chainDest[xlrec.nChain] = j;
					xlrec.nChain++;
				}
				prevLive = j;
				interveningDeletable = false;
			}

			j = member->nextOffset;
		}

		if (prevLive == InvalidOffsetNumber)
		{
			/* The chain is entirely removable, so we need a DEAD tuple */
			toDead[xlrec.nDead] = i;
			xlrec.nDead++;
		}
		else if (interveningDeletable)
		{
			/* One or more deletions at end of chain, so close it off */
			chainSrc[xlrec.nChain] = prevLive;
			chainDest[xlrec.nChain] = InvalidOffsetNumber;
			xlrec.nChain++;
		}
	}

	/* sanity check: every deletable tuple must be covered by one operation */
	if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove)
		elog(ERROR, "inconsistent counts of deletable tuples");

	/* Prepare WAL record */
	xlrec.node = index->rd_node;
	xlrec.blkno = BufferGetBlockNumber(buffer);
	STORE_STATE(&bds->spgstate, xlrec.stateSrc);

	ACCEPT_RDATA_DATA(&xlrec, SizeOfSpgxlogVacuumLeaf, 0);
	ACCEPT_RDATA_DATA(toDead, sizeof(OffsetNumber) * xlrec.nDead, 1);
	ACCEPT_RDATA_DATA(toPlaceholder, sizeof(OffsetNumber) * xlrec.nPlaceholder, 2);
	ACCEPT_RDATA_DATA(moveSrc, sizeof(OffsetNumber) * xlrec.nMove, 3);
	ACCEPT_RDATA_DATA(moveDest, sizeof(OffsetNumber) * xlrec.nMove, 4);
	ACCEPT_RDATA_DATA(chainSrc, sizeof(OffsetNumber) * xlrec.nChain, 5);
	ACCEPT_RDATA_DATA(chainDest, sizeof(OffsetNumber) * xlrec.nChain, 6);
	ACCEPT_RDATA_BUFFER(buffer, 7);

	/* Do the updates */
	START_CRIT_SECTION();

	spgPageIndexMultiDelete(&bds->spgstate, page,
							toDead, xlrec.nDead,
							SPGIST_DEAD, SPGIST_DEAD,
							InvalidBlockNumber, InvalidOffsetNumber);

	spgPageIndexMultiDelete(&bds->spgstate, page,
							toPlaceholder, xlrec.nPlaceholder,
							SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
							InvalidBlockNumber, InvalidOffsetNumber);

	/*
	 * We implement the move step by swapping the item pointers of the source
	 * and target tuples, then replacing the newly-source tuples with
	 * placeholders.  This is perhaps unduly friendly with the page data
	 * representation, but it's fast and doesn't risk page overflow when a
	 * tuple to be relocated is large.
	 */
	for (i = 0; i < xlrec.nMove; i++)
	{
		ItemId		idSrc = PageGetItemId(page, moveSrc[i]);
		ItemId		idDest = PageGetItemId(page, moveDest[i]);
		ItemIdData	tmp;

		tmp = *idSrc;
		*idSrc = *idDest;
		*idDest = tmp;
	}

	spgPageIndexMultiDelete(&bds->spgstate, page,
							moveSrc, xlrec.nMove,
							SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
							InvalidBlockNumber, InvalidOffsetNumber);

	/* Finally, patch up the chain links of the surviving tuples */
	for (i = 0; i < xlrec.nChain; i++)
	{
		SpGistLeafTuple survivor;

		survivor = (SpGistLeafTuple) PageGetItem(page,
												 PageGetItemId(page, chainSrc[i]));
		Assert(survivor->tupstate == SPGIST_LIVE);
		survivor->nextOffset = chainDest[i];
	}

	MarkBufferDirty(buffer);

	if (RelationNeedsWAL(index))
	{
		XLogRecPtr	recptr;

		recptr = XLogInsert(RM_SPGIST_ID, XLOG_SPGIST_VACUUM_LEAF, rdata);

		PageSetLSN(page, recptr);
	}

	END_CRIT_SECTION();
}
/*----------
 * GetSnapshotData -- returns information about running transactions.
 *
 * The returned snapshot includes xmin (lowest still-running xact ID),
 * xmax (next xact ID to be assigned), and a list of running xact IDs
 * in the range xmin <= xid < xmax.  It is used as follows:
 *		All xact IDs < xmin are considered finished.
 *		All xact IDs >= xmax are considered still running.
 *		For an xact ID xmin <= xid < xmax, consult list to see whether
 *		it is considered running or not.
 * This ensures that the set of transactions seen as "running" by the
 * current xact will not change after it takes the snapshot.
 *
 * We also compute the current global xmin (oldest xmin across all running
 * transactions) and save it in RecentGlobalXmin.  This is the same
 * computation done by GetOldestXmin(TRUE).  The xmin value is also stored
 * into RecentXmin.
 *
 * Arguments:
 *	snapshot	 - caller-supplied struct to fill in (must not be NULL).  Its
 *				   xip array is malloc'd on first use and reused afterwards.
 *	serializable - if true, SInvalLock is taken exclusively and MyProc->xmin
 *				   is set to the computed xmin.
 *
 * Returns the same snapshot pointer that was passed in.  Raises ERROR if the
 * xip array cannot be allocated.
 *----------
 */
Snapshot
GetSnapshotData(Snapshot snapshot, bool serializable)
{
	SISeg	   *segP = shmInvalBuffer;
	ProcState  *stateP = segP->procState;
	TransactionId xmin;
	TransactionId xmax;
	TransactionId globalxmin;
	int			index;
	int			count = 0;

	Assert(snapshot != NULL);

	/*
	 * Allocating space for MaxBackends xids is usually overkill;
	 * lastBackend would be sufficient.  But it seems better to do the
	 * malloc while not holding the lock, so we can't look at lastBackend.
	 *
	 * This does open a possibility for avoiding repeated malloc/free:
	 * since MaxBackends does not change at runtime, we can simply reuse
	 * the previous xip array if any.  (This relies on the fact that all
	 * calls pass static SnapshotData structs.)
	 */
	if (snapshot->xip == NULL)
	{
		/*
		 * First call for this snapshot
		 */
		snapshot->xip = (TransactionId *)
			malloc(MaxBackends * sizeof(TransactionId));
		if (snapshot->xip == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	/* Start both minima at our own XID; the scan below can only lower them */
	globalxmin = xmin = GetCurrentTransactionId();

	/*
	 * If we are going to set MyProc->xmin then we'd better get exclusive
	 * lock; if not, this is a read-only operation so it can be shared.
	 */
	LWLockAcquire(SInvalLock, serializable ? LW_EXCLUSIVE : LW_SHARED);

	/*--------------------
	 * Unfortunately, we have to call ReadNewTransactionId() after acquiring
	 * SInvalLock above.  It's not good because ReadNewTransactionId() does
	 * LWLockAcquire(XidGenLock), but *necessary*.  We need to be sure that
	 * no transactions exit the set of currently-running transactions
	 * between the time we fetch xmax and the time we finish building our
	 * snapshot.  Otherwise we could have a situation like this:
	 *
	 *		1. Tx Old is running (in Read Committed mode).
	 *		2. Tx S reads new transaction ID into xmax, then
	 *		   is swapped out before acquiring SInvalLock.
	 *		3. Tx New gets new transaction ID (>= S' xmax),
	 *		   makes changes and commits.
	 *		4. Tx Old changes some row R changed by Tx New and commits.
	 *		5. Tx S finishes getting its snapshot data.  It sees Tx Old as
	 *		   done, but sees Tx New as still running (since New >= xmax).
	 *
	 * Now S will see R changed by both Tx Old and Tx New, *but* does not
	 * see other changes made by Tx New.  If S is supposed to be in
	 * Serializable mode, this is wrong.
	 *
	 * By locking SInvalLock before we read xmax, we ensure that TX Old
	 * cannot exit the set of running transactions seen by Tx S.  Therefore
	 * both Old and New will be seen as still running => no inconsistency.
	 *--------------------
	 */

	xmax = ReadNewTransactionId();

	/* Visit every backend slot up to lastBackend, skipping unused ones */
	for (index = 0; index < segP->lastBackend; index++)
	{
		SHMEM_OFFSET pOffset = stateP[index].procStruct;

		if (pOffset != INVALID_OFFSET)
		{
			PGPROC	   *proc = (PGPROC *) MAKE_PTR(pOffset);

			/* Fetch xid just once - see GetNewTransactionId */
			TransactionId xid = proc->xid;

			/*
			 * Ignore my own proc (dealt with my xid above), procs not
			 * running a transaction, and xacts started since we read the
			 * next transaction ID.  There's no need to store XIDs above
			 * what we got from ReadNewTransactionId, since we'll treat
			 * them as running anyway.  We also assume that such xacts
			 * can't compute an xmin older than ours, so they needn't be
			 * considered in computing globalxmin.
			 */
			if (proc == MyProc ||
				!TransactionIdIsNormal(xid) ||
				TransactionIdFollowsOrEquals(xid, xmax))
				continue;

			/* Track the lowest running XID and record this one as running */
			if (TransactionIdPrecedes(xid, xmin))
				xmin = xid;
			snapshot->xip[count] = xid;
			count++;

			/* Update globalxmin to be the smallest valid xmin */
			xid = proc->xmin;
			if (TransactionIdIsNormal(xid))
				if (TransactionIdPrecedes(xid, globalxmin))
					globalxmin = xid;
		}
	}

	/* Must be done before releasing the lock (taken exclusively here) */
	if (serializable)
		MyProc->xmin = xmin;

	LWLockRelease(SInvalLock);

	/* Serializable snapshot must be computed before any other... */
	Assert(TransactionIdIsValid(MyProc->xmin));

	/*
	 * Update globalxmin to include actual process xids.  This is a
	 * slightly different way of computing it than GetOldestXmin uses, but
	 * should give the same result.
	 */
	if (TransactionIdPrecedes(xmin, globalxmin))
		globalxmin = xmin;

	/* Update globals for use by VACUUM */
	RecentGlobalXmin = globalxmin;
	RecentXmin = xmin;

	/* Fill in the caller's snapshot */
	snapshot->xmin = xmin;
	snapshot->xmax = xmax;
	snapshot->xcnt = count;

	snapshot->curcid = GetCurrentCommandId();

	return snapshot;
}
/*
 * HeapTupleSatisfiesSnapshot
 *		True iff heap tuple is valid for the given snapshot.
 *
 *	Here, we consider the effects of:
 *		all transactions committed as of the time of the given snapshot
 *		previous commands of this transaction
 *
 *	Does _not_ include:
 *		transactions shown as in-progress by the snapshot
 *		transactions started after the snapshot was taken
 *		changes made by the current command
 *
 * This is the same as HeapTupleSatisfiesNow, except that transactions that
 * were in progress or as yet unstarted when the snapshot was taken will
 * be treated as uncommitted, even if they have committed by now.
 *
 * (Notice, however, that the tuple status hint bits will be updated on the
 * basis of the true state of the transaction, even if we then pretend we
 * can't see it.)
 *
 * Arguments:
 *	tuple	 - header of the tuple to test; its t_infomask hint bits may be
 *			   updated as a side effect.
 *	snapshot - snapshot to test against (xmin, xmax, xip[]/xcnt and curcid
 *			   are consulted).
 */
bool
HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot)
{
	/*
	 * Phase 1: if the hint bits do not already say the inserter committed,
	 * work out its true state and set the hint bits accordingly.
	 */
	if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED))
	{
		if (tuple->t_infomask & HEAP_XMIN_INVALID)
			return false;

		/*
		 * HEAP_MOVED_OFF / HEAP_MOVED_IN tuples are judged by the state of
		 * their xvac transaction rather than by xmin.
		 * NOTE(review): presumably xvac is the VACUUM that moved the tuple;
		 * confirm against the tuple header definitions.
		 */
		if (tuple->t_infomask & HEAP_MOVED_OFF)
		{
			TransactionId xvac = HeapTupleHeaderGetXvac(tuple);

			if (TransactionIdIsCurrentTransactionId(xvac))
				return false;
			if (!TransactionIdIsInProgress(xvac))
			{
				/* xvac committed: the tuple is marked invalid and is gone */
				if (TransactionIdDidCommit(xvac))
				{
					tuple->t_infomask |= HEAP_XMIN_INVALID;
					return false;
				}
				/* xvac did not commit: the tuple is marked xmin-committed */
				tuple->t_infomask |= HEAP_XMIN_COMMITTED;
			}
		}
		else if (tuple->t_infomask & HEAP_MOVED_IN)
		{
			TransactionId xvac = HeapTupleHeaderGetXvac(tuple);

			if (!TransactionIdIsCurrentTransactionId(xvac))
			{
				if (TransactionIdIsInProgress(xvac))
					return false;
				if (TransactionIdDidCommit(xvac))
					tuple->t_infomask |= HEAP_XMIN_COMMITTED;
				else
				{
					/* xvac did not commit: the tuple is marked invalid */
					tuple->t_infomask |= HEAP_XMIN_INVALID;
					return false;
				}
			}
		}
		else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple)))
		{
			/*
			 * Inserted by our own transaction: visibility is decided purely
			 * by command IDs relative to snapshot->curcid.
			 */
			if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid)
				return false;	/* inserted after scan started */

			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
				return true;

			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));

			/* marked for update only, not deleted */
			if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
				return true;

			if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid)
				return true;	/* deleted after scan started */
			else
				return false;	/* deleted before scan started */
		}
		else if (TransactionIdIsInProgress(HeapTupleHeaderGetXmin(tuple)))
			return false;
		else if (TransactionIdDidCommit(HeapTupleHeaderGetXmin(tuple)))
			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
		else
		{
			/* it must have aborted or crashed */
			tuple->t_infomask |= HEAP_XMIN_INVALID;
			return false;
		}
	}

	/*
	 * By here, the inserting transaction has committed - have to check
	 * when...  If it is at or past the snapshot's xmax, or listed in xip[],
	 * the snapshot treats it as not yet committed.
	 */
	if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmin(tuple),
									 snapshot->xmin))
	{
		uint32		i;

		if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmin(tuple),
										 snapshot->xmax))
			return false;

		for (i = 0; i < snapshot->xcnt; i++)
		{
			if (TransactionIdEquals(HeapTupleHeaderGetXmin(tuple),
									snapshot->xip[i]))
				return false;
		}
	}

	/*
	 * Phase 2: the insertion is visible; now decide whether a deletion is
	 * visible too.
	 */
	if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid or aborted */
		return true;

	/* marked for update only, not deleted */
	if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
		return true;

	if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
	{
		if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)))
		{
			if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid)
				return true;	/* deleted after scan started */
			else
				return false;	/* deleted before scan started */
		}

		if (TransactionIdIsInProgress(HeapTupleHeaderGetXmax(tuple)))
			return true;

		if (!TransactionIdDidCommit(HeapTupleHeaderGetXmax(tuple)))
		{
			/* it must have aborted or crashed */
			tuple->t_infomask |= HEAP_XMAX_INVALID;
			return true;
		}

		/* xmax transaction committed */
		tuple->t_infomask |= HEAP_XMAX_COMMITTED;
	}

	/*
	 * OK, the deleting transaction committed too ... but when?  If it is at
	 * or past the snapshot's xmax, or listed in xip[], the snapshot treats
	 * the deletion as not yet committed, so the tuple is still visible.
	 */
	if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmax(tuple),
									 snapshot->xmin))
	{
		uint32		i;

		if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmax(tuple),
										 snapshot->xmax))
			return true;
		for (i = 0; i < snapshot->xcnt; i++)
		{
			if (TransactionIdEquals(HeapTupleHeaderGetXmax(tuple),
									snapshot->xip[i]))
				return true;
		}
	}

	return false;
}
/* * We have to cut&paste copde of GetNewTransactionId from varsup because we change way of advancing ShmemVariableCache->nextXid */ TransactionId DtmGetNewTransactionId(bool isSubXact) { TransactionId xid; XTM_INFO("%d: GetNewTransactionId\n", getpid()); /* * Workers synchronize transaction state at the beginning of each parallel * operation, so we can't account for new XIDs after that point. */ if (IsInParallelMode()) elog(ERROR, "cannot assign TransactionIds during a parallel operation"); /* * During bootstrap initialization, we return the special bootstrap * transaction id. */ if (IsBootstrapProcessingMode()) { Assert(!isSubXact); MyPgXact->xid = BootstrapTransactionId; return BootstrapTransactionId; } /* safety check, we should never get this far in a HS slave */ if (RecoveryInProgress()) elog(ERROR, "cannot assign TransactionIds during recovery"); LWLockAcquire(XidGenLock, LW_EXCLUSIVE); xid = DtmGetNextXid(); /*---------- * Check to see if it's safe to assign another XID. This protects against * catastrophic data loss due to XID wraparound. The basic rules are: * * If we're past xidVacLimit, start trying to force autovacuum cycles. * If we're past xidWarnLimit, start issuing warnings. * If we're past xidStopLimit, refuse to execute transactions, unless * we are running in single-user mode (which gives an escape hatch * to the DBA who somehow got past the earlier defenses). * * Note that this coding also appears in GetNewMultiXactId. *---------- */ if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidVacLimit)) { /* * For safety's sake, we release XidGenLock while sending signals, * warnings, etc. This is not so much because we care about * preserving concurrency in this situation, as to avoid any * possibility of deadlock while doing get_database_name(). First, * copy all the shared values we'll need in this path. 
*/ TransactionId xidWarnLimit = ShmemVariableCache->xidWarnLimit; TransactionId xidStopLimit = ShmemVariableCache->xidStopLimit; TransactionId xidWrapLimit = ShmemVariableCache->xidWrapLimit; Oid oldest_datoid = ShmemVariableCache->oldestXidDB; LWLockRelease(XidGenLock); /* * To avoid swamping the postmaster with signals, we issue the autovac * request only once per 64K transaction starts. This still gives * plenty of chances before we get into real trouble. */ if (IsUnderPostmaster && (xid % 65536) == 0) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); if (IsUnderPostmaster && TransactionIdFollowsOrEquals(xid, xidStopLimit)) { char *oldest_datname = get_database_name(oldest_datoid); /* complain even if that DB has disappeared */ if (oldest_datname) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("database is not accepting commands to avoid wraparound data loss in database \"%s\"", oldest_datname), errhint("Stop the postmaster and vacuum that database in single-user mode.\n" "You might also need to commit or roll back old prepared transactions."))); else ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("database is not accepting commands to avoid wraparound data loss in database with OID %u", oldest_datoid), errhint("Stop the postmaster and vacuum that database in single-user mode.\n" "You might also need to commit or roll back old prepared transactions."))); } else if (TransactionIdFollowsOrEquals(xid, xidWarnLimit)) { char *oldest_datname = get_database_name(oldest_datoid); /* complain even if that DB has disappeared */ if (oldest_datname) ereport(WARNING, (errmsg("database \"%s\" must be vacuumed within %u transactions", oldest_datname, xidWrapLimit - xid), errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" "You might also need to commit or roll back old prepared transactions."))); else ereport(WARNING, (errmsg("database with OID %u must be vacuumed within %u transactions", 
oldest_datoid, xidWrapLimit - xid), errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n" "You might also need to commit or roll back old prepared transactions."))); } /* Re-acquire lock and start over */ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); xid = DtmGetNextXid(); } /* * If we are allocating the first XID of a new page of the commit log, * zero out that commit-log page before returning. We must do this while * holding XidGenLock, else another xact could acquire and commit a later * XID before we zero the page. Fortunately, a page of the commit log * holds 32K or more transactions, so we don't have to do this very often. * * Extend pg_subtrans and pg_commit_ts too. */ if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->nextXid)) { ExtendCLOG(xid); ExtendCommitTs(xid); ExtendSUBTRANS(xid); } /* * Now advance the nextXid counter. This must not happen until after we * have successfully completed ExtendCLOG() --- if that routine fails, we * want the next incoming transaction to try it again. We cannot assign * more XIDs until there is CLOG space for them. */ if (xid == ShmemVariableCache->nextXid) TransactionIdAdvance(ShmemVariableCache->nextXid); else Assert(TransactionIdPrecedes(xid, ShmemVariableCache->nextXid)); /* * We must store the new XID into the shared ProcArray before releasing * XidGenLock. This ensures that every active XID older than * latestCompletedXid is present in the ProcArray, which is essential for * correct OldestXmin tracking; see src/backend/access/transam/README. * * XXX by storing xid into MyPgXact without acquiring ProcArrayLock, we * are relying on fetch/store of an xid to be atomic, else other backends * might see a partially-set xid here. But holding both locks at once * would be a nasty concurrency hit. So for now, assume atomicity. 
* * Note that readers of PGXACT xid fields should be careful to fetch the * value only once, rather than assume they can read a value multiple * times and get the same answer each time. * * The same comments apply to the subxact xid count and overflow fields. * * A solution to the atomic-store problem would be to give each PGXACT its * own spinlock used only for fetching/storing that PGXACT's xid and * related fields. * * If there's no room to fit a subtransaction XID into PGPROC, set the * cache-overflowed flag instead. This forces readers to look in * pg_subtrans to map subtransaction XIDs up to top-level XIDs. There is a * race-condition window, in that the new XID will not appear as running * until its parent link has been placed into pg_subtrans. However, that * will happen before anyone could possibly have a reason to inquire about * the status of the XID, so it seems OK. (Snapshots taken during this * window *will* include the parent XID, so they will deliver the correct * answer later on when someone does have a reason to inquire.) */ { /* * Use volatile pointer to prevent code rearrangement; other backends * could be examining my subxids info concurrently, and we don't want * them to see an invalid intermediate state, such as incrementing * nxids before filling the array entry. Note we are assuming that * TransactionId and int fetch/store are atomic. */ volatile PGPROC *myproc = MyProc; volatile PGXACT *mypgxact = MyPgXact; if (!isSubXact) mypgxact->xid = xid; else { int nxids = mypgxact->nxids; if (nxids < PGPROC_MAX_CACHED_SUBXIDS) { myproc->subxids.xids[nxids] = xid; mypgxact->nxids = nxids + 1; } else mypgxact->overflowed = true; } } LWLockRelease(XidGenLock); return xid; }