/* * SubTransGetTopmostTransaction * * Returns the topmost transaction of the given transaction id. * * Because we cannot look back further than TransactionXmin, it is possible * that this function will lie and return an intermediate subtransaction ID * instead of the true topmost parent ID. This is OK, because in practice * we only care about detecting whether the topmost parent is still running * or is part of a current snapshot's list of still-running transactions. * Therefore, any XID before TransactionXmin is as good as any other. */ TransactionId SubTransGetTopmostTransaction(TransactionId xid) { TransactionId parentXid = xid, previousXid = xid; /* Can't ask about stuff that might not be around anymore */ Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); while (TransactionIdIsValid(parentXid)) { previousXid = parentXid; if (TransactionIdPrecedes(parentXid, TransactionXmin)) break; parentXid = SubTransGetParent(parentXid); /* * By convention the parent xid gets allocated first, so should always * precede the child xid. Anything else points to a corrupted data * structure that could lead to an infinite loop, so exit. */ if (!TransactionIdPrecedes(parentXid, previousXid)) elog(ERROR, "pg_subtrans contains invalid entry: xid %u points to parent xid %u", previousXid, parentXid); } Assert(TransactionIdIsValid(previousXid)); return previousXid; }
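/*
 * For reference: every routine in this section leans on the modulo-2^32 XID
 * ordering implemented by TransactionIdPrecedes. The sketch below is
 * essentially what src/backend/access/transam/transam.c does; treat it as an
 * illustration rather than the authoritative definition.
 */
bool
TransactionIdPrecedes(TransactionId id1, TransactionId id2)
{
	/*
	 * If either ID is a permanent XID then we can just do unsigned
	 * comparison.  If both are normal, do a modulo-2^32 comparison: the
	 * signed difference is negative exactly when id1 is "older" than id2,
	 * provided the two XIDs are less than 2^31 transactions apart.
	 */
	int32		diff;

	if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2))
		return (id1 < id2);

	diff = (int32) (id1 - id2);
	return (diff < 0);
}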
/*
 * Get a new XID. For a global transaction it is previously set by
 * dtm_begin_transaction or dtm_join_transaction. Local transactions use a
 * range of local Xids obtained from the DTM.
 */
static TransactionId
DtmGetNextXid(void)
{
	TransactionId xid;

	LWLockAcquire(dtm->xidLock, LW_EXCLUSIVE);
	if (TransactionIdIsValid(DtmNextXid))
	{
		XTM_INFO("Use global XID %u\n", DtmNextXid);
		xid = DtmNextXid;
		if (TransactionIdPrecedesOrEquals(ShmemVariableCache->nextXid, xid))
		{
			/* Advance ShmemVariableCache->nextXid forward up to the new Xid */
			while (TransactionIdPrecedes(ShmemVariableCache->nextXid, xid))
			{
				XTM_INFO("Extend CLOG for global transaction to %u\n",
						 ShmemVariableCache->nextXid);
				ExtendCLOG(ShmemVariableCache->nextXid);
				ExtendCommitTs(ShmemVariableCache->nextXid);
				ExtendSUBTRANS(ShmemVariableCache->nextXid);
				TransactionIdAdvance(ShmemVariableCache->nextXid);
			}
			dtm->nReservedXids = 0;
		}
	}
	else
	{
		if (dtm->nReservedXids == 0)
		{
			XTM_INFO("%d: reserve new XID range\n", getpid());
			dtm->nReservedXids = ArbiterReserve(ShmemVariableCache->nextXid,
												DtmLocalXidReserve,
												&dtm->nextXid);
			Assert(dtm->nReservedXids > 0);
			Assert(TransactionIdFollowsOrEquals(dtm->nextXid,
												ShmemVariableCache->nextXid));

			/* Advance ShmemVariableCache->nextXid forward up to the new Xid */
			while (TransactionIdPrecedes(ShmemVariableCache->nextXid, dtm->nextXid))
			{
				XTM_INFO("Extend CLOG for local transaction to %u\n",
						 ShmemVariableCache->nextXid);
				ExtendCLOG(ShmemVariableCache->nextXid);
				ExtendCommitTs(ShmemVariableCache->nextXid);
				ExtendSUBTRANS(ShmemVariableCache->nextXid);
				TransactionIdAdvance(ShmemVariableCache->nextXid);
			}
		}
		Assert(ShmemVariableCache->nextXid == dtm->nextXid);
		xid = dtm->nextXid++;
		dtm->nReservedXids -= 1;
		XTM_INFO("Obtain new local XID %u\n", xid);
	}
	LWLockRelease(dtm->xidLock);
	return xid;
}
/* * Compute the oldest xmin across all slots and store it in the ProcArray. */ void ReplicationSlotsComputeRequiredXmin(bool already_locked) { int i; TransactionId agg_xmin = InvalidTransactionId; TransactionId agg_catalog_xmin = InvalidTransactionId; Assert(ReplicationSlotCtl != NULL); if (!already_locked) LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; TransactionId effective_xmin; TransactionId effective_catalog_xmin; if (!s->in_use) continue; { volatile ReplicationSlot *vslot = s; SpinLockAcquire(&s->mutex); effective_xmin = vslot->effective_xmin; effective_catalog_xmin = vslot->effective_catalog_xmin; SpinLockRelease(&s->mutex); } /* check the data xmin */ if (TransactionIdIsValid(effective_xmin) && (!TransactionIdIsValid(agg_xmin) || TransactionIdPrecedes(effective_xmin, agg_xmin))) agg_xmin = effective_xmin; /* check the catalog xmin */ if (TransactionIdIsValid(effective_catalog_xmin) && (!TransactionIdIsValid(agg_catalog_xmin) || TransactionIdPrecedes(effective_catalog_xmin, agg_catalog_xmin))) agg_catalog_xmin = effective_catalog_xmin; } if (!already_locked) LWLockRelease(ReplicationSlotControlLock); ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked); }
/* * do a TransactionId -> txid conversion for an XID near the given epoch */ static txid convert_xid(TransactionId xid, const TxidEpoch *state) { #ifndef INT64_IS_BUSTED uint64 epoch; /* return special xid's as-is */ if (!TransactionIdIsNormal(xid)) return (txid) xid; /* xid can be on either side when near wrap-around */ epoch = (uint64) state->epoch; if (xid > state->last_xid && TransactionIdPrecedes(xid, state->last_xid)) epoch--; else if (xid < state->last_xid && TransactionIdFollows(xid, state->last_xid)) epoch++; return (epoch << 32) | xid; #else /* INT64_IS_BUSTED */ /* we can't do anything with the epoch, so ignore it */ return (txid) xid & MAX_TXID; #endif /* INT64_IS_BUSTED */ }
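/*
 * Worked example of the epoch adjustment above (hypothetical values).
 * Suppose state->last_xid = 100 in epoch 5, and the counter recently
 * wrapped. An old tuple carries xid = 0xFFFFFF00:
 *
 *   xid > last_xid numerically, yet TransactionIdPrecedes(xid, last_xid)
 *   is true under modulo-2^32 arithmetic, so the xid belongs to the
 *   previous epoch and the result is (4ULL << 32) | 0xFFFFFF00.
 *
 * Symmetrically, a brand-new xid that has wrapped past last_xid lands in
 * epoch + 1.
 */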
/* * Move forwards the oldest commitTS value that can be consulted */ void AdvanceOldestCommitTsXid(TransactionId oldestXact) { LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); if (ShmemVariableCache->oldestCommitTsXid != InvalidTransactionId && TransactionIdPrecedes(ShmemVariableCache->oldestCommitTsXid, oldestXact)) ShmemVariableCache->oldestCommitTsXid = oldestXact; LWLockRelease(CommitTsLock); }
/* * TransactionIdDidCommit * True iff transaction associated with the identifier did commit. * * Note: * Assumes transaction identifier is valid. */ bool /* true if given transaction committed */ TransactionIdDidCommit(TransactionId transactionId) { XidStatus xidstatus; xidstatus = TransactionLogFetch(transactionId); /* * If it's marked committed, it's committed. */ if (xidstatus == TRANSACTION_STATUS_COMMITTED) #ifdef PGXC { syncGXID_GTM((GlobalTransactionId)transactionId); #endif return true; #ifdef PGXC } #endif /* * If it's marked subcommitted, we have to check the parent recursively. * However, if it's older than TransactionXmin, we can't look at * pg_subtrans; instead assume that the parent crashed without cleaning up * its children. * * Originally we Assert'ed that the result of SubTransGetParent was not * zero. However with the introduction of prepared transactions, there can * be a window just after database startup where we do not have complete * knowledge in pg_subtrans of the transactions after TransactionXmin. * StartupSUBTRANS() has ensured that any missing information will be * zeroed. Since this case should not happen under normal conditions, it * seems reasonable to emit a WARNING for it. */ if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) { TransactionId parentXid; if (TransactionIdPrecedes(transactionId, TransactionXmin)) return false; parentXid = SubTransGetParent(transactionId); if (!TransactionIdIsValid(parentXid)) { elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", transactionId); return false; } return TransactionIdDidCommit(parentXid); } /* * It's not committed. */ return false; }
/* * GetOldestXmin -- returns oldest transaction that was running * when any current transaction was started. * * If allDbs is TRUE then all backends are considered; if allDbs is FALSE * then only backends running in my own database are considered. * * This is used by VACUUM to decide which deleted tuples must be preserved * in a table. allDbs = TRUE is needed for shared relations, but allDbs = * FALSE is sufficient for non-shared relations, since only backends in my * own database could ever see the tuples in them. * * Note: we include the currently running xids in the set of considered xids. * This ensures that if a just-started xact has not yet set its snapshot, * when it does set the snapshot it cannot set xmin less than what we compute. */ TransactionId GetOldestXmin(bool allDbs) { SISeg *segP = shmInvalBuffer; ProcState *stateP = segP->procState; TransactionId result; int index; result = GetCurrentTransactionId(); LWLockAcquire(SInvalLock, LW_SHARED); for (index = 0; index < segP->lastBackend; index++) { SHMEM_OFFSET pOffset = stateP[index].procStruct; if (pOffset != INVALID_OFFSET) { PGPROC *proc = (PGPROC *) MAKE_PTR(pOffset); if (allDbs || proc->databaseId == MyDatabaseId) { /* Fetch xid just once - see GetNewTransactionId */ TransactionId xid = proc->xid; if (TransactionIdIsNormal(xid)) { if (TransactionIdPrecedes(xid, result)) result = xid; xid = proc->xmin; if (TransactionIdIsNormal(xid)) if (TransactionIdPrecedes(xid, result)) result = xid; } } } } LWLockRelease(SInvalLock); return result; }
/* * Set the limit values between which commit TS can be consulted. */ void SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact) { /* * Be careful not to overwrite values that are either further into the * "future" or signal a disabled committs. */ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE); if (ShmemVariableCache->oldestCommitTs != InvalidTransactionId) { if (TransactionIdPrecedes(ShmemVariableCache->oldestCommitTs, oldestXact)) ShmemVariableCache->oldestCommitTs = oldestXact; if (TransactionIdPrecedes(newestXact, ShmemVariableCache->newestCommitTs)) ShmemVariableCache->newestCommitTs = newestXact; } else { Assert(ShmemVariableCache->newestCommitTs == InvalidTransactionId); } LWLockRelease(CommitTsLock); }
/* Record lowest soon-prunable XID */ static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid) { /* * This should exactly match the PageSetPrunable macro. We can't store * directly into the page header yet, so we update working state. */ Assert(TransactionIdIsNormal(xid)); if (!TransactionIdIsValid(prstate->new_prune_xid) || TransactionIdPrecedes(xid, prstate->new_prune_xid)) prstate->new_prune_xid = xid; }
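/*
 * For comparison, the PageSetPrunable macro that the function above must
 * stay in sync with looks essentially like this (paraphrased from
 * src/include/storage/bufpage.h; treat as a sketch, not a verbatim quote):
 */
#define PageSetPrunable(page, xid) \
do { \
	Assert(TransactionIdIsNormal(xid)); \
	if (!TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) || \
		TransactionIdPrecedes(xid, ((PageHeader) (page))->pd_prune_xid)) \
		((PageHeader) (page))->pd_prune_xid = (xid); \
} while (0)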
/* * Comparison function for RegisteredSnapshots heap. Snapshots are ordered * by xmin, so that the snapshot with smallest xmin is at the top. */ static int xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) { const SnapshotData *asnap = pairingheap_const_container(SnapshotData, ph_node, a); const SnapshotData *bsnap = pairingheap_const_container(SnapshotData, ph_node, b); if (TransactionIdPrecedes(asnap->xmin, bsnap->xmin)) return 1; else if (TransactionIdFollows(asnap->xmin, bsnap->xmin)) return -1; else return 0; }
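/*
 * For context: xmin_cmp deliberately inverts the usual sense (returning 1
 * when a precedes b) because the pairing heap keeps the "largest" element
 * per the comparator at the top; inverting makes that the snapshot with the
 * smallest xmin. A rough sketch of the surrounding pieces in snapmgr.c
 * (treat the details as paraphrase, not a verbatim quote):
 */
static pairingheap RegisteredSnapshots = {&xmin_cmp, NULL, NULL};

/* ...inside the register/unregister paths, snapshots enter and leave via: */
pairingheap_add(&RegisteredSnapshots, &snapshot->ph_node);
pairingheap_remove(&RegisteredSnapshots, &snapshot->ph_node);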
/* * Decide which of two SUBTRANS page numbers is "older" for truncation purposes. * * We need to use comparison of TransactionIds here in order to do the right * thing with wraparound XID arithmetic. However, if we are asked about * page number zero, we don't want to hand InvalidTransactionId to * TransactionIdPrecedes: it'll get weird about permanent xact IDs. So, * offset both xids by FirstNormalTransactionId to avoid that. */ static bool SubTransPagePrecedes(int page1, int page2) { TransactionId xid1; TransactionId xid2; xid1 = ((TransactionId) page1) * SUBTRANS_XACTS_PER_PAGE; xid1 += FirstNormalTransactionId; xid2 = ((TransactionId) page2) * SUBTRANS_XACTS_PER_PAGE; xid2 += FirstNormalTransactionId; return TransactionIdPrecedes(xid1, xid2); }
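/*
 * Illustration of why the offset matters: without it, page 0 would map to
 * xid 0, i.e. InvalidTransactionId, and TransactionIdPrecedes special-cases
 * non-normal XIDs with a plain unsigned comparison (see the sketch near the
 * top of this section), defeating the wraparound-aware ordering. Adding
 * FirstNormalTransactionId (3) keeps both operands in normal-XID territory,
 * so the modulo-2^32 comparison always applies.
 */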
/*
 * Decide which of two commitTS page numbers is "older" for truncation purposes.
 *
 * We need to use comparison of TransactionIds here in order to do the right
 * thing with wraparound XID arithmetic. However, if we are asked about
 * page number zero, we don't want to hand InvalidTransactionId to
 * TransactionIdPrecedes: it'll get weird about permanent xact IDs. So,
 * offset both xids by FirstNormalTransactionId to avoid that.
 */
static bool
CommitTsPagePrecedes(int page1, int page2)
{
	TransactionId xid1;
	TransactionId xid2;

	xid1 = ((TransactionId) page1) * COMMIT_TS_XACTS_PER_PAGE;
	xid1 += FirstNormalTransactionId;
	xid2 = ((TransactionId) page2) * COMMIT_TS_XACTS_PER_PAGE;
	xid2 += FirstNormalTransactionId;

	return TransactionIdPrecedes(xid1, xid2);
}
/* * Decide which of two DistributedLog page numbers is "older" for * truncation purposes. * * We need to use comparison of TransactionIds here in order to do the right * thing with wraparound XID arithmetic. However, if we are asked about * page number zero, we don't want to hand InvalidTransactionId to * TransactionIdPrecedes: it'll get weird about permanent xact IDs. So, * offset both xids by FirstNormalTransactionId to avoid that. */ static bool DistributedLog_PagePrecedes(int page1, int page2) { TransactionId xid1; TransactionId xid2; xid1 = ((TransactionId) page1) * ENTRIES_PER_PAGE; xid1 += FirstNormalTransactionId; xid2 = ((TransactionId) page2) * ENTRIES_PER_PAGE; xid2 += FirstNormalTransactionId; return TransactionIdPrecedes(xid1, xid2); }
/*
 * Get the oldest Xid visible to any active transaction (global or local),
 * taking into account the global Xmin received from the DTMD.
 */
static TransactionId
DtmGetOldestXmin(Relation rel, bool ignoreVacuum)
{
	TransactionId localXmin = PgGetOldestXmin(rel, ignoreVacuum);
	TransactionId globalXmin = dtm->minXid;

	XTM_INFO("XTM: DtmGetOldestXmin localXmin=%u, globalXmin=%u\n",
			 localXmin, globalXmin);

	if (TransactionIdIsValid(globalXmin))
	{
		globalXmin -= vacuum_defer_cleanup_age;
		if (!TransactionIdIsNormal(globalXmin))
			globalXmin = FirstNormalTransactionId;
		if (TransactionIdPrecedes(globalXmin, localXmin))
			localXmin = globalXmin;
		XTM_INFO("XTM: DtmGetOldestXmin adjusted localXmin=%u, globalXmin=%u\n",
				 localXmin, globalXmin);
	}
	return localXmin;
}
/*
 * HeapTupleIsSurelyDead
 *
 *	Determine whether a tuple is surely dead. We sometimes use this
 *	in lieu of HeapTupleSatisfiesVacuum when the tuple has just been
 *	tested by HeapTupleSatisfiesMVCC and, therefore, any hint bits that
 *	can be set should already be set. We assume that if no hint bits are
 *	set for either xmin or xmax, the transaction is still running. This is
 *	therefore faster than HeapTupleSatisfiesVacuum, because we don't
 *	consult CLOG (and also because we don't need to give an exact answer,
 *	just whether or not the tuple is surely dead).
 */
bool
HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin)
{
	HeapTupleHeader tuple = htup->t_data;

	Assert(ItemPointerIsValid(&htup->t_self));
	Assert(htup->t_tableOid != InvalidOid);

	/*
	 * If the inserting transaction is marked invalid, then it aborted, and
	 * the tuple is definitely dead. If it's marked neither committed nor
	 * invalid, then we assume it's still alive (since the presumption is
	 * that all relevant hint bits were just set moments ago).
	 */
	if (!HeapTupleHeaderXminCommitted(tuple))
		return HeapTupleHeaderXminInvalid(tuple) ? true : false;

	/*
	 * If the inserting transaction committed, but any deleting transaction
	 * aborted, the tuple is still alive.
	 */
	if (tuple->t_infomask & HEAP_XMAX_INVALID)
		return false;

	/*
	 * If the XMAX is just a lock, the tuple is still alive.
	 */
	if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
		return false;

	/*
	 * If the Xmax is a MultiXact, it might be dead or alive, but we cannot
	 * know without checking pg_multixact.
	 */
	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
		return false;

	/* If deleter isn't known to have committed, assume it's still running. */
	if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
		return false;

	/* Deleter committed, so tuple is dead if the XID is old enough. */
	return TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin);
}
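/*
 * Usage sketch (hypothetical caller, paraphrasing the HOT-chain walk in
 * heapam.c): right after HeapTupleSatisfiesMVCC has had its chance to set
 * the hint bits, the chain walker re-checks cheaply whether the chain can
 * still be considered all-dead.
 */
if (all_dead && *all_dead &&
	!HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
	*all_dead = false;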
/* adapted from Postgres' txid.c#convert_xid function */ uint64 convert_xid(TransactionId xid) { TxidEpoch state; uint64 epoch; GetNextXidAndEpoch(&state.last_xid, &state.epoch); /* return special xid's as-is */ if (!TransactionIdIsNormal(xid)) return (uint64) xid; /* xid can be on either side when near wrap-around */ epoch = (uint64) state.epoch; if (xid > state.last_xid && TransactionIdPrecedes(xid, state.last_xid)) epoch--; else if (xid < state.last_xid && TransactionIdFollows(xid, state.last_xid)) epoch++; return (epoch << 32) | xid; }
/* * SnapshotResetXmin * * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid. * Note we can do this without locking because we assume that storing an Xid * is atomic. * * Even if there are some remaining snapshots, we may be able to advance our * PGXACT->xmin to some degree. This typically happens when a portal is * dropped. For efficiency, we only consider recomputing PGXACT->xmin when * the active snapshot stack is empty. */ static void SnapshotResetXmin(void) { Snapshot minSnapshot; if (ActiveSnapshot != NULL) return; if (pairingheap_is_empty(&RegisteredSnapshots)) { MyPgXact->xmin = InvalidTransactionId; return; } minSnapshot = pairingheap_container(SnapshotData, ph_node, pairingheap_first(&RegisteredSnapshots)); if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin)) MyPgXact->xmin = minSnapshot->xmin; }
/* * do a TransactionId -> txid conversion for an XID near the given epoch */ static txid convert_xid(TransactionId xid, const TxidEpoch *state) { uint64 epoch; /* return special xid's as-is */ if (!TransactionIdIsNormal(xid)) return (txid) xid; /* xid can be on either side when near wrap-around */ epoch = (uint64) state->epoch; if (xid > state->last_xid && TransactionIdPrecedes(xid, state->last_xid)) epoch--; else if (xid < state->last_xid && TransactionIdFollows(xid, state->last_xid)) epoch++; return (epoch << 32) | xid; }
/* * TransactionIdDidAbort * True iff transaction associated with the identifier did abort. * * Note: * Assumes transaction identifier is valid. */ bool /* true if given transaction aborted */ TransactionIdDidAbort(TransactionId transactionId) { XidStatus xidstatus; xidstatus = TransactionLogFetch(transactionId); /* * If it's marked aborted, it's aborted. */ if (xidstatus == TRANSACTION_STATUS_ABORTED) return true; /* * If it's marked subcommitted, we have to check the parent recursively. * However, if it's older than TransactionXmin, we can't look at * pg_subtrans; instead assume that the parent crashed without cleaning up * its children. */ if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) { TransactionId parentXid; if (TransactionIdPrecedes(transactionId, TransactionXmin)) return true; parentXid = SubTransGetParent(transactionId); if (!TransactionIdIsValid(parentXid)) { /* see notes in TransactionIdDidCommit */ elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", transactionId); return true; } return TransactionIdDidAbort(parentXid); } /* * It's not aborted. */ return false; }
/* * SubTransGetTopmostTransaction * * Returns the topmost transaction of the given transaction id. * * Because we cannot look back further than TransactionXmin, it is possible * that this function will lie and return an intermediate subtransaction ID * instead of the true topmost parent ID. This is OK, because in practice * we only care about detecting whether the topmost parent is still running * or is part of a current snapshot's list of still-running transactions. * Therefore, any XID before TransactionXmin is as good as any other. */ TransactionId SubTransGetTopmostTransaction(TransactionId xid) { TransactionId parentXid = xid, previousXid = xid; /* Can't ask about stuff that might not be around anymore */ Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); while (TransactionIdIsValid(parentXid)) { previousXid = parentXid; if (TransactionIdPrecedes(parentXid, TransactionXmin)) break; parentXid = SubTransGetParent(parentXid); } Assert(TransactionIdIsValid(previousXid)); return previousXid; }
/* * TransactionIdLatest --- get latest XID among a main xact and its children */ TransactionId TransactionIdLatest(TransactionId mainxid, int nxids, const TransactionId *xids) { TransactionId result; /* * In practice it is highly likely that the xids[] array is sorted, and so * we could save some cycles by just taking the last child XID, but this * probably isn't so performance-critical that it's worth depending on * that assumption. But just to show we're not totally stupid, scan the * array back-to-front to avoid useless assignments. */ result = mainxid; while (--nxids >= 0) { if (TransactionIdPrecedes(result, xids[nxids])) result = xids[nxids]; } return result; }
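/*
 * Usage sketch (hypothetical caller, modeled on xact.c's commit path; the
 * wrapper name is invented for illustration): commit processing computes
 * the latest XID of the whole transaction tree so the proc array can
 * advance latestCompletedXid past every member of the tree.
 */
static void
AdvanceProcArrayPastTransactionTree(TransactionId xid)
{
	TransactionId *children;
	int			nchildren = xactGetCommittedChildren(&children);
	TransactionId latestXid = TransactionIdLatest(xid, nchildren, children);

	ProcArrayEndTransaction(MyProc, latestXid);
}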
/* * Check whether a tuple is all-visible relative to a given OldestXmin value. * The buffer should contain the tuple and should be locked and pinned. */ static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer) { HTSV_Result state; TransactionId xmin; state = HeapTupleSatisfiesVacuum(tup, OldestXmin, buffer); if (state != HEAPTUPLE_LIVE) return false; /* all-visible implies live */ /* * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page * all-visible unless every tuple is hinted committed. However, those hint * bits could be lost after a crash, so we can't be certain that they'll * be set here. So just check the xmin. */ xmin = HeapTupleHeaderGetXmin(tup->t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) return false; /* xmin not old enough for all to see */ return true; }
/* * StandbyReleaseLocksMany * Release standby locks held by XIDs < removeXid * * If keepPreparedXacts is true, keep prepared transactions even if * they're older than removeXid */ static void StandbyReleaseLocksMany(TransactionId removeXid, bool keepPreparedXacts) { ListCell *cell, *prev, *next; LOCKTAG locktag; /* * Release all matching locks. */ prev = NULL; for (cell = list_head(RecoveryLockList); cell; cell = next) { xl_standby_lock *lock = (xl_standby_lock *) lfirst(cell); next = lnext(cell); if (!TransactionIdIsValid(removeXid) || TransactionIdPrecedes(lock->xid, removeXid)) { if (keepPreparedXacts && StandbyTransactionIdIsPrepared(lock->xid)) continue; elog(trace_recovery(DEBUG4), "releasing recovery lock: xid %u db %u rel %u", lock->xid, lock->dbOid, lock->relOid); SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid); if (!LockRelease(&locktag, AccessExclusiveLock, true)) elog(LOG, "RecoveryLockList contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u", lock->xid, lock->dbOid, lock->relOid); RecoveryLockList = list_delete_cell(RecoveryLockList, cell, prev); pfree(lock); } else prev = cell; } }
/* * _bt_page_recyclable() -- Is an existing page recyclable? * * This exists to make sure _bt_getbuf and btvacuumscan have the same * policy about whether a page is safe to re-use. */ bool _bt_page_recyclable(Page page) { BTPageOpaque opaque; /* * It's possible to find an all-zeroes page in an index --- for example, a * backend might successfully extend the relation one page and then crash * before it is able to make a WAL entry for adding the page. If we find a * zeroed page then reclaim it. */ if (PageIsNew(page)) return true; /* * Otherwise, recycle if deleted and too old to have any processes * interested in it. */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISDELETED(opaque) && TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin)) return true; return false; }
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. 
*/ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. */ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); if (!PageIsAllVisible(page)) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. 
*/ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* Update the all-visible flag on the page */ if (!PageIsAllVisible(page) && all_visible) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. 
* * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. */ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); SetBufferCommitInfoNeedsSave(buf); /* * Normally, we would drop the lock on the heap page before * updating the visibility map, but since this case shouldn't * happen anyway, don't worry about that. */ visibilitymap_clear(onerel, blkno); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm && all_visible) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Release the pin on the visibility map page */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
/* * Interrogate the commit timestamp of a transaction. * * The return value indicates whether a commit timestamp record was found for * the given xid. The timestamp value is returned in *ts (which may not be * null), and the origin node for the Xid is returned in *nodeid, if it's not * null. */ bool TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, RepOriginId *nodeid) { int pageno = TransactionIdToCTsPage(xid); int entryno = TransactionIdToCTsEntry(xid); int slotno; CommitTimestampEntry entry; TransactionId oldestCommitTsXid; TransactionId newestCommitTsXid; if (!TransactionIdIsValid(xid)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("cannot retrieve commit timestamp for transaction %u", xid))); else if (!TransactionIdIsNormal(xid)) { /* frozen and bootstrap xids are always committed far in the past */ *ts = 0; if (nodeid) *nodeid = 0; return false; } LWLockAcquire(CommitTsLock, LW_SHARED); /* Error if module not enabled */ if (!commitTsShared->commitTsActive) error_commit_ts_disabled(); /* * If we're asked for the cached value, return that. Otherwise, fall * through to read from SLRU. */ if (commitTsShared->xidLastCommit == xid) { *ts = commitTsShared->dataLastCommit.time; if (nodeid) *nodeid = commitTsShared->dataLastCommit.nodeid; LWLockRelease(CommitTsLock); return *ts != 0; } oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid; newestCommitTsXid = ShmemVariableCache->newestCommitTsXid; /* neither is invalid, or both are */ Assert(TransactionIdIsValid(oldestCommitTsXid) == TransactionIdIsValid(newestCommitTsXid)); LWLockRelease(CommitTsLock); /* * Return empty if the requested value is outside our valid range. */ if (!TransactionIdIsValid(oldestCommitTsXid) || TransactionIdPrecedes(xid, oldestCommitTsXid) || TransactionIdPrecedes(newestCommitTsXid, xid)) { *ts = 0; if (nodeid) *nodeid = InvalidRepOriginId; return false; } /* lock is acquired by SimpleLruReadPage_ReadOnly */ slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid); memcpy(&entry, CommitTsCtl->shared->page_buffer[slotno] + SizeOfCommitTimestampEntry * entryno, SizeOfCommitTimestampEntry); *ts = entry.time; if (nodeid) *nodeid = entry.nodeid; LWLockRelease(CommitTsControlLock); return *ts != 0; }
/*
 * TransactionTreeSetCommitTsData
 *
 * Record the final commit timestamp of transaction entries in the commit log
 * for a transaction and its subtransaction tree, as efficiently as possible.
 *
 * xid is the top level transaction id.
 *
 * subxids is an array of xids of length nsubxids, representing subtransactions
 * in the tree of xid. In various cases nsubxids may be zero.
 * The reason why tracking just the parent xid commit timestamp is not enough
 * is that the subtrans SLRU does not stay valid across crashes (it's not
 * permanent) so we need to keep the information about them here. If the
 * subtrans implementation changes in the future, we might want to revisit the
 * decision of storing timestamp info for each subxid.
 *
 * The write_xlog parameter tells us whether to include an XLog record of this
 * or not. Normally, this is called from transaction commit routines (both
 * normal and prepared) and the information will be stored in the transaction
 * commit XLog record, and so they should pass "false" for this. The XLog redo
 * code should use "false" here as well. Other callers probably want to pass
 * true, so that the given values persist in case of crashes.
 */
void
TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
							   TransactionId *subxids, TimestampTz timestamp,
							   RepOriginId nodeid, bool write_xlog)
{
	int			i;
	TransactionId headxid;
	TransactionId newestXact;

	/*
	 * No-op if the module is not active.
	 *
	 * An unlocked read here is fine, because in a standby (the only place
	 * where the flag can change in flight) this routine is only called by
	 * the recovery process, which is also the only process which can change
	 * the flag.
	 */
	if (!commitTsShared->commitTsActive)
		return;

	/*
	 * Comply with the WAL-before-data rule: if caller specified it wants
	 * this value to be recorded in WAL, do so before touching the data.
	 */
	if (write_xlog)
		WriteSetTimestampXlogRec(xid, nsubxids, subxids, timestamp, nodeid);

	/*
	 * Figure out the latest Xid in this batch: either the last subxid if
	 * there's any, otherwise the parent xid.
	 */
	if (nsubxids > 0)
		newestXact = subxids[nsubxids - 1];
	else
		newestXact = xid;

	/*
	 * We split the xids to set the timestamp to in groups belonging to the
	 * same SLRU page; the first element in each such set is its head. The
	 * first group has the main XID as the head; subsequent sets use the
	 * first subxid not on the previous page as head. This way, we only have
	 * to lock/modify each SLRU page once.
	 */
	for (i = 0, headxid = xid;;)
	{
		int			pageno = TransactionIdToCTsPage(headxid);
		int			j;

		for (j = i; j < nsubxids; j++)
		{
			if (TransactionIdToCTsPage(subxids[j]) != pageno)
				break;
		}
		/* subxids[i..j-1] are on the same page as the head */

		SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid,
							 pageno);

		/*
		 * If we wrote out all subxids, we're done. Note that when j <
		 * nsubxids, subxids[j] has not been written yet; it must become the
		 * head of the next group, so we must not stop here.
		 */
		if (j >= nsubxids)
			break;

		/*
		 * Set the new head and skip over it, as well as over the subxids we
		 * just wrote.
		 */
		headxid = subxids[j];
		i += j - i + 1;
	}

	/* update the cached value in shared memory */
	LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
	commitTsShared->xidLastCommit = xid;
	commitTsShared->dataLastCommit.time = timestamp;
	commitTsShared->dataLastCommit.nodeid = nodeid;

	/* and move forwards our endpoint, if needed */
	if (TransactionIdPrecedes(ShmemVariableCache->newestCommitTsXid, newestXact))
		ShmemVariableCache->newestCommitTsXid = newestXact;
	LWLockRelease(CommitTsLock);
}
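/*
 * Worked example of the grouping loop above (hypothetical values). Suppose
 * xid = 1000 with subxids {1001, 1002, 9000}, where 1000-1002 happen to
 * share a commitTS page and 9000 falls on a later page:
 *
 *   pass 1: head = 1000; the inner scan stops at subxids[2], so one SLRU
 *           page is locked once to stamp 1000, 1001 and 1002.
 *   pass 2: head = 9000 (with zero trailing subxids); the later page is
 *           locked once to stamp 9000.
 */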
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); rel->reltablespace = RelationGetForm(relation)->reltablespace; Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples); /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes! */ if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. 
See * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; /* * Allocate per-column info arrays. To save a few palloc cycles * we allocate all the Oid-type arrays in one request. Note that * the opfamily array needs an extra, terminating zero at the end. * We pre-zero the ordering info in case the index is unordered. */ info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->opfamily = (Oid *) palloc0(sizeof(Oid) * (4 * ncolumns + 1)); info->opcintype = info->opfamily + (ncolumns + 1); info->fwdsortop = info->opcintype + ncolumns; info->revsortop = info->fwdsortop + ncolumns; info->nulls_first = (bool *) palloc0(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap); /* * Fetch the ordering operators associated with the index, if any. * We expect that all ordering-capable indexes use btree's * strategy numbers for the ordering operators. */ if (indexRelation->rd_am->amcanorder) { int nstrat = indexRelation->rd_am->amstrategies; for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; int fwdstrat; int revstrat; if (opt & INDOPTION_DESC) { fwdstrat = BTGreaterStrategyNumber; revstrat = BTLessStrategyNumber; } else { fwdstrat = BTLessStrategyNumber; revstrat = BTGreaterStrategyNumber; } /* * Index AM must have a fixed set of strategies for it to * make sense to specify amcanorder, so we need not allow * the case amstrategies == 0. */ if (fwdstrat > 0) { Assert(fwdstrat <= nstrat); info->fwdsortop[i] = indexRelation->rd_operator[i * nstrat + fwdstrat - 1]; } if (revstrat > 0) { Assert(revstrat <= nstrat); info->revsortop[i] = indexRelation->rd_operator[i * nstrat + revstrat - 1]; } info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. 
*/ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } index_close(indexRelation, NoLock); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); /* Temporary and unlogged relations are inaccessible during recovery. */ if (!RelationNeedsWAL(relation) && RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary or unlogged relations during recovery"))); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); rel->reltablespace = RelationGetForm(relation)->reltablespace; Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples, &rel->allvisfrac); /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. 
Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes! */ if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. See * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->indexcollations = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opcintype = (Oid *) palloc(sizeof(Oid) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->indexcollations[i] = indexRelation->rd_indcollation[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->canreturn = index_can_return(indexRelation); info->amcanorderbyop = indexRelation->rd_am->amcanorderbyop; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearcharray = indexRelation->rd_am->amsearcharray; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap); /* * Fetch the ordering information for the index, if any. */ if (info->relam == BTREE_AM_OID) { /* * If it's a btree index, we can use its opfamily OIDs * directly as the sort ordering opfamily OIDs. */ Assert(indexRelation->rd_am->amcanorder); info->sortopfamily = info->opfamily; info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } else if (indexRelation->rd_am->amcanorder) { /* * Otherwise, identify the corresponding btree opfamilies by * trying to map this index's "<" operators into btree. Since * "<" uniquely defines the behavior of a sort order, this is * a sufficient test. * * XXX This method is rather slow and also requires the * undesirable assumption that the other index AM numbers its * strategies the same as btree. It'd be better to have a way * to explicitly declare the corresponding btree opfamily for * each opfamily of the other index type. But given the lack * of current or foreseeable amcanorder index types, it's not * worth expending more effort on now. 
*/ info->sortopfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; Oid ltopr; Oid btopfamily; Oid btopcintype; int16 btstrategy; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; ltopr = get_opfamily_member(info->opfamily[i], info->opcintype[i], info->opcintype[i], BTLessStrategyNumber); if (OidIsValid(ltopr) && get_ordering_op_properties(ltopr, &btopfamily, &btopcintype, &btstrategy) && btopcintype == info->opcintype[i] && btstrategy == BTLessStrategyNumber) { /* Successful mapping */ info->sortopfamily[i] = btopfamily; } else { /* Fail ... quietly treat index as unordered */ info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; break; } } } else { info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); /* Build targetlist using the completed indexprs data */ info->indextlist = build_index_tlist(root, info, relation); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; info->immediate = index->indimmediate; info->hypothetical = false; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. */ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { double allvisfrac; /* dummy */ estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples, &allvisfrac); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } index_close(indexRelation, NoLock); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
/*
 * Hot Standby feedback
 */
static void
ProcessStandbyHSFeedbackMessage(void)
{
	StandbyHSFeedbackMessage reply;
	TransactionId newxmin = InvalidTransactionId;

	pq_copymsgbytes(&reply_message, (char *) &reply,
					sizeof(StandbyHSFeedbackMessage));

	elog(DEBUG2, "hot standby feedback xmin %u epoch %u",
		 reply.xmin, reply.epoch);

	/*
	 * Update the WalSender's proc xmin to allow it to be visible to
	 * snapshots. This will hold back the removal of dead rows and thereby
	 * prevent the generation of cleanup conflicts on the standby server.
	 */
	if (TransactionIdIsValid(reply.xmin))
	{
		TransactionId nextXid;
		uint32		nextEpoch;
		bool		epochOK = false;

		GetNextXidAndEpoch(&nextXid, &nextEpoch);

		/*
		 * The reply's epoch should match the epoch of our nextXid, or, if
		 * the XID counter has wrapped since the standby captured its xmin,
		 * it should be one less than ours.
		 */
		if (reply.xmin <= nextXid)
		{
			if (reply.epoch == nextEpoch)
				epochOK = true;
		}
		else
		{
			if (nextEpoch > 0 && reply.epoch == nextEpoch - 1)
				epochOK = true;
		}

		/*
		 * Feedback from the standby must not move our xmin backwards, nor
		 * may it point beyond our most recently assigned xid.
		 */
		if (epochOK && TransactionIdPrecedesOrEquals(reply.xmin, nextXid))
		{
			if (!TransactionIdIsValid(MyProc->xmin))
			{
				TransactionId oldestXmin = GetOldestXmin(true, true);

				if (TransactionIdPrecedes(oldestXmin, reply.xmin))
					newxmin = reply.xmin;
				else
					newxmin = oldestXmin;
			}
			else
			{
				if (TransactionIdPrecedes(MyProc->xmin, reply.xmin))
					newxmin = reply.xmin;
				else
					newxmin = MyProc->xmin; /* stay the same */
			}
		}
	}

	/*
	 * Grab the ProcArrayLock to set xmin, or to invalidate it for a bad
	 * reply.
	 */
	if (MyProc->xmin != newxmin)
	{
		LWLockAcquire(ProcArrayLock, LW_SHARED);
		MyProc->xmin = newxmin;
		LWLockRelease(ProcArrayLock);
	}
}
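/*
 * Worked example of the epoch check above (hypothetical values). Suppose
 * the primary is at nextXid = 100 in epoch 5, and the standby's xmin was
 * taken just before the counter wrapped:
 *
 *   reply.xmin = 0xFFFFFF00, reply.epoch = 4
 *
 * Here reply.xmin > nextXid, so the feedback is accepted only because
 * reply.epoch == nextEpoch - 1; TransactionIdPrecedesOrEquals(reply.xmin,
 * nextXid) then holds under modulo-2^32 arithmetic. Had the standby sent
 * epoch 5 with that xmin, the reply would be discarded as inconsistent.
 */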