/* * Initiate page evacuation protocol. * * The page must be locked in exclusive mode by the caller. * * If the page is not yet initialized or empty, return false without doing * anything; it can be used for revmap without any further changes. If it * contains tuples, mark it for evacuation and return true. */ bool brin_start_evacuating_page(Relation idxRel, Buffer buf) { OffsetNumber off; OffsetNumber maxoff; Page page; page = BufferGetPage(buf); if (PageIsNew(page)) return false; maxoff = PageGetMaxOffsetNumber(page); for (off = FirstOffsetNumber; off <= maxoff; off++) { ItemId lp; lp = PageGetItemId(page, off); if (ItemIdIsUsed(lp)) { /* prevent other backends from adding more stuff to this page */ BrinPageFlags(page) |= BRIN_EVACUATE_PAGE; MarkBufferDirtyHint(buf, true); return true; } } return false; }
/* * MUST BE CALLED ONLY ON RECOVERY. * * Check if exists valid (inserted by not aborted xaction) heap tuple * for given item pointer */ bool XLogIsValidTuple(RelFileNode hnode, ItemPointer iptr) { Relation reln; Buffer buffer; Page page; ItemId lp; HeapTupleHeader htup; reln = XLogOpenRelation(false, RM_HEAP_ID, hnode); if (!RelationIsValid(reln)) return (false); buffer = ReadBuffer(reln, ItemPointerGetBlockNumber(iptr)); if (!BufferIsValid(buffer)) return (false); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew((PageHeader) page) || ItemPointerGetOffsetNumber(iptr) > PageGetMaxOffsetNumber(page)) { UnlockAndReleaseBuffer(buffer); return (false); } if (PageGetSUI(page) != ThisStartUpID) { Assert(PageGetSUI(page) < ThisStartUpID); UnlockAndReleaseBuffer(buffer); return (true); } lp = PageGetItemId(page, ItemPointerGetOffsetNumber(iptr)); if (!ItemIdIsUsed(lp) || ItemIdDeleted(lp)) { UnlockAndReleaseBuffer(buffer); return (false); } htup = (HeapTupleHeader) PageGetItem(page, lp); /* MUST CHECK WASN'T TUPLE INSERTED IN PREV STARTUP */ if (!(htup->t_infomask & HEAP_XMIN_COMMITTED)) { if (htup->t_infomask & HEAP_XMIN_INVALID || (htup->t_infomask & HEAP_MOVED_IN && TransactionIdDidAbort(HeapTupleHeaderGetXvac(htup))) || TransactionIdDidAbort(HeapTupleHeaderGetXmin(htup))) { UnlockAndReleaseBuffer(buffer); return (false); } } UnlockAndReleaseBuffer(buffer); return (true); }
/* * PageGetHeapFreeSpace * Returns the size of the free (allocatable) space on a page, * reduced by the space needed for a new line pointer. * * The difference between this and PageGetFreeSpace is that this will return * zero if there are already MaxHeapTuplesPerPage line pointers in the page * and none are free. We use this to enforce that no more than * MaxHeapTuplesPerPage line pointers are created on a heap page. (Although * no more tuples than that could fit anyway, in the presence of redirected * or dead line pointers it'd be possible to have too many line pointers. * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit * on the number of line pointers, we make this extra check.) */ Size PageGetHeapFreeSpace(Page page) { Size space; space = PageGetFreeSpace(page); if (space > 0) { OffsetNumber offnum, nline; /* * Are there already MaxHeapTuplesPerPage line pointers in the page? */ nline = PageGetMaxOffsetNumber(page); if (nline >= MaxHeapTuplesPerPage) { if (PageHasFreeLinePointers((PageHeader) page)) { /* * Since this is just a hint, we must confirm that there is * indeed a free line pointer */ for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) { ItemId lp = PageGetItemId(page, offnum); if (!ItemIdIsUsed(lp)) break; } if (offnum > nline) { /* * The hint is wrong, but we can't clear it here since we * don't have the ability to mark the page dirty. */ space = 0; } } else { /* * Although the hint might be wrong, PageAddItem will believe * it anyway, so we must believe it too. */ space = 0; } } } return space; }
/* * Check if specified heap tuple was inserted by given * xaction/command and return * * - -1 if not * - 0 if there is no tuple at all * - 1 if yes */ int XLogIsOwnerOfTuple(RelFileNode hnode, ItemPointer iptr, TransactionId xid, CommandId cid) { Relation reln; Buffer buffer; Page page; ItemId lp; HeapTupleHeader htup; reln = XLogOpenRelation(false, RM_HEAP_ID, hnode); if (!RelationIsValid(reln)) return (0); buffer = ReadBuffer(reln, ItemPointerGetBlockNumber(iptr)); if (!BufferIsValid(buffer)) return (0); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = (Page) BufferGetPage(buffer); if (PageIsNew((PageHeader) page) || ItemPointerGetOffsetNumber(iptr) > PageGetMaxOffsetNumber(page)) { UnlockAndReleaseBuffer(buffer); return (0); } lp = PageGetItemId(page, ItemPointerGetOffsetNumber(iptr)); if (!ItemIdIsUsed(lp) || ItemIdDeleted(lp)) { UnlockAndReleaseBuffer(buffer); return (0); } htup = (HeapTupleHeader) PageGetItem(page, lp); Assert(PageGetSUI(page) == ThisStartUpID); if (!TransactionIdEquals(HeapTupleHeaderGetXmin(htup), xid) || HeapTupleHeaderGetCmin(htup) != cid) { UnlockAndReleaseBuffer(buffer); return (-1); } UnlockAndReleaseBuffer(buffer); return (1); }
/* * mask_lp_flags * * In some index AMs, line pointer flags can be modified in master without * emitting any WAL record. */ void mask_lp_flags(Page page) { OffsetNumber offnum, maxoff; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemId = PageGetItemId(page, offnum); if (ItemIdIsUsed(itemId)) itemId->lp_flags = LP_UNUSED; } }
/* * Move all tuples out of a page. * * The caller must hold lock on the page. The lock and pin are released. */ void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer buf) { OffsetNumber off; OffsetNumber maxoff; Page page; page = BufferGetPage(buf); Assert(((BrinSpecialSpace *) PageGetSpecialPointer(page))->flags & BRIN_EVACUATE_PAGE); maxoff = PageGetMaxOffsetNumber(page); for (off = FirstOffsetNumber; off <= maxoff; off++) { BrinTuple *tup; Size sz; ItemId lp; CHECK_FOR_INTERRUPTS(); lp = PageGetItemId(page, off); if (ItemIdIsUsed(lp)) { sz = ItemIdGetLength(lp); tup = (BrinTuple *) PageGetItem(page, lp); tup = brin_copy_tuple(tup, sz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno, buf, off, tup, sz, tup, sz, false)) off--; /* retry */ LockBuffer(buf, BUFFER_LOCK_SHARE); /* It's possible that someone extended the revmap over this page */ if (!BRIN_IS_REGULAR_PAGE(page)) break; } } UnlockReleaseBuffer(buf); }
/* * PageAddItemExtended * * Add an item to a page. Return value is the offset at which it was * inserted, or InvalidOffsetNumber if the item is not inserted for any * reason. A WARNING is issued indicating the reason for the refusal. * * offsetNumber must be either InvalidOffsetNumber to specify finding a * free item pointer, or a value between FirstOffsetNumber and one past * the last existing item, to specify using that particular item pointer. * * If offsetNumber is valid and flag PAI_OVERWRITE is set, we just store * the item at the specified offsetNumber, which must be either a * currently-unused item pointer, or one past the last existing item. * * If offsetNumber is valid and flag PAI_OVERWRITE is not set, insert * the item at the specified offsetNumber, moving existing items later * in the array to make room. * * If offsetNumber is not valid, then assign a slot by finding the first * one that is both unused and deallocated. * * If flag PAI_IS_HEAP is set, we enforce that there can't be more than * MaxHeapTuplesPerPage line pointers on the page. * * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! */ OffsetNumber PageAddItemExtended(Page page, Item item, Size size, OffsetNumber offsetNumber, int flags) { PageHeader phdr = (PageHeader) page; Size alignedSize; int lower; int upper; ItemId itemId; OffsetNumber limit; bool needshuffle = false; /* * Be wary about corrupted page pointers */ if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || phdr->pd_special > BLCKSZ) ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); /* * Select offsetNumber to place the new item at */ limit = OffsetNumberNext(PageGetMaxOffsetNumber(page)); /* was offsetNumber passed in? */ if (OffsetNumberIsValid(offsetNumber)) { /* yes, check it */ if ((flags & PAI_OVERWRITE) != 0) { if (offsetNumber < limit) { itemId = PageGetItemId(phdr, offsetNumber); if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId)) { elog(WARNING, "will not overwrite a used ItemId"); return InvalidOffsetNumber; } } } else { if (offsetNumber < limit) needshuffle = true; /* need to move existing linp's */ } } else { /* offsetNumber was not passed in, so find a free slot */ /* if no free slot, we'll put it at limit (1st open slot) */ if (PageHasFreeLinePointers(phdr)) { /* * Look for "recyclable" (unused) ItemId. We check for no storage * as well, just to be paranoid --- unused items should never have * storage. */ for (offsetNumber = 1; offsetNumber < limit; offsetNumber++) { itemId = PageGetItemId(phdr, offsetNumber); if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId)) break; } if (offsetNumber >= limit) { /* the hint is wrong, so reset it */ PageClearHasFreeLinePointers(phdr); } } else { /* don't bother searching if hint says there's no free slot */ offsetNumber = limit; } } /* Reject placing items beyond the first unused line pointer */ if (offsetNumber > limit) { elog(WARNING, "specified item offset is too large"); return InvalidOffsetNumber; } /* Reject placing items beyond heap boundary, if heap */ if ((flags & PAI_IS_HEAP) != 0 && offsetNumber > MaxHeapTuplesPerPage) { elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page"); return InvalidOffsetNumber; } /* * Compute new lower and upper pointers for page, see if it'll fit. * * Note: do arithmetic as signed ints, to avoid mistakes if, say, * alignedSize > pd_upper. */ if (offsetNumber == limit || needshuffle) lower = phdr->pd_lower + sizeof(ItemIdData); else lower = phdr->pd_lower; alignedSize = MAXALIGN(size); upper = (int) phdr->pd_upper - (int) alignedSize; if (lower > upper) return InvalidOffsetNumber; /* * OK to insert the item. First, shuffle the existing pointers if needed. */ itemId = PageGetItemId(phdr, offsetNumber); if (needshuffle) memmove(itemId + 1, itemId, (limit - offsetNumber) * sizeof(ItemIdData)); /* set the item pointer */ ItemIdSetNormal(itemId, upper, size); /* * Items normally contain no uninitialized bytes. Core bufpage consumers * conform, but this is not a necessary coding rule; a new index AM could * opt to depart from it. However, data type input functions and other * C-language functions that synthesize datums should initialize all * bytes; datumIsEqual() relies on this. Testing here, along with the * similar check in printtup(), helps to catch such mistakes. * * Values of the "name" type retrieved via index-only scans may contain * uninitialized bytes; see comment in btrescan(). Valgrind will report * this as an error, but it is safe to ignore. */ VALGRIND_CHECK_MEM_IS_DEFINED(item, size); /* copy the item's data onto the page */ memcpy((char *) page + upper, item, size); /* adjust page header */ phdr->pd_lower = (LocationIndex) lower; phdr->pd_upper = (LocationIndex) upper; return offsetNumber; }
/* * This function takes an already open relation and scans its pages, * skipping those that have the corresponding visibility map bit set. * For pages we skip, we find the free space from the free space map * and approximate tuple_len on that basis. For the others, we count * the exact number of dead tuples etc. * * This scan is loosely based on vacuumlazy.c:lazy_scan_heap(), but * we do not try to avoid skipping single pages. */ static void statapprox_heap(Relation rel, output_type *stat) { BlockNumber scanned, nblocks, blkno; Buffer vmbuffer = InvalidBuffer; BufferAccessStrategy bstrategy; TransactionId OldestXmin; uint64 misc_count = 0; OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); bstrategy = GetAccessStrategy(BAS_BULKREAD); nblocks = RelationGetNumberOfBlocks(rel); scanned = 0; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; Size freespace; CHECK_FOR_INTERRUPTS(); /* * If the page has only visible tuples, then we can find out the free * space from the FSM and move on. */ if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) { freespace = GetRecordedFreeSpace(rel, blkno); stat->tuple_len += BLCKSZ - freespace; stat->free_space += freespace; continue; } buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); /* * It's not safe to call PageGetHeapFreeSpace() on new pages, so we * treat them as being free space for our purposes. */ if (!PageIsNew(page)) stat->free_space += PageGetHeapFreeSpace(page); else stat->free_space += BLCKSZ - SizeOfPageHeaderData; if (PageIsNew(page) || PageIsEmpty(page)) { UnlockReleaseBuffer(buf); continue; } scanned++; /* * Look at each tuple on the page and decide whether it's live or * dead, then count it and its size. Unlike lazy_scan_heap, we can * afford to ignore problems and special cases. */ maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; HeapTupleData tuple; itemid = PageGetItemId(page, offnum); if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) || ItemIdIsDead(itemid)) { continue; } Assert(ItemIdIsNormal(itemid)); ItemPointerSet(&(tuple.t_self), blkno, offnum); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationGetRelid(rel); /* * We count live and dead tuples, but we also need to add up * others in order to feed vac_estimate_reltuples. */ switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) { case HEAPTUPLE_RECENTLY_DEAD: misc_count++; /* Fall through */ case HEAPTUPLE_DEAD: stat->dead_tuple_len += tuple.t_len; stat->dead_tuple_count++; break; case HEAPTUPLE_LIVE: stat->tuple_len += tuple.t_len; stat->tuple_count++; break; case HEAPTUPLE_INSERT_IN_PROGRESS: case HEAPTUPLE_DELETE_IN_PROGRESS: misc_count++; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } } UnlockReleaseBuffer(buf); } stat->table_len = (uint64) nblocks *BLCKSZ; stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned, stat->tuple_count + misc_count); /* * Calculate percentages if the relation has one or more pages. */ if (nblocks != 0) { stat->scanned_percent = 100 * scanned / nblocks; stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len; stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; stat->free_percent = 100.0 * stat->free_space / stat->table_len; } if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } }
/* * Fetch the BrinTuple for a given heap block. * * The buffer containing the tuple is locked, and returned in *buf. As an * optimization, the caller can pass a pinned buffer *buf on entry, which will * avoid a pin-unpin cycle when the next tuple is on the same page as a * previous one. * * If no tuple is found for the given heap range, returns NULL. In that case, * *buf might still be updated, but it's not locked. * * The output tuple offset within the buffer is returned in *off, and its size * is returned in *size. */ BrinTuple * brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, Size *size, int mode) { Relation idxRel = revmap->rm_irel; BlockNumber mapBlk; RevmapContents *contents; ItemPointerData *iptr; BlockNumber blk; Page page; ItemId lp; BrinTuple *tup; ItemPointerData previptr; /* normalize the heap block number to be the first page in the range */ heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange; /* Compute the revmap page number we need */ mapBlk = revmap_get_blkno(revmap, heapBlk); if (mapBlk == InvalidBlockNumber) { *off = InvalidOffsetNumber; return NULL; } ItemPointerSetInvalid(&previptr); for (;;) { CHECK_FOR_INTERRUPTS(); if (revmap->rm_currBuf == InvalidBuffer || BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk) { if (revmap->rm_currBuf != InvalidBuffer) ReleaseBuffer(revmap->rm_currBuf); Assert(mapBlk != InvalidBlockNumber); revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk); } LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE); contents = (RevmapContents *) PageGetContents(BufferGetPage(revmap->rm_currBuf)); iptr = contents->rm_tids; iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk); if (!ItemPointerIsValid(iptr)) { LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK); return NULL; } /* * Check the TID we got in a previous iteration, if any, and save the * current TID we got from the revmap; if we loop, we can sanity-check * that the next one we get is different. Otherwise we might be stuck * looping forever if the revmap is somehow badly broken. */ if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg_internal("corrupted BRIN index: inconsistent range map"))); previptr = *iptr; blk = ItemPointerGetBlockNumber(iptr); *off = ItemPointerGetOffsetNumber(iptr); LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK); /* Ok, got a pointer to where the BrinTuple should be. Fetch it. */ if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk) { if (BufferIsValid(*buf)) ReleaseBuffer(*buf); *buf = ReadBuffer(idxRel, blk); } LockBuffer(*buf, mode); page = BufferGetPage(*buf); /* If we land on a revmap page, start over */ if (BRIN_IS_REGULAR_PAGE(page)) { lp = PageGetItemId(page, *off); if (ItemIdIsUsed(lp)) { tup = (BrinTuple *) PageGetItem(page, lp); if (tup->bt_blkno == heapBlk) { if (size) *size = ItemIdGetLength(lp); /* found it! */ return tup; } } } /* * No luck. Assume that the revmap was updated concurrently. */ LockBuffer(*buf, BUFFER_LOCK_UNLOCK); } /* not reached, but keep compiler quiet */ return NULL; }
/* * Get the latestRemovedXid from the heap pages pointed at by the index * tuples being deleted. This puts the work for calculating latestRemovedXid * into the recovery path rather than the primary path. * * It's possible that this generates a fair amount of I/O, since an index * block may have hundreds of tuples being deleted. Repeat accesses to the * same heap blocks are common, though are not yet optimised. * * XXX optimise later with something like XLogPrefetchBuffer() */ static TransactionId btree_xlog_delete_get_latestRemovedXid(XLogReaderState *record) { xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); OffsetNumber *unused; Buffer ibuffer, hbuffer; Page ipage, hpage; RelFileNode rnode; BlockNumber blkno; ItemId iitemid, hitemid; IndexTuple itup; HeapTupleHeader htuphdr; BlockNumber hblkno; OffsetNumber hoffnum; TransactionId latestRemovedXid = InvalidTransactionId; int i; /* * If there's nothing running on the standby we don't need to derive a * full latestRemovedXid value, so use a fast path out of here. This * returns InvalidTransactionId, and so will conflict with all HS * transactions; but since we just worked out that that's zero people, * it's OK. * * XXX There is a race condition here, which is that a new backend might * start just after we look. If so, it cannot need to conflict, but this * coding will result in throwing a conflict anyway. */ if (CountDBBackends(InvalidOid) == 0) return latestRemovedXid; /* * In what follows, we have to examine the previous state of the index * page, as well as the heap page(s) it points to. This is only valid if * WAL replay has reached a consistent database state; which means that * the preceding check is not just an optimization, but is *necessary*. We * won't have let in any user sessions before we reach consistency. */ if (!reachedConsistency) elog(PANIC, "btree_xlog_delete_get_latestRemovedXid: cannot operate with inconsistent data"); /* * Get index page. If the DB is consistent, this should not fail, nor * should any of the heap page fetches below. If one does, we return * InvalidTransactionId to cancel all HS transactions. That's probably * overkill, but it's safe, and certainly better than panicking here. */ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL); if (!BufferIsValid(ibuffer)) return InvalidTransactionId; LockBuffer(ibuffer, BT_READ); ipage = (Page) BufferGetPage(ibuffer); /* * Loop through the deleted index items to obtain the TransactionId from * the heap items they point to. */ unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete); for (i = 0; i < xlrec->nitems; i++) { /* * Identify the index tuple about to be deleted */ iitemid = PageGetItemId(ipage, unused[i]); itup = (IndexTuple) PageGetItem(ipage, iitemid); /* * Locate the heap page that the index tuple points at */ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL); if (!BufferIsValid(hbuffer)) { UnlockReleaseBuffer(ibuffer); return InvalidTransactionId; } LockBuffer(hbuffer, BUFFER_LOCK_SHARE); hpage = (Page) BufferGetPage(hbuffer); /* * Look up the heap tuple header that the index tuple points at by * using the heap node supplied with the xlrec. We can't use * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer. * Note that we are not looking at tuple data here, just headers. */ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid)); hitemid = PageGetItemId(hpage, hoffnum); /* * Follow any redirections until we find something useful. */ while (ItemIdIsRedirected(hitemid)) { hoffnum = ItemIdGetRedirect(hitemid); hitemid = PageGetItemId(hpage, hoffnum); CHECK_FOR_INTERRUPTS(); } /* * If the heap item has storage, then read the header and use that to * set latestRemovedXid. * * Some LP_DEAD items may not be accessible, so we ignore them. */ if (ItemIdHasStorage(hitemid)) { htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid); HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid); } else if (ItemIdIsDead(hitemid)) { /* * Conjecture: if hitemid is dead then it had xids before the xids * marked on LP_NORMAL items. So we just ignore this item and move * onto the next, for the purposes of calculating * latestRemovedxids. */ } else Assert(!ItemIdIsUsed(hitemid)); UnlockReleaseBuffer(hbuffer); } UnlockReleaseBuffer(ibuffer); /* * If all heap tuples were LP_DEAD then we will be returning * InvalidTransactionId here, which avoids conflicts. This matches * existing logic which assumes that LP_DEAD tuples must already be older * than the latestRemovedXid on the cleanup record that set them as * LP_DEAD, hence must already have generated a conflict. */ return latestRemovedXid; }
/* * Returns a list of items whose visibility map information does not match * the status of the tuples on the page. * * If all_visible is passed as true, this will include all items which are * on pages marked as all-visible in the visibility map but which do not * seem to in fact be all-visible. * * If all_frozen is passed as true, this will include all items which are * on pages marked as all-frozen but which do not seem to in fact be frozen. */ static corrupt_items * collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) { Relation rel; BlockNumber nblocks; corrupt_items *items; BlockNumber blkno; Buffer vmbuffer = InvalidBuffer; BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); TransactionId OldestXmin = InvalidTransactionId; if (all_visible) { /* Don't pass rel; that will fail in recovery. */ OldestXmin = GetOldestXmin(NULL, true); } rel = relation_open(relid, AccessShareLock); if (rel->rd_rel->relkind != RELKIND_RELATION && rel->rd_rel->relkind != RELKIND_MATVIEW && rel->rd_rel->relkind != RELKIND_TOASTVALUE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a table, materialized view, or TOAST table", RelationGetRelationName(rel)))); nblocks = RelationGetNumberOfBlocks(rel); /* * Guess an initial array size. We don't expect many corrupted tuples, so * start with a small array. This function uses the "next" field to track * the next offset where we can store an item (which is the same thing as * the number of items found so far) and the "count" field to track the * number of entries allocated. We'll repurpose these fields before * returning. */ items = palloc0(sizeof(corrupt_items)); items->next = 0; items->count = 64; items->tids = palloc(items->count * sizeof(ItemPointerData)); /* Loop over every block in the relation. */ for (blkno = 0; blkno < nblocks; ++blkno) { bool check_frozen = false; bool check_visible = false; Buffer buffer; Page page; OffsetNumber offnum, maxoff; /* Make sure we are interruptible. */ CHECK_FOR_INTERRUPTS(); /* Use the visibility map to decide whether to check this page. */ if (all_frozen && VM_ALL_FROZEN(rel, blkno, &vmbuffer)) check_frozen = true; if (all_visible && VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) check_visible = true; if (!check_visible && !check_frozen) continue; /* Read and lock the page. */ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); maxoff = PageGetMaxOffsetNumber(page); /* * The visibility map bits might have changed while we were acquiring * the page lock. Recheck to avoid returning spurious results. */ if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer)) check_frozen = false; if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) check_visible = false; if (!check_visible && !check_frozen) { UnlockReleaseBuffer(buffer); continue; } /* Iterate over each tuple on the page. */ for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { HeapTupleData tuple; ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused or redirect line pointers are of no interest. */ if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid)) continue; /* Dead line pointers are neither all-visible nor frozen. */ if (ItemIdIsDead(itemid)) { ItemPointerSet(&(tuple.t_self), blkno, offnum); record_corrupt_item(items, &tuple.t_self); continue; } /* Initialize a HeapTupleData structure for checks below. */ ItemPointerSet(&(tuple.t_self), blkno, offnum); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = relid; /* * If we're checking whether the page is all-visible, we expect * the tuple to be all-visible. */ if (check_visible && !tuple_all_visible(&tuple, OldestXmin, buffer)) { TransactionId RecomputedOldestXmin; /* * Time has passed since we computed OldestXmin, so it's * possible that this tuple is all-visible in reality even * though it doesn't appear so based on our * previously-computed value. Let's compute a new value so we * can be certain whether there is a problem. * * From a concurrency point of view, it sort of sucks to * retake ProcArrayLock here while we're holding the buffer * exclusively locked, but it should be safe against * deadlocks, because surely GetOldestXmin() should never take * a buffer lock. And this shouldn't happen often, so it's * worth being careful so as to avoid false positives. */ RecomputedOldestXmin = GetOldestXmin(NULL, true); if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin)) record_corrupt_item(items, &tuple.t_self); else { OldestXmin = RecomputedOldestXmin; if (!tuple_all_visible(&tuple, OldestXmin, buffer)) record_corrupt_item(items, &tuple.t_self); } } /* * If we're checking whether the page is all-frozen, we expect the * tuple to be in a state where it will never need freezing. */ if (check_frozen) { if (heap_tuple_needs_eventual_freeze(tuple.t_data)) record_corrupt_item(items, &tuple.t_self); } } UnlockReleaseBuffer(buffer); } /* Clean up. */ if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); relation_close(rel, AccessShareLock); /* * Before returning, repurpose the fields to match caller's expectations. * next is now the next item that should be read (rather than written) and * count is now the number of items we wrote (rather than the number we * allocated). */ items->count = items->next; items->next = 0; return items; }
/* * For all items in this page, find their respective root line pointers. * If item k is part of a HOT-chain with root at item j, then we set * root_offsets[k - 1] = j. * * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. * We zero out all unused entries. * * The function must be called with at least share lock on the buffer, to * prevent concurrent prune operations. * * Note: The information collected here is valid only as long as the caller * holds a pin on the buffer. Once pin is released, a tuple might be pruned * and reused by a completely unrelated tuple. */ void heap_get_root_tuples(Page page, OffsetNumber *root_offsets) { OffsetNumber offnum, maxoff; MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber)); maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId lp = PageGetItemId(page, offnum); HeapTupleHeader htup; OffsetNumber nextoffnum; TransactionId priorXmax; /* skip unused and dead items */ if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) continue; if (ItemIdIsNormal(lp)) { htup = (HeapTupleHeader) PageGetItem(page, lp); /* * Check if this tuple is part of a HOT-chain rooted at some other * tuple. If so, skip it for now; we'll process it when we find * its root. */ if (HeapTupleHeaderIsHeapOnly(htup)) continue; /* * This is either a plain tuple or the root of a HOT-chain. * Remember it in the mapping. */ root_offsets[offnum - 1] = offnum; /* If it's not the start of a HOT-chain, we're done with it */ if (!HeapTupleHeaderIsHotUpdated(htup)) continue; /* Set up to scan the HOT-chain */ nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetXmax(htup); } else { /* Must be a redirect item. We do not set its root_offsets entry */ Assert(ItemIdIsRedirected(lp)); /* Set up to scan the HOT-chain */ nextoffnum = ItemIdGetRedirect(lp); priorXmax = InvalidTransactionId; } /* * Now follow the HOT-chain and collect other tuples in the chain. * * Note: Even though this is a nested loop, the complexity of the * function is O(N) because a tuple in the page should be visited not * more than twice, once in the outer loop and once in HOT-chain * chases. */ for (;;) { lp = PageGetItemId(page, nextoffnum); /* Check for broken chains */ if (!ItemIdIsNormal(lp)) break; htup = (HeapTupleHeader) PageGetItem(page, lp); if (TransactionIdIsValid(priorXmax) && !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) break; /* Remember the root line pointer for this item */ root_offsets[nextoffnum - 1] = offnum; /* Advance to next chain member, if any */ if (!HeapTupleHeaderIsHotUpdated(htup)) break; nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetXmax(htup); } } }
/* * PageRepairFragmentation * * Frees fragmented space on a page. * It doesn't remove unused line pointers! Please don't change this. * * This routine is usable for heap pages only, but see PageIndexMultiDelete. * * Returns number of unused line pointers on page. If "unused" is not NULL * then the unused[] array is filled with indexes of unused line pointers. */ int PageRepairFragmentation(Page page, OffsetNumber *unused) { Offset pd_lower = ((PageHeader) page)->pd_lower; Offset pd_upper = ((PageHeader) page)->pd_upper; Offset pd_special = ((PageHeader) page)->pd_special; itemIdSort itemidbase, itemidptr; ItemId lp; int nline, nused; int i; Size totallen; Offset upper; /* * It's worth the trouble to be more paranoid here than in most places, * because we are about to reshuffle data in (what is usually) a shared * disk buffer. If we aren't careful then corrupted pointers, lengths, * etc could cause us to clobber adjacent disk buffers, spreading the data * loss further. So, check everything. */ if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || pd_special > BLCKSZ || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", pd_lower, pd_upper, pd_special), errSendAlert(true))); nline = PageGetMaxOffsetNumber(page); nused = 0; for (i = 0; i < nline; i++) { lp = PageGetItemId(page, i + 1); if (ItemIdDeleted(lp)) /* marked for deletion */ lp->lp_flags &= ~(LP_USED | LP_DELETE); if (ItemIdIsUsed(lp)) nused++; else if (unused) unused[i - nused] = (OffsetNumber) i; } if (nused == 0) { /* Page is completely empty, so just reset it quickly */ for (i = 0; i < nline; i++) { lp = PageGetItemId(page, i + 1); lp->lp_len = 0; /* indicate unused & deallocated */ } ((PageHeader) page)->pd_upper = pd_special; } else { /* nused != 0 */ /* Need to compact the page the hard way */ itemidbase = (itemIdSort) palloc(sizeof(itemIdSortData) * nused); itemidptr = itemidbase; totallen = 0; for (i = 0; i < nline; i++) { lp = PageGetItemId(page, i + 1); if (ItemIdIsUsed(lp)) { itemidptr->offsetindex = i; itemidptr->itemoff = ItemIdGetOffset(lp); if (itemidptr->itemoff < (int) pd_upper || itemidptr->itemoff >= (int) pd_special) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item pointer: %u", itemidptr->itemoff), errSendAlert(true))); itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); totallen += itemidptr->alignedlen; itemidptr++; } else { lp->lp_len = 0; /* indicate unused & deallocated */ } } if (totallen > (Size) (pd_special - pd_lower)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item lengths: total %u, available space %u", (unsigned int) totallen, pd_special - pd_lower), errSendAlert(true))); /* sort itemIdSortData array into decreasing itemoff order */ qsort((char *) itemidbase, nused, sizeof(itemIdSortData), itemoffcompare); /* compactify page */ upper = pd_special; for (i = 0, itemidptr = itemidbase; i < nused; i++, itemidptr++) { lp = PageGetItemId(page, itemidptr->offsetindex + 1); upper -= itemidptr->alignedlen; memmove((char *) page + upper, (char *) page + itemidptr->itemoff, itemidptr->alignedlen); lp->lp_off = upper; } ((PageHeader) page)->pd_upper = upper; pfree(itemidbase); } /* Set hint bit for PageAddItem */ if (nused < nline) PageSetHasFreeLinePointers(page); else PageClearHasFreeLinePointers(page); return (nline - nused); }
/* * Rescan end pages to verify that they are (still) empty of tuples. * * Returns number of nondeletable pages (last nonempty page + 1). */ static BlockNumber count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) { BlockNumber blkno; /* Strange coding of loop control is needed because blkno is unsigned */ blkno = vacrelstats->rel_pages; while (blkno > vacrelstats->nonempty_pages) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool hastup; /* * We don't insert a vacuum delay point here, because we have an * exclusive lock on the table which we want to hold for as short a * time as possible. We still need to check for interrupts however. */ CHECK_FOR_INTERRUPTS(); blkno--; buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* In this phase we only need shared access to the buffer */ LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); if (PageIsNew(page) || PageIsEmpty(page)) { /* PageIsNew probably shouldn't happen... */ UnlockReleaseBuffer(buf); continue; } hastup = false; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* * Note: any non-unused item should be taken as a reason to keep * this page. We formerly thought that DEAD tuples could be * thrown away, but that's not so, because we'd not have cleaned * out their index entries. */ if (ItemIdIsUsed(itemid)) { hastup = true; break; /* can stop scanning */ } } /* scan along page */ UnlockReleaseBuffer(buf); /* Done scanning if we found a tuple here */ if (hastup) return blkno + 1; } /* * If we fall out of the loop, all the previously-thought-to-be-empty * pages still are; we need not bother to look at the last known-nonempty * page. */ return vacrelstats->nonempty_pages; }
/* ---------------------------------------------------------------- * BitmapHeapNext * * Retrieve next tuple from the BitmapHeapScan node's currentRelation * ---------------------------------------------------------------- */ static TupleTableSlot * BitmapHeapNext(BitmapHeapScanState *node) { EState *estate; ExprContext *econtext; HeapScanDesc scan; Index scanrelid; TIDBitmap *tbm; TBMIterateResult *tbmres; OffsetNumber targoffset; TupleTableSlot *slot; /* * extract necessary information from index scan node */ estate = node->ss.ps.state; econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; scan = node->ss.ss_currentScanDesc; scanrelid = ((BitmapHeapScan *) node->ss.ps.plan)->scan.scanrelid; tbm = node->tbm; tbmres = node->tbmres; /* * Check if we are evaluating PlanQual for tuple of this relation. * Additional checking is not good, but no other way for now. We could * introduce new nodes for this case and handle IndexScan --> NewNode * switching in Init/ReScan plan... */ if (estate->es_evTuple != NULL && estate->es_evTuple[scanrelid - 1] != NULL) { if (estate->es_evTupleNull[scanrelid - 1]) return ExecClearTuple(slot); ExecStoreTuple(estate->es_evTuple[scanrelid - 1], slot, InvalidBuffer, false); /* Does the tuple meet the original qual conditions? */ econtext->ecxt_scantuple = slot; ResetExprContext(econtext); if (!ExecQual(node->bitmapqualorig, econtext, false)) ExecClearTuple(slot); /* would not be returned by scan */ /* Flag for the next call that no more tuples */ estate->es_evTupleNull[scanrelid - 1] = true; return slot; } /* * If we haven't yet performed the underlying index scan, do it, and * prepare the bitmap to be iterated over. */ if (tbm == NULL) { tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node)); if (!tbm || !IsA(tbm, TIDBitmap)) elog(ERROR, "unrecognized result from subplan"); node->tbm = tbm; node->tbmres = tbmres = NULL; tbm_begin_iterate(tbm); } for (;;) { Page dp; ItemId lp; /* * Get next page of results if needed */ if (tbmres == NULL) { node->tbmres = tbmres = tbm_iterate(tbm); if (tbmres == NULL) { /* no more entries in the bitmap */ break; } /* * Ignore any claimed entries past what we think is the end of the * relation. (This is probably not necessary given that we got at * least AccessShareLock on the table before performing any of the * indexscans, but let's be safe.) */ if (tbmres->blockno >= scan->rs_nblocks) { node->tbmres = tbmres = NULL; continue; } /* * Fetch the current heap page and identify candidate tuples. */ bitgetpage(scan, tbmres); /* * Set rs_cindex to first slot to examine */ scan->rs_cindex = 0; } else { /* * Continuing in previously obtained page; advance rs_cindex */ scan->rs_cindex++; } /* * Out of range? If so, nothing more to look at on this page */ if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples) { node->tbmres = tbmres = NULL; continue; } /* * Okay to fetch the tuple */ targoffset = scan->rs_vistuples[scan->rs_cindex]; dp = (Page) BufferGetPage(scan->rs_cbuf); lp = PageGetItemId(dp, targoffset); Assert(ItemIdIsUsed(lp)); scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); scan->rs_ctup.t_len = ItemIdGetLength(lp); ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset); pgstat_count_heap_fetch(&scan->rs_pgstat_info); /* * Set up the result slot to point to this tuple. Note that the slot * acquires a pin on the buffer. */ ExecStoreTuple(&scan->rs_ctup, slot, scan->rs_cbuf, false); /* * If we are using lossy info, we have to recheck the qual conditions * at every tuple. */ if (tbmres->ntuples < 0) { econtext->ecxt_scantuple = slot; ResetExprContext(econtext); if (!ExecQual(node->bitmapqualorig, econtext, false)) { /* Fails recheck, so drop it and loop back for another */ ExecClearTuple(slot); continue; } } /* OK to return this tuple */ return slot; } /* * if we get here it means we are at the end of the scan.. */ return ExecClearTuple(slot); }
/* * PageIndexDeleteNoCompact * Delete the given items for an index page, and defragment the resulting * free space, but do not compact the item pointers array. * * itemnos is the array of tuples to delete; nitems is its size. maxIdxTuples * is the maximum number of tuples that can exist in a page. * * Unused items at the end of the array are removed. * * This is used for index AMs that require that existing TIDs of live tuples * remain unchanged. */ void PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems) { PageHeader phdr = (PageHeader) page; LocationIndex pd_lower = phdr->pd_lower; LocationIndex pd_upper = phdr->pd_upper; LocationIndex pd_special = phdr->pd_special; int nline; bool empty; OffsetNumber offnum; int nextitm; /* * As with PageRepairFragmentation, paranoia seems justified. */ if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || pd_special > BLCKSZ || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", pd_lower, pd_upper, pd_special))); /* * Scan the existing item pointer array and mark as unused those that are * in our kill-list; make sure any non-interesting ones are marked unused * as well. */ nline = PageGetMaxOffsetNumber(page); empty = true; nextitm = 0; for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) { ItemId lp; ItemLength itemlen; ItemOffset offset; lp = PageGetItemId(page, offnum); itemlen = ItemIdGetLength(lp); offset = ItemIdGetOffset(lp); if (ItemIdIsUsed(lp)) { if (offset < pd_upper || (offset + itemlen) > pd_special || offset != MAXALIGN(offset)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item pointer: offset = %u, length = %u", offset, (unsigned int) itemlen))); if (nextitm < nitems && offnum == itemnos[nextitm]) { /* this one is on our list to delete, so mark it unused */ ItemIdSetUnused(lp); nextitm++; } else if (ItemIdHasStorage(lp)) { /* This one's live -- must do the compaction dance */ empty = false; } else { /* get rid of this one too */ ItemIdSetUnused(lp); } } } /* this will catch invalid or out-of-order itemnos[] */ if (nextitm != nitems) elog(ERROR, "incorrect index offsets supplied"); if (empty) { /* Page is completely empty, so just reset it quickly */ phdr->pd_lower = SizeOfPageHeaderData; phdr->pd_upper = pd_special; } else { /* There are live items: need to compact the page the hard way */ itemIdSortData itemidbase[MaxOffsetNumber]; itemIdSort itemidptr; int i; Size totallen; /* * Scan the page taking note of each item that we need to preserve. * This includes both live items (those that contain data) and * interspersed unused ones. It's critical to preserve these unused * items, because otherwise the offset numbers for later live items * would change, which is not acceptable. Unused items might get used * again later; that is fine. */ itemidptr = itemidbase; totallen = 0; PageClearHasFreeLinePointers(page); for (i = 0; i < nline; i++) { ItemId lp; itemidptr->offsetindex = i; lp = PageGetItemId(page, i + 1); if (ItemIdHasStorage(lp)) { itemidptr->itemoff = ItemIdGetOffset(lp); itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); totallen += itemidptr->alignedlen; itemidptr++; } else { PageSetHasFreeLinePointers(page); ItemIdSetUnused(lp); } } nline = itemidptr - itemidbase; /* By here, there are exactly nline elements in itemidbase array */ if (totallen > (Size) (pd_special - pd_lower)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item lengths: total %u, available space %u", (unsigned int) totallen, pd_special - pd_lower))); /* * Defragment the data areas of each tuple, being careful to preserve * each item's position in the linp array. */ compactify_tuples(itemidbase, nline, page); } }
/* * PageAddItem * * Add an item to a page. Return value is offset at which it was * inserted, or InvalidOffsetNumber if there's not room to insert. * * If overwrite is true, we just store the item at the specified * offsetNumber (which must be either a currently-unused item pointer, * or one past the last existing item). Otherwise, * if offsetNumber is valid and <= current max offset in the page, * insert item into the array at that position by shuffling ItemId's * down to make room. * If offsetNumber is not valid, then assign one by finding the first * one that is both unused and deallocated. * * If is_heap is true, we enforce that there can't be more than * MaxHeapTuplesPerPage line pointers on the page. * * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! */ OffsetNumber PageAddItem(Page page, Item item, Size size, OffsetNumber offsetNumber, bool overwrite, bool is_heap) { PageHeader phdr = (PageHeader) page; Size alignedSize; int lower; int upper; ItemId itemId; OffsetNumber limit; bool needshuffle = false; /* * Be wary about corrupted page pointers */ if (phdr->pd_lower < SizeOfPageHeaderData || phdr->pd_lower > phdr->pd_upper || phdr->pd_upper > phdr->pd_special || phdr->pd_special > BLCKSZ) ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); /* * Select offsetNumber to place the new item at */ limit = OffsetNumberNext(PageGetMaxOffsetNumber(page)); /* was offsetNumber passed in? */ if (OffsetNumberIsValid(offsetNumber)) { /* yes, check it */ if (overwrite) { if (offsetNumber < limit) { itemId = PageGetItemId(phdr, offsetNumber); if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId)) { elog(WARNING, "will not overwrite a used ItemId"); return InvalidOffsetNumber; } } } else { if (offsetNumber < limit) needshuffle = true; /* need to move existing linp's */ } } else { /* offsetNumber was not passed in, so find a free slot */ /* if no free slot, we'll put it at limit (1st open slot) */ if (PageHasFreeLinePointers(phdr)) { /* * Look for "recyclable" (unused) ItemId. We check for no storage * as well, just to be paranoid --- unused items should never have * storage. */ for (offsetNumber = 1; offsetNumber < limit; offsetNumber++) { itemId = PageGetItemId(phdr, offsetNumber); if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId)) break; } if (offsetNumber >= limit) { /* the hint is wrong, so reset it */ PageClearHasFreeLinePointers(phdr); } } else { /* don't bother searching if hint says there's no free slot */ offsetNumber = limit; } } if (offsetNumber > limit) { elog(WARNING, "specified item offset is too large"); return InvalidOffsetNumber; } if (is_heap && offsetNumber > MaxHeapTuplesPerPage) { elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page"); return InvalidOffsetNumber; } /* * Compute new lower and upper pointers for page, see if it'll fit. * * Note: do arithmetic as signed ints, to avoid mistakes if, say, * alignedSize > pd_upper. */ if (offsetNumber == limit || needshuffle) lower = phdr->pd_lower + sizeof(ItemIdData); else lower = phdr->pd_lower; alignedSize = MAXALIGN(size); upper = (int) phdr->pd_upper - (int) alignedSize; if (lower > upper) return InvalidOffsetNumber; /* * OK to insert the item. First, shuffle the existing pointers if needed. */ itemId = PageGetItemId(phdr, offsetNumber); if (needshuffle) memmove(itemId + 1, itemId, (limit - offsetNumber) * sizeof(ItemIdData)); /* set the item pointer */ ItemIdSetNormal(itemId, upper, size); /* copy the item's data onto the page */ memcpy((char *) page + upper, item, size); /* adjust page header */ phdr->pd_lower = (LocationIndex) lower; phdr->pd_upper = (LocationIndex) upper; return offsetNumber; }
/* * Extract all item values from a BRIN index page * * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass); */ Datum brin_page_items(PG_FUNCTION_ARGS) { brin_page_state *state; FuncCallContext *fctx; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); if (SRF_IS_FIRSTCALL()) { bytea *raw_page = PG_GETARG_BYTEA_P(0); Oid indexRelid = PG_GETARG_OID(1); Page page; TupleDesc tupdesc; MemoryContext mctx; Relation indexRel; AttrNumber attno; /* minimally verify the page we got */ page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular"); /* create a function context for cross-call persistence */ fctx = SRF_FIRSTCALL_INIT(); /* switch to memory context appropriate for multiple function calls */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); indexRel = index_open(indexRelid, AccessShareLock); state = palloc(offsetof(brin_page_state, columns) + sizeof(brin_column_state) * RelationGetDescr(indexRel)->natts); state->bdesc = brin_build_desc(indexRel); state->page = page; state->offset = FirstOffsetNumber; state->unusedItem = false; state->done = false; state->dtup = NULL; /* * Initialize output functions for all indexed datatypes; simplifies * calling them later. */ for (attno = 1; attno <= state->bdesc->bd_tupdesc->natts; attno++) { Oid output; bool isVarlena; BrinOpcInfo *opcinfo; int i; brin_column_state *column; opcinfo = state->bdesc->bd_info[attno - 1]; column = palloc(offsetof(brin_column_state, outputFn) + sizeof(FmgrInfo) * opcinfo->oi_nstored); column->nstored = opcinfo->oi_nstored; for (i = 0; i < opcinfo->oi_nstored; i++) { getTypeOutputInfo(opcinfo->oi_typcache[i]->type_id, &output, &isVarlena); fmgr_info(output, &column->outputFn[i]); } state->columns[attno - 1] = column; } index_close(indexRel, AccessShareLock); fctx->user_fctx = state; fctx->tuple_desc = BlessTupleDesc(tupdesc); MemoryContextSwitchTo(mctx); } fctx = SRF_PERCALL_SETUP(); state = fctx->user_fctx; if (!state->done) { HeapTuple result; Datum values[7]; bool nulls[7]; /* * This loop is called once for every attribute of every tuple in the * page. At the start of a tuple, we get a NULL dtup; that's our * signal for obtaining and decoding the next one. If that's not the * case, we output the next attribute. */ if (state->dtup == NULL) { BrinTuple *tup; MemoryContext mctx; ItemId itemId; /* deformed tuple must live across calls */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); /* verify item status: if there's no data, we can't decode */ itemId = PageGetItemId(state->page, state->offset); if (ItemIdIsUsed(itemId)) { tup = (BrinTuple *) PageGetItem(state->page, PageGetItemId(state->page, state->offset)); state->dtup = brin_deform_tuple(state->bdesc, tup); state->attno = 1; state->unusedItem = false; } else state->unusedItem = true; MemoryContextSwitchTo(mctx); } else state->attno++; MemSet(nulls, 0, sizeof(nulls)); if (state->unusedItem) { values[0] = UInt16GetDatum(state->offset); nulls[1] = true; nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; } else { int att = state->attno - 1; values[0] = UInt16GetDatum(state->offset); values[1] = UInt32GetDatum(state->dtup->bt_blkno); values[2] = UInt16GetDatum(state->attno); values[3] = BoolGetDatum(state->dtup->bt_columns[att].bv_allnulls); values[4] = BoolGetDatum(state->dtup->bt_columns[att].bv_hasnulls); values[5] = BoolGetDatum(state->dtup->bt_placeholder); if (!state->dtup->bt_columns[att].bv_allnulls) { BrinValues *bvalues = &state->dtup->bt_columns[att]; StringInfoData s; bool first; int i; initStringInfo(&s); appendStringInfoChar(&s, '{'); first = true; for (i = 0; i < state->columns[att]->nstored; i++) { char *val; if (!first) appendStringInfoString(&s, " .. "); first = false; val = OutputFunctionCall(&state->columns[att]->outputFn[i], bvalues->bv_values[i]); appendStringInfoString(&s, val); pfree(val); } appendStringInfoChar(&s, '}'); values[6] = CStringGetTextDatum(s.data); pfree(s.data); } else { nulls[6] = true; } } result = heap_form_tuple(fctx->tuple_desc, values, nulls); /* * If the item was unused, jump straight to the next one; otherwise, * the only cleanup needed here is to set our signal to go to the next * tuple in the following iteration, by freeing the current one. */ if (state->unusedItem) state->offset = OffsetNumberNext(state->offset); else if (state->attno >= state->bdesc->bd_tupdesc->natts) { pfree(state->dtup); state->dtup = NULL; state->offset = OffsetNumberNext(state->offset); } /* * If we're beyond the end of the page, set flag to end the function * in the following iteration. */ if (state->offset > PageGetMaxOffsetNumber(state->page)) state->done = true; SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result)); } brin_free_desc(state->bdesc); SRF_RETURN_DONE(fctx); }
/* * Rescan end pages to verify that they are (still) empty of tuples. * * Returns number of nondeletable pages (last nonempty page + 1). */ static BlockNumber count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats) { BlockNumber blkno; instr_time starttime; instr_time currenttime; instr_time elapsed; /* Initialize the starttime if we check for conflicting lock requests */ INSTR_TIME_SET_CURRENT(starttime); /* Strange coding of loop control is needed because blkno is unsigned */ blkno = vacrelstats->rel_pages; while (blkno > vacrelstats->nonempty_pages) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool hastup; /* * Check if another process requests a lock on our relation. We are * holding an AccessExclusiveLock here, so they will be waiting. We * only do this in autovacuum_truncate_lock_check millisecond * intervals, and we only check if that interval has elapsed once * every 32 blocks to keep the number of system calls and actual * shared lock table lookups to a minimum. */ if ((blkno % 32) == 0) { INSTR_TIME_SET_CURRENT(currenttime); elapsed = currenttime; INSTR_TIME_SUBTRACT(elapsed, starttime); if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000) >= AUTOVACUUM_TRUNCATE_LOCK_CHECK_INTERVAL) { if (LockHasWaitersRelation(onerel, AccessExclusiveLock)) { ereport(elevel, (errmsg("\"%s\": suspending truncate " "due to conflicting lock request", RelationGetRelationName(onerel)))); vacrelstats->lock_waiter_detected = true; return blkno; } starttime = currenttime; } } /* * We don't insert a vacuum delay point here, because we have an * exclusive lock on the table which we want to hold for as short a * time as possible. We still need to check for interrupts however. */ CHECK_FOR_INTERRUPTS(); blkno--; buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* In this phase we only need shared access to the buffer */ LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); if (PageIsNew(page) || PageIsEmpty(page)) { /* PageIsNew probably shouldn't happen... */ UnlockReleaseBuffer(buf); continue; } hastup = false; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* * Note: any non-unused item should be taken as a reason to keep * this page. We formerly thought that DEAD tuples could be * thrown away, but that's not so, because we'd not have cleaned * out their index entries. */ if (ItemIdIsUsed(itemid)) { hastup = true; break; /* can stop scanning */ } } /* scan along page */ UnlockReleaseBuffer(buf); /* Done scanning if we found a tuple here */ if (hastup) return blkno + 1; } /* * If we fall out of the loop, all the previously-thought-to-be-empty * pages still are; we need not bother to look at the last known-nonempty * page. */ return vacrelstats->nonempty_pages; }
/* * PageRepairFragmentation * * Frees fragmented space on a page. * It doesn't remove unused line pointers! Please don't change this. * * This routine is usable for heap pages only, but see PageIndexMultiDelete. * * As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated. */ void PageRepairFragmentation(Page page) { Offset pd_lower = ((PageHeader) page)->pd_lower; Offset pd_upper = ((PageHeader) page)->pd_upper; Offset pd_special = ((PageHeader) page)->pd_special; ItemId lp; int nline, nstorage, nunused; int i; Size totallen; /* * It's worth the trouble to be more paranoid here than in most places, * because we are about to reshuffle data in (what is usually) a shared * disk buffer. If we aren't careful then corrupted pointers, lengths, * etc could cause us to clobber adjacent disk buffers, spreading the data * loss further. So, check everything. */ if (pd_lower < SizeOfPageHeaderData || pd_lower > pd_upper || pd_upper > pd_special || pd_special > BLCKSZ || pd_special != MAXALIGN(pd_special)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", pd_lower, pd_upper, pd_special))); nline = PageGetMaxOffsetNumber(page); nunused = nstorage = 0; for (i = FirstOffsetNumber; i <= nline; i++) { lp = PageGetItemId(page, i); if (ItemIdIsUsed(lp)) { if (ItemIdHasStorage(lp)) nstorage++; } else { /* Unused entries should have lp_len = 0, but make sure */ ItemIdSetUnused(lp); nunused++; } } if (nstorage == 0) { /* Page is completely empty, so just reset it quickly */ ((PageHeader) page)->pd_upper = pd_special; } else { /* Need to compact the page the hard way */ itemIdSortData itemidbase[MaxHeapTuplesPerPage]; itemIdSort itemidptr = itemidbase; totallen = 0; for (i = 0; i < nline; i++) { lp = PageGetItemId(page, i + 1); if (ItemIdHasStorage(lp)) { itemidptr->offsetindex = i; itemidptr->itemoff = ItemIdGetOffset(lp); if (itemidptr->itemoff < (int) pd_upper || itemidptr->itemoff >= (int) pd_special) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item pointer: %u", itemidptr->itemoff))); itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); totallen += itemidptr->alignedlen; itemidptr++; } } if (totallen > (Size) (pd_special - pd_lower)) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("corrupted item lengths: total %u, available space %u", (unsigned int) totallen, pd_special - pd_lower))); compactify_tuples(itemidbase, nstorage, page); } /* Set hint bit for PageAddItem */ if (nunused > 0) PageSetHasFreeLinePointers(page); else PageClearHasFreeLinePointers(page); }
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, List *updated_stats) { MIRROREDLOCK_BUFMGR_DECLARE; BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; int reindex_count = 1; PGRUsage ru0; /* Fetch gp_persistent_relation_node information that will be added to XLOG record. */ RelationFetchGpRelationNodeForXLog(onerel); pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->nonempty_pages = 0; lazy_space_alloc(vacrelstats, nblocks); for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; vacuum_delay_point(); /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); /* must record in xlog so that changetracking will know about this change */ log_heap_newpage(onerel, page, blkno); empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); } MarkBufferDirty(buf); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } if (PageIsEmpty(page)) { empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, false); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(onerel, tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. */ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); tups_vacuumed += 1; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, &FreezeLimit, InvalidBuffer, false)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); /* no XLOG for temp tables, though */ if (!onerel->rd_istemp) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) { lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); } /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ } /* save stats for use later */ vacrelstats->rel_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats, updated_stats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages contain useful free space.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, vacrelstats->tot_free_pages, empty_pages, pg_rusage_show(&ru0)))); }
/* * bitgetpage - subroutine for BitmapHeapNext() * * This routine reads and pins the specified page of the relation, then * builds an array indicating which tuples on the page are both potentially * interesting according to the bitmap, and visible according to the snapshot. */ static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) { BlockNumber page = tbmres->blockno; Buffer buffer; Snapshot snapshot; Page dp; int ntup; int curslot; int minslot; int maxslot; int maxoff; /* * Acquire pin on the target heap page, trading in any pin we held before. */ Assert(page < scan->rs_nblocks); scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf, scan->rs_rd, page); buffer = scan->rs_cbuf; snapshot = scan->rs_snapshot; /* * We must hold share lock on the buffer content while examining tuple * visibility. Afterwards, however, the tuples we have found to be * visible are guaranteed good as long as we hold the buffer pin. */ LockBuffer(buffer, BUFFER_LOCK_SHARE); dp = (Page) BufferGetPage(buffer); maxoff = PageGetMaxOffsetNumber(dp); /* * Determine how many entries we need to look at on this page. If the * bitmap is lossy then we need to look at each physical item pointer; * otherwise we just look through the offsets listed in tbmres. */ if (tbmres->ntuples >= 0) { /* non-lossy case */ minslot = 0; maxslot = tbmres->ntuples - 1; } else { /* lossy case */ minslot = FirstOffsetNumber; maxslot = maxoff; } ntup = 0; for (curslot = minslot; curslot <= maxslot; curslot++) { OffsetNumber targoffset; ItemId lp; HeapTupleData loctup; bool valid; if (tbmres->ntuples >= 0) { /* non-lossy case */ targoffset = tbmres->offsets[curslot]; } else { /* lossy case */ targoffset = (OffsetNumber) curslot; } /* * We'd better check for out-of-range offnum in case of VACUUM since * the TID was obtained. */ if (targoffset < FirstOffsetNumber || targoffset > maxoff) continue; lp = PageGetItemId(dp, targoffset); /* * Must check for deleted tuple. */ if (!ItemIdIsUsed(lp)) continue; /* * check time qualification of tuple, remember it if valid */ loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); loctup.t_len = ItemIdGetLength(lp); ItemPointerSet(&(loctup.t_self), page, targoffset); valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); if (valid) scan->rs_vistuples[ntup++] = targoffset; } LockBuffer(buffer, BUFFER_LOCK_UNLOCK); Assert(ntup <= MaxHeapTuplesPerPage); scan->rs_ntuples = ntup; }
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. */ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. */ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); if (!PageIsAllVisible(page)) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. */ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* Update the all-visible flag on the page */ if (!PageIsAllVisible(page) && all_visible) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. * * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. */ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); SetBufferCommitInfoNeedsSave(buf); /* * Normally, we would drop the lock on the heap page before * updating the visibility map, but since this case shouldn't * happen anyway, don't worry about that. */ visibilitymap_clear(onerel, blkno); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm && all_visible) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Release the pin on the visibility map page */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
/* * Prune and repair fragmentation in the specified page. * * Caller must have pin and buffer cleanup lock on the page. * * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). * * If redirect_move is set, we remove redirecting line pointers by * updating the root line pointer to point directly to the first non-dead * tuple in the chain. NOTE: eliminating the redirect changes the first * tuple's effective CTID, and is therefore unsafe except within VACUUM FULL. * The only reason we support this capability at all is that by using it, * VACUUM FULL need not cope with LP_REDIRECT items at all; which seems a * good thing since VACUUM FULL is overly complicated already. * * If report_stats is true then we send the number of reclaimed heap-only * tuples to pgstats. (This must be FALSE during vacuum, since vacuum will * send its own new total to pgstats, and we don't want this delta applied * on top of that.) * * Returns the number of tuples deleted from the page. */ int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, bool redirect_move, bool report_stats) { int ndeleted = 0; Page page = BufferGetPage(buffer); OffsetNumber offnum, maxoff; PruneState prstate; /* * Our strategy is to scan the page and make lists of items to change, * then apply the changes within a critical section. This keeps as much * logic as possible out of the critical section, and also ensures that * WAL replay will work the same as the normal case. * * First, inform inval.c that upcoming CacheInvalidateHeapTuple calls are * nontransactional. */ if (redirect_move) BeginNonTransactionalInvalidation(); /* * Initialize the new pd_prune_xid value to zero (indicating no prunable * tuples). If we find any tuples which may soon become prunable, we will * save the lowest relevant XID in new_prune_xid. Also initialize the rest * of our working state. */ prstate.new_prune_xid = InvalidTransactionId; prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); /* Scan the page */ maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; /* Ignore items already processed as part of an earlier chain */ if (prstate.marked[offnum]) continue; /* Nothing to do if slot is empty or already dead */ itemid = PageGetItemId(page, offnum); if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) continue; /* Process this item or chain of items */ ndeleted += heap_prune_chain(relation, buffer, offnum, OldestXmin, &prstate, redirect_move); } /* * Send invalidation messages for any tuples we are about to move. It is * safe to do this now, even though we could theoretically still fail * before making the actual page update, because a useless cache * invalidation doesn't hurt anything. Also, no one else can reload the * tuples while we have exclusive buffer lock, so it's not too early to * send the invals. This avoids sending the invals while inside the * critical section, which is a good thing for robustness. */ if (redirect_move) EndNonTransactionalInvalidation(); /* Any error while applying the changes is critical */ START_CRIT_SECTION(); /* Have we found any prunable items? */ if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0) { /* * Apply the planned item changes, then repair page fragmentation, and * update the page's hint bit about whether it has free line pointers. */ heap_page_prune_execute(buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, redirect_move); /* * Update the page's pd_prune_xid field to either zero, or the lowest * XID of any soon-prunable tuple. */ ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; /* * Also clear the "page is full" flag, since there's no point in * repeating the prune/defrag process until something else happens to * the page. */ PageClearFull(page); MarkBufferDirty(buffer); /* * Emit a WAL HEAP_CLEAN or HEAP_CLEAN_MOVE record showing what we did */ if (!relation->rd_istemp) { XLogRecPtr recptr; recptr = log_heap_clean(relation, buffer, prstate.redirected, prstate.nredirected, prstate.nowdead, prstate.ndead, prstate.nowunused, prstate.nunused, redirect_move); PageSetLSN(BufferGetPage(buffer), recptr); } } else { /* * If we didn't prune anything, but have found a new value for the * pd_prune_xid field, update it and mark the buffer dirty. This is * treated as a non-WAL-logged hint. * * Also clear the "page is full" flag if it is set, since there's no * point in repeating the prune/defrag process until something else * happens to the page. */ if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid || PageIsFull(page)) { ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid; PageClearFull(page); MarkBufferDirtyHint(buffer, relation); } } END_CRIT_SECTION(); /* * If requested, report the number of tuples reclaimed to pgstats. This is * ndeleted minus ndead, because we don't want to count a now-DEAD root * item as a deletion for this purpose. */ if (report_stats && ndeleted > prstate.ndead) pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead); /* * XXX Should we update the FSM information of this page ? * * There are two schools of thought here. We may not want to update FSM * information so that the page is not used for unrelated UPDATEs/INSERTs * and any free space in this page will remain available for further * UPDATEs in *this* page, thus improving chances for doing HOT updates. * * But for a large table and where a page does not receive further UPDATEs * for a long time, we might waste this space by not updating the FSM * information. The relation may get extended and fragmented further. * * One possibility is to leave "fillfactor" worth of space in this page * and update FSM with the remaining space. * * In any case, the current FSM implementation doesn't accept * one-page-at-a-time updates, so this is all academic for now. */ return ndeleted; }
/* * Extract all item values from a BRIN index page * * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass); */ Datum brin_page_items(PG_FUNCTION_ARGS) { bytea *raw_page = PG_GETARG_BYTEA_P(0); Oid indexRelid = PG_GETARG_OID(1); ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; TupleDesc tupdesc; MemoryContext oldcontext; Tuplestorestate *tupstore; Relation indexRel; brin_column_state **columns; BrinDesc *bdesc; BrinMemTuple *dtup; Page page; OffsetNumber offset; AttrNumber attno; bool unusedItem; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); /* check to see if caller supports us returning a tuplestore */ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("set-valued function called in context that cannot accept a set"))); if (!(rsinfo->allowedModes & SFRM_Materialize) || rsinfo->expectedDesc == NULL) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("materialize mode required, but it is not allowed in this context"))); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); /* Build tuplestore to hold the result rows */ oldcontext = MemoryContextSwitchTo(rsinfo->econtext->ecxt_per_query_memory); tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; rsinfo->setDesc = tupdesc; MemoryContextSwitchTo(oldcontext); indexRel = index_open(indexRelid, AccessShareLock); bdesc = brin_build_desc(indexRel); /* minimally verify the page we got */ page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular"); /* * Initialize output functions for all indexed datatypes; simplifies * calling them later. */ columns = palloc(sizeof(brin_column_state *) * RelationGetDescr(indexRel)->natts); for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++) { Oid output; bool isVarlena; BrinOpcInfo *opcinfo; int i; brin_column_state *column; opcinfo = bdesc->bd_info[attno - 1]; column = palloc(offsetof(brin_column_state, outputFn) + sizeof(FmgrInfo) * opcinfo->oi_nstored); column->nstored = opcinfo->oi_nstored; for (i = 0; i < opcinfo->oi_nstored; i++) { getTypeOutputInfo(opcinfo->oi_typcache[i]->type_id, &output, &isVarlena); fmgr_info(output, &column->outputFn[i]); } columns[attno - 1] = column; } offset = FirstOffsetNumber; unusedItem = false; dtup = NULL; for (;;) { Datum values[7]; bool nulls[7]; /* * This loop is called once for every attribute of every tuple in the * page. At the start of a tuple, we get a NULL dtup; that's our * signal for obtaining and decoding the next one. If that's not the * case, we output the next attribute. */ if (dtup == NULL) { ItemId itemId; /* verify item status: if there's no data, we can't decode */ itemId = PageGetItemId(page, offset); if (ItemIdIsUsed(itemId)) { dtup = brin_deform_tuple(bdesc, (BrinTuple *) PageGetItem(page, itemId)); attno = 1; unusedItem = false; } else unusedItem = true; } else attno++; MemSet(nulls, 0, sizeof(nulls)); if (unusedItem) { values[0] = UInt16GetDatum(offset); nulls[1] = true; nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; } else { int att = attno - 1; values[0] = UInt16GetDatum(offset); values[1] = UInt32GetDatum(dtup->bt_blkno); values[2] = UInt16GetDatum(attno); values[3] = BoolGetDatum(dtup->bt_columns[att].bv_allnulls); values[4] = BoolGetDatum(dtup->bt_columns[att].bv_hasnulls); values[5] = BoolGetDatum(dtup->bt_placeholder); if (!dtup->bt_columns[att].bv_allnulls) { BrinValues *bvalues = &dtup->bt_columns[att]; StringInfoData s; bool first; int i; initStringInfo(&s); appendStringInfoChar(&s, '{'); first = true; for (i = 0; i < columns[att]->nstored; i++) { char *val; if (!first) appendStringInfoString(&s, " .. "); first = false; val = OutputFunctionCall(&columns[att]->outputFn[i], bvalues->bv_values[i]); appendStringInfoString(&s, val); pfree(val); } appendStringInfoChar(&s, '}'); values[6] = CStringGetTextDatum(s.data); pfree(s.data); } else { nulls[6] = true; } } tuplestore_putvalues(tupstore, tupdesc, values, nulls); /* * If the item was unused, jump straight to the next one; otherwise, * the only cleanup needed here is to set our signal to go to the next * tuple in the following iteration, by freeing the current one. */ if (unusedItem) offset = OffsetNumberNext(offset); else if (attno >= bdesc->bd_tupdesc->natts) { pfree(dtup); dtup = NULL; offset = OffsetNumberNext(offset); } /* * If we're beyond the end of the page, we're done. */ if (offset > PageGetMaxOffsetNumber(page)) break; } /* clean up and return the tuplestore */ brin_free_desc(bdesc); tuplestore_donestoring(tupstore); index_close(indexRel, AccessShareLock); return (Datum) 0; }
/* * Prune specified item pointer or a HOT chain originating at that item. * * If the item is an index-referenced tuple (i.e. not a heap-only tuple), * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really * DEAD, the OldestXmin test is just too coarse to detect it. * * The root line pointer is redirected to the tuple immediately after the * latest DEAD tuple. If all tuples in the chain are DEAD, the root line * pointer is marked LP_DEAD. (This includes the case of a DEAD simple * tuple, which we treat as a chain of length 1.) * * OldestXmin is the cutoff XID used to identify dead tuples. * * We don't actually change the page here, except perhaps for hint-bit updates * caused by HeapTupleSatisfiesVacuum. We just add entries to the arrays in * prstate showing the changes to be made. Items to be redirected are added * to the redirected[] array (two entries per redirection); items to be set to * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED * state are added to nowunused[]. * * If redirect_move is true, we intend to get rid of redirecting line pointers, * not just make redirection entries. * * Returns the number of tuples (to be) deleted from the page. */ static int heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, TransactionId OldestXmin, PruneState *prstate, bool redirect_move) { int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); TransactionId priorXmax = InvalidTransactionId; ItemId rootlp; HeapTupleHeader htup; OffsetNumber latestdead = InvalidOffsetNumber, redirect_target = InvalidOffsetNumber, maxoff = PageGetMaxOffsetNumber(dp), offnum; OffsetNumber chainitems[MaxHeapTuplesPerPage]; int nchain = 0, i; rootlp = PageGetItemId(dp, rootoffnum); /* * If it's a heap-only tuple, then it is not the start of a HOT chain. */ if (ItemIdIsNormal(rootlp)) { htup = (HeapTupleHeader) PageGetItem(dp, rootlp); if (HeapTupleHeaderIsHeapOnly(htup)) { /* * If the tuple is DEAD and doesn't chain to anything else, mark * it unused immediately. (If it does chain, we can only remove * it as part of pruning its chain.) * * We need this primarily to handle aborted HOT updates, that is, * XMIN_INVALID heap-only tuples. Those might not be linked to by * any chain, since the parent tuple might be re-updated before * any pruning occurs. So we have to be able to reap them * separately from chain-pruning. (Note that * HeapTupleHeaderIsHotUpdated will never return true for an * XMIN_INVALID tuple, so this code will work even when there were * sequential updates within the aborted transaction.) * * Note that we might first arrive at a dead heap-only tuple * either here or while following a chain below. Whichever path * gets there first will mark the tuple unused. */ if (HeapTupleSatisfiesVacuum(relation, htup, OldestXmin, buffer) == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); ndeleted++; } /* Nothing more to do */ return ndeleted; } } /* Start from the root tuple */ offnum = rootoffnum; /* while not end of the chain */ for (;;) { ItemId lp; bool tupdead, recent_dead; /* Some sanity checks */ if (offnum < FirstOffsetNumber || offnum > maxoff) break; /* If item is already processed, stop --- it must not be same chain */ if (prstate->marked[offnum]) break; lp = PageGetItemId(dp, offnum); /* Unused item obviously isn't part of the chain */ if (!ItemIdIsUsed(lp)) break; /* * If we are looking at the redirected root line pointer, jump to the * first normal tuple in the chain. If we find a redirect somewhere * else, stop --- it must not be same chain. */ if (ItemIdIsRedirected(lp)) { if (nchain > 0) break; /* not at start of chain */ chainitems[nchain++] = offnum; offnum = ItemIdGetRedirect(rootlp); continue; } /* * Likewise, a dead item pointer can't be part of the chain. (We * already eliminated the case of dead root tuple outside this * function.) */ if (ItemIdIsDead(lp)) break; Assert(ItemIdIsNormal(lp)); htup = (HeapTupleHeader) PageGetItem(dp, lp); /* * Check the tuple XMIN against prior XMAX, if any */ if (TransactionIdIsValid(priorXmax) && !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) break; /* * OK, this tuple is indeed a member of the chain. */ chainitems[nchain++] = offnum; /* * Check tuple's visibility status. */ tupdead = recent_dead = false; switch (HeapTupleSatisfiesVacuum(relation, htup, OldestXmin, buffer)) { case HEAPTUPLE_DEAD: tupdead = true; break; case HEAPTUPLE_RECENTLY_DEAD: recent_dead = true; /* * This tuple may soon become DEAD. Update the hint field so * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, HeapTupleHeaderGetXmax(htup)); break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* * This tuple may soon become DEAD. Update the hint field so * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, HeapTupleHeaderGetXmax(htup)); break; case HEAPTUPLE_LIVE: case HEAPTUPLE_INSERT_IN_PROGRESS: /* * If we wanted to optimize for aborts, we might consider * marking the page prunable when we see INSERT_IN_PROGRESS. * But we don't. See related decisions about when to mark the * page prunable in heapam.c. */ break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } /* * Remember the last DEAD tuple seen. We will advance past * RECENTLY_DEAD tuples just in case there's a DEAD one after them; * but we can't advance past anything else. (XXX is it really worth * continuing to scan beyond RECENTLY_DEAD? The case where we will * find another DEAD tuple is a fairly unusual corner case.) */ if (tupdead) latestdead = offnum; else if (!recent_dead) break; /* * If the tuple is not HOT-updated, then we are at the end of this * HOT-update chain. */ if (!HeapTupleHeaderIsHotUpdated(htup)) break; /* * Advance to next chain member. */ Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetXmax(htup); } /* * If we found a DEAD tuple in the chain, adjust the HOT chain so that all * the DEAD tuples at the start of the chain are removed and the root line * pointer is appropriately redirected. */ if (OffsetNumberIsValid(latestdead)) { /* * Mark as unused each intermediate item that we are able to remove * from the chain. * * When the previous item is the last dead tuple seen, we are at the * right candidate for redirection. */ for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) { heap_prune_record_unused(prstate, chainitems[i]); ndeleted++; } /* * If the root entry had been a normal tuple, we are deleting it, so * count it in the result. But changing a redirect (even to DEAD * state) doesn't count. */ if (ItemIdIsNormal(rootlp)) ndeleted++; /* * If the DEAD tuple is at the end of the chain, the entire chain is * dead and the root line pointer can be marked dead. Otherwise just * redirect the root to the correct chain member. */ if (i >= nchain) heap_prune_record_dead(prstate, rootoffnum); else { heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); /* If the redirection will be a move, need more processing */ if (redirect_move) redirect_target = chainitems[i]; } } else if (nchain < 2 && ItemIdIsRedirected(rootlp)) { /* * We found a redirect item that doesn't point to a valid follow-on * item. This can happen if the loop in heap_page_prune caused us to * visit the dead successor of a redirect item before visiting the * redirect item. We can clean up by setting the redirect item to * DEAD state. */ heap_prune_record_dead(prstate, rootoffnum); } else if (redirect_move && ItemIdIsRedirected(rootlp)) { /* * If we desire to eliminate LP_REDIRECT items by moving tuples, make * a redirection entry for each redirected root item; this will cause * heap_page_prune_execute to actually do the move. (We get here only * when there are no DEAD tuples in the chain; otherwise the * redirection entry was made above.) */ heap_prune_record_redirect(prstate, rootoffnum, chainitems[1]); redirect_target = chainitems[1]; } /* * If we are going to implement a redirect by moving tuples, we have to * issue a cache invalidation against the redirection target tuple, * because its CTID will be effectively changed by the move. Note that * CacheInvalidateHeapTuple only queues the request, it doesn't send it; * if we fail before reaching EndNonTransactionalInvalidation, nothing * happens and no harm is done. */ if (OffsetNumberIsValid(redirect_target)) { ItemId firstlp = PageGetItemId(dp, redirect_target); HeapTupleData firsttup; Assert(ItemIdIsNormal(firstlp)); /* Set up firsttup to reference the tuple at its existing CTID */ firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp); firsttup.t_len = ItemIdGetLength(firstlp); ItemPointerSet(&firsttup.t_self, BufferGetBlockNumber(buffer), redirect_target); CacheInvalidateHeapTuple(relation, &firsttup); } return ndeleted; }