/*
 *  lazy_vacuum_page() -- free dead tuples on a page
 *                        and repair its fragmentation.
 *
 * Caller must hold pin and buffer cleanup lock on the buffer.
 *
 * tupindex is the index in vacrelstats->dead_tuples of the first dead
 * tuple for this page.  We assume the rest follow sequentially.
 * The return value is the first tupindex after the tuples of this page.
 */
static int
lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
                 int tupindex, LVRelStats *vacrelstats)
{
    Page        page = BufferGetPage(buffer);
    OffsetNumber unused[MaxOffsetNumber];
    int         uncnt = 0;

    MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

    START_CRIT_SECTION();

    for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
    {
        BlockNumber tblk;
        OffsetNumber toff;
        ItemId      itemid;

        tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
        if (tblk != blkno)
            break;              /* past end of tuples for this block */
        toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
        itemid = PageGetItemId(page, toff);
        ItemIdSetUnused(itemid);
        unused[uncnt++] = toff;
    }

    PageRepairFragmentation(page);

    MarkBufferDirty(buffer);

    /* XLOG stuff */
    if (!onerel->rd_istemp)
    {
        XLogRecPtr  recptr;

        recptr = log_heap_clean(onerel, buffer,
                                NULL, 0, NULL, 0,
                                unused, uncnt,
                                false);
        PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
    }

    END_CRIT_SECTION();

    return tupindex;
}
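/*
 * Illustrative sketch (not part of the source above): the typical caller
 * pattern for lazy_vacuum_page(), modeled on lazy_vacuum_heap().  The loop
 * walks the sorted dead_tuples array; each call consumes the TIDs belonging
 * to one block and returns the index of the first TID of the next block.
 * Buffer access strategy, vacuum delay points, free-space recording, and the
 * mirrored-lock bookkeeping required by the callee are omitted; the function
 * name is hypothetical.
 */
static void
lazy_vacuum_heap_sketch(Relation onerel, LVRelStats *vacrelstats)
{
    int         tupindex = 0;

    while (tupindex < vacrelstats->num_dead_tuples)
    {
        BlockNumber tblk;
        Buffer      buf;

        tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
        buf = ReadBuffer(onerel, tblk);
        LockBufferForCleanup(buf);      /* cleanup lock required by callee */
        tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
        UnlockReleaseBuffer(buf);
    }
}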
/*
 * Perform the actual page changes needed by heap_page_prune.
 * It is expected that the caller has suitable pin and lock on the
 * buffer, and is inside a critical section.
 *
 * This is split out because it is also used by heap_xlog_clean()
 * to replay the WAL record when needed after a crash.  Note that the
 * arguments are identical to those of log_heap_clean().
 */
void
heap_page_prune_execute(Buffer buffer,
                        OffsetNumber *redirected, int nredirected,
                        OffsetNumber *nowdead, int ndead,
                        OffsetNumber *nowunused, int nunused)
{
    Page        page = (Page) BufferGetPage(buffer);
    OffsetNumber *offnum;
    int         i;

    /* Update all redirected line pointers */
    offnum = redirected;
    for (i = 0; i < nredirected; i++)
    {
        OffsetNumber fromoff = *offnum++;
        OffsetNumber tooff = *offnum++;
        ItemId      fromlp = PageGetItemId(page, fromoff);

        ItemIdSetRedirect(fromlp, tooff);
    }

    /* Update all now-dead line pointers */
    offnum = nowdead;
    for (i = 0; i < ndead; i++)
    {
        OffsetNumber off = *offnum++;
        ItemId      lp = PageGetItemId(page, off);

        ItemIdSetDead(lp);
    }

    /* Update all now-unused line pointers */
    offnum = nowunused;
    for (i = 0; i < nunused; i++)
    {
        OffsetNumber off = *offnum++;
        ItemId      lp = PageGetItemId(page, off);

        ItemIdSetUnused(lp);
    }

    /*
     * Finally, repair any fragmentation, and update the page's hint bit about
     * whether it has free pointers.
     */
    PageRepairFragmentation(page);
}
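/*
 * Illustrative sketch (not from the source above): how a pruning caller
 * might assemble the three offset arrays and invoke the function.  Note
 * that "redirected" holds 2 * nredirected entries, packed as (from, to)
 * pairs, which is why the loop above consumes two offsets per iteration.
 * The function name, the example offsets, and the omission of WAL logging
 * are all simplifications for illustration.
 */
static void
prune_page_sketch(Buffer buffer)
{
    OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
    OffsetNumber nowdead[MaxHeapTuplesPerPage];
    OffsetNumber nowunused[MaxHeapTuplesPerPage];
    int         nredirected = 0;
    int         ndead = 0;
    int         nunused = 0;

    /* ... decide the fate of each HOT chain, filling the arrays ... */

    /* e.g. record that item 3 now redirects to item 7 */
    redirected[nredirected * 2] = 3;
    redirected[nredirected * 2 + 1] = 7;
    nredirected++;

    START_CRIT_SECTION();
    heap_page_prune_execute(buffer,
                            redirected, nredirected,
                            nowdead, ndead,
                            nowunused, nunused);
    MarkBufferDirty(buffer);
    END_CRIT_SECTION();
}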
/*
 * PageIndexTupleDeleteNoCompact
 *
 * Remove the specified tuple from an index page, but set its line pointer
 * to "unused" instead of compacting it out, except that it can be removed
 * if it's the last line pointer on the page.
 *
 * This is used for index AMs that require that existing TIDs of live tuples
 * remain unchanged, and are willing to allow unused line pointers instead.
 */
void
PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum)
{
    PageHeader  phdr = (PageHeader) page;
    char       *addr;
    ItemId      tup;
    Size        size;
    unsigned    offset;
    int         nline;

    /*
     * As with PageRepairFragmentation, paranoia seems justified.
     */
    if (phdr->pd_lower < SizeOfPageHeaderData ||
        phdr->pd_lower > phdr->pd_upper ||
        phdr->pd_upper > phdr->pd_special ||
        phdr->pd_special > BLCKSZ ||
        phdr->pd_special != MAXALIGN(phdr->pd_special))
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
                        phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));

    nline = PageGetMaxOffsetNumber(page);
    if ((int) offnum <= 0 || (int) offnum > nline)
        elog(ERROR, "invalid index offnum: %u", offnum);

    tup = PageGetItemId(page, offnum);
    Assert(ItemIdHasStorage(tup));
    size = ItemIdGetLength(tup);
    offset = ItemIdGetOffset(tup);

    if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special ||
        offset != MAXALIGN(offset))
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("corrupted item pointer: offset = %u, size = %u",
                        offset, (unsigned int) size)));

    /* Amount of space to actually be deleted */
    size = MAXALIGN(size);

    /*
     * Either set the item pointer to "unused", or zap it if it's the last
     * one.  (Note: it's possible that the next-to-last one(s) are already
     * unused, but we do not trouble to try to compact them out if so.)
     */
    if ((int) offnum < nline)
        ItemIdSetUnused(tup);
    else
    {
        phdr->pd_lower -= sizeof(ItemIdData);
        nline--;                /* there's one less than when we started */
    }

    /*
     * Now move everything between the old upper bound (beginning of tuple
     * space) and the beginning of the deleted tuple forward, so that space in
     * the middle of the page is left free.  If we've just deleted the tuple
     * at the beginning of tuple space, then there's no need to do the copy.
     */

    /* beginning of tuple space */
    addr = (char *) page + phdr->pd_upper;

    if (offset > phdr->pd_upper)
        memmove(addr + size, addr, offset - phdr->pd_upper);

    /* adjust free space boundary pointer */
    phdr->pd_upper += size;

    /*
     * Finally, we need to adjust the linp entries that remain.
     *
     * Anything that used to be before the deleted tuple's data was moved
     * forward by the size of the deleted tuple.
     */
    if (!PageIsEmpty(page))
    {
        int         i;

        for (i = 1; i <= nline; i++)
        {
            ItemId      ii = PageGetItemId(phdr, i);

            if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset)
                ii->lp_off += size;
        }
    }
}
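/*
 * Illustrative sketch (not from the source above): the key property of
 * PageIndexTupleDeleteNoCompact() is that surviving items keep their offset
 * numbers, unlike the compacting delete paths.  The page setup below is
 * schematic; "item", "itemsz", and the function name are placeholders.
 */
static void
delete_no_compact_sketch(Page page, Item item, Size itemsz)
{
    OffsetNumber off1,
                off2,
                off3;

    PageInit(page, BLCKSZ, 0);
    off1 = PageAddItem(page, item, itemsz, InvalidOffsetNumber, false, false);
    off2 = PageAddItem(page, item, itemsz, InvalidOffsetNumber, false, false);
    off3 = PageAddItem(page, item, itemsz, InvalidOffsetNumber, false, false);

    /* Remove the middle tuple; its slot becomes "unused" but stays put */
    PageIndexTupleDeleteNoCompact(page, off2);

    /* Items 1 and 3 are still addressable at their original offsets */
    Assert(PageGetMaxOffsetNumber(page) == off3);
    Assert(ItemIdIsUsed(PageGetItemId(page, off1)));
    Assert(!ItemIdIsUsed(PageGetItemId(page, off2)));
    Assert(ItemIdIsUsed(PageGetItemId(page, off3)));
}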
/*
 * PageRepairFragmentation
 *
 * Frees fragmented space on a page.
 * It doesn't remove unused line pointers! Please don't change this.
 *
 * This routine is usable for heap pages only, but see PageIndexMultiDelete.
 *
 * As a side effect, the page's PD_HAS_FREE_LINES hint bit is updated.
 */
void
PageRepairFragmentation(Page page)
{
    Offset      pd_lower = ((PageHeader) page)->pd_lower;
    Offset      pd_upper = ((PageHeader) page)->pd_upper;
    Offset      pd_special = ((PageHeader) page)->pd_special;
    ItemId      lp;
    int         nline,
                nstorage,
                nunused;
    int         i;
    Size        totallen;

    /*
     * It's worth the trouble to be more paranoid here than in most places,
     * because we are about to reshuffle data in (what is usually) a shared
     * disk buffer.  If we aren't careful then corrupted pointers, lengths,
     * etc could cause us to clobber adjacent disk buffers, spreading the data
     * loss further.  So, check everything.
     */
    if (pd_lower < SizeOfPageHeaderData ||
        pd_lower > pd_upper ||
        pd_upper > pd_special ||
        pd_special > BLCKSZ ||
        pd_special != MAXALIGN(pd_special))
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
                        pd_lower, pd_upper, pd_special)));

    nline = PageGetMaxOffsetNumber(page);
    nunused = nstorage = 0;
    for (i = FirstOffsetNumber; i <= nline; i++)
    {
        lp = PageGetItemId(page, i);
        if (ItemIdIsUsed(lp))
        {
            if (ItemIdHasStorage(lp))
                nstorage++;
        }
        else
        {
            /* Unused entries should have lp_len = 0, but make sure */
            ItemIdSetUnused(lp);
            nunused++;
        }
    }

    if (nstorage == 0)
    {
        /* Page is completely empty, so just reset it quickly */
        ((PageHeader) page)->pd_upper = pd_special;
    }
    else
    {
        /* Need to compact the page the hard way */
        itemIdSortData itemidbase[MaxHeapTuplesPerPage];
        itemIdSort  itemidptr = itemidbase;

        totallen = 0;
        for (i = 0; i < nline; i++)
        {
            lp = PageGetItemId(page, i + 1);
            if (ItemIdHasStorage(lp))
            {
                itemidptr->offsetindex = i;
                itemidptr->itemoff = ItemIdGetOffset(lp);
                if (itemidptr->itemoff < (int) pd_upper ||
                    itemidptr->itemoff >= (int) pd_special)
                    ereport(ERROR,
                            (errcode(ERRCODE_DATA_CORRUPTED),
                             errmsg("corrupted item pointer: %u",
                                    itemidptr->itemoff)));
                itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
                totallen += itemidptr->alignedlen;
                itemidptr++;
            }
        }

        if (totallen > (Size) (pd_special - pd_lower))
            ereport(ERROR,
                    (errcode(ERRCODE_DATA_CORRUPTED),
                     errmsg("corrupted item lengths: total %u, available space %u",
                            (unsigned int) totallen, pd_special - pd_lower)));

        compactify_tuples(itemidbase, nstorage, page);
    }

    /* Set hint bit for PageAddItem */
    if (nunused > 0)
        PageSetHasFreeLinePointers(page);
    else
        PageClearHasFreeLinePointers(page);
}
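/*
 * Illustrative sketch (not from the source above) of what the
 * compactify_tuples() helper called here is expected to do: sort the
 * collected items by their current data offset, then slide each item's data
 * toward the special space, rewriting lp_off and pd_upper as it goes.  The
 * comparator and function names are hypothetical; only the contract implied
 * by the callers (itemidbase/nitems describe items with storage, offsetindex
 * is the 0-based linp index) is taken from the code above.
 */
static int
itemoffcompare_sketch(const void *itemidp1, const void *itemidp2)
{
    /* sort in decreasing itemoff order */
    return ((itemIdSort) itemidp2)->itemoff -
        ((itemIdSort) itemidp1)->itemoff;
}

static void
compactify_tuples_sketch(itemIdSort itemidbase, int nitems, Page page)
{
    PageHeader  phdr = (PageHeader) page;
    Offset      upper;
    int         i;

    qsort((char *) itemidbase, nitems, sizeof(itemIdSortData),
          itemoffcompare_sketch);

    upper = phdr->pd_special;
    for (i = 0; i < nitems; i++)
    {
        itemIdSort  itemidptr = &itemidbase[i];
        ItemId      lp = PageGetItemId(page, itemidptr->offsetindex + 1);

        upper -= itemidptr->alignedlen;
        memmove((char *) page + upper,
                (char *) page + itemidptr->itemoff,
                itemidptr->alignedlen);
        itemidptr->itemoff = upper;
        lp->lp_off = upper;
    }

    phdr->pd_upper = upper;
}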
/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *   stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |       ... item3 item2 item1    | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
    Page        npage;
    BlockNumber nblkno;
    OffsetNumber last_off;
    Size        pgspc;
    Size        itupsz;

    /*
     * This is a handy place to check for cancel interrupts during the btree
     * load phase of index creation.
     */
    CHECK_FOR_INTERRUPTS();

    npage = state->btps_page;
    nblkno = state->btps_blkno;
    last_off = state->btps_lastoff;

    pgspc = PageGetFreeSpace(npage);
    itupsz = IndexTupleDSize(*itup);
    itupsz = MAXALIGN(itupsz);

    /*
     * Check whether the item can fit on a btree page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.)  We actually need to be
     * able to fit three items on every page, so restrict any one item to 1/3
     * the per-page available space.  Note that at this point, itupsz doesn't
     * include the ItemId.
     *
     * NOTE: similar code appears in _bt_insertonpg() to defend against
     * oversize items being inserted into an already-existing index.  But
     * during creation of an index, we don't go through there.
     */
    if (itupsz > BTMaxItemSize(npage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
                        itupsz, BTMaxItemSize(npage),
                        RelationGetRelationName(wstate->index)),
                 errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
                         "Consider a function index of an MD5 hash of the value, "
                         "or use full text indexing."),
                 errtableconstraint(wstate->heap,
                                    RelationGetRelationName(wstate->index))));

    /*
     * Check to see if page is "full".  It's definitely full if the item won't
     * fit.  Otherwise, compare to the target freespace derived from the
     * fillfactor.  However, we must put at least two items on each page, so
     * disregard fillfactor if we don't have that many.
     */
    if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
    {
        /*
         * Finish off the page and write it out.
         */
        Page        opage = npage;
        BlockNumber oblkno = nblkno;
        ItemId      ii;
        ItemId      hii;
        IndexTuple  oitup;

        /* Create new page of same level */
        npage = _bt_blnewpage(state->btps_level);

        /* and assign it a page position */
        nblkno = wstate->btws_pages_alloced++;

        /*
         * We copy the last item on the page into the new page, and then
         * rearrange the old page so that the 'last item' becomes its high key
         * rather than a true data item.  There had better be at least two
         * items on the page already, else the page would be empty of useful
         * data.
         */
        Assert(last_off > P_FIRSTKEY);
        ii = PageGetItemId(opage, last_off);
        oitup = (IndexTuple) PageGetItem(opage, ii);
        _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

        /*
         * Move 'last' into the high key position on opage
         */
        hii = PageGetItemId(opage, P_HIKEY);
        *hii = *ii;
        ItemIdSetUnused(ii);    /* redundant */
        ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

        /*
         * Link the old page into its parent, using its minimum key.  If we
         * don't have a parent, we have to create one; this adds a new btree
         * level.
         */
        if (state->btps_next == NULL)
            state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

        Assert(state->btps_minkey != NULL);
        ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
        _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
        pfree(state->btps_minkey);

        /*
         * Save a copy of the minimum key for the new page.  We have to copy
         * it off the old page, not the new one, in case we are not at leaf
         * level.
         */
        state->btps_minkey = CopyIndexTuple(oitup);

        /*
         * Set the sibling links for both pages.
         */
        {
            BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
            BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

            oopaque->btpo_next = nblkno;
            nopaque->btpo_prev = oblkno;
            nopaque->btpo_next = P_NONE;    /* redundant */
        }

        /*
         * Write out the old page.  We never need to touch it again, so we can
         * free the opage workspace too.
         */
        _bt_blwritepage(wstate, opage, oblkno);

        /*
         * Reset last_off to point to new page
         */
        last_off = P_FIRSTKEY;
    }

    /*
     * If the new item is the first for its page, stash a copy for later. Note
     * this will only happen for the first item on a level; on later pages,
     * the first item for a page is copied from the prior page in the code
     * above.
     */
    if (last_off == P_HIKEY)
    {
        Assert(state->btps_minkey == NULL);
        state->btps_minkey = CopyIndexTuple(itup);
    }

    /*
     * Add the new item into the current page.
     */
    last_off = OffsetNumberNext(last_off);
    _bt_sortaddtup(npage, itupsz, itup, last_off);

    state->btps_page = npage;
    state->btps_blkno = nblkno;
    state->btps_lastoff = last_off;
}
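/*
 * Illustrative sketch (not from the source above) of the caller pattern for
 * _bt_buildadd(), modeled on the _bt_load() stage of nbtsort.c: tuples
 * arrive in sorted order, each is appended with _bt_buildadd(), and the
 * partially built rightmost pages are finished afterwards.  The function
 * name and the plain array input are simplifications; the real loader pulls
 * tuples from a tuplesort before ending with _bt_uppershutdown().
 */
static void
finish_btree_load_sketch(BTWriteState *wstate,
                         IndexTuple *sorted_itups, int nitups)
{
    BTPageState *state = NULL;
    int         i;

    for (i = 0; i < nitups; i++)
    {
        /* First tuple: create the leaf-level page state lazily */
        if (state == NULL)
            state = _bt_pagestate(wstate, 0);

        _bt_buildadd(wstate, state, sorted_itups[i]);
    }

    /* Close out all levels, writing the remaining rightmost pages */
    _bt_uppershutdown(wstate, state);
}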
/*
 * Perform the actual page changes needed by heap_page_prune.
 * It is expected that the caller has suitable pin and lock on the
 * buffer, and is inside a critical section.
 *
 * This is split out because it is also used by heap_xlog_clean()
 * to replay the WAL record when needed after a crash.  Note that the
 * arguments are identical to those of log_heap_clean().
 */
void
heap_page_prune_execute(Buffer buffer,
                        OffsetNumber *redirected, int nredirected,
                        OffsetNumber *nowdead, int ndead,
                        OffsetNumber *nowunused, int nunused,
                        bool redirect_move)
{
    Page        page = (Page) BufferGetPage(buffer);
    OffsetNumber *offnum;
    int         i;

    /* Update all redirected or moved line pointers */
    offnum = redirected;
    for (i = 0; i < nredirected; i++)
    {
        OffsetNumber fromoff = *offnum++;
        OffsetNumber tooff = *offnum++;
        ItemId      fromlp = PageGetItemId(page, fromoff);

        if (redirect_move)
        {
            /* Physically move the "to" item to the "from" slot */
            ItemId      tolp = PageGetItemId(page, tooff);
            HeapTupleHeader htup;

            *fromlp = *tolp;
            ItemIdSetUnused(tolp);

            /*
             * Change heap-only status of the tuple because after the line
             * pointer manipulation, it's no longer a heap-only tuple, but is
             * directly pointed to by index entries.
             */
            Assert(ItemIdIsNormal(fromlp));
            htup = (HeapTupleHeader) PageGetItem(page, fromlp);
            Assert(HeapTupleHeaderIsHeapOnly(htup));
            HeapTupleHeaderClearHeapOnly(htup);
        }
        else
        {
            /* Just insert a REDIRECT link at fromoff */
            ItemIdSetRedirect(fromlp, tooff);
        }
    }

    /* Update all now-dead line pointers */
    offnum = nowdead;
    for (i = 0; i < ndead; i++)
    {
        OffsetNumber off = *offnum++;
        ItemId      lp = PageGetItemId(page, off);

        ItemIdSetDead(lp);
    }

    /* Update all now-unused line pointers */
    offnum = nowunused;
    for (i = 0; i < nunused; i++)
    {
        OffsetNumber off = *offnum++;
        ItemId      lp = PageGetItemId(page, off);

        ItemIdSetUnused(lp);
    }

    /*
     * Finally, repair any fragmentation, and update the page's hint bit about
     * whether it has free pointers.
     */
    PageRepairFragmentation(page);
}
/*
 * PageIndexDeleteNoCompact
 *		Delete the given items for an index page, and defragment the resulting
 *		free space, but do not compact the item pointers array.
 *
 * itemnos is the array of tuples to delete; nitems is its size.
 *
 * Unused items at the end of the array are removed.
 *
 * This is used for index AMs that require that existing TIDs of live tuples
 * remain unchanged.
 */
void
PageIndexDeleteNoCompact(Page page, OffsetNumber *itemnos, int nitems)
{
    PageHeader  phdr = (PageHeader) page;
    LocationIndex pd_lower = phdr->pd_lower;
    LocationIndex pd_upper = phdr->pd_upper;
    LocationIndex pd_special = phdr->pd_special;
    int         nline;
    bool        empty;
    OffsetNumber offnum;
    int         nextitm;

    /*
     * As with PageRepairFragmentation, paranoia seems justified.
     */
    if (pd_lower < SizeOfPageHeaderData ||
        pd_lower > pd_upper ||
        pd_upper > pd_special ||
        pd_special > BLCKSZ ||
        pd_special != MAXALIGN(pd_special))
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
                        pd_lower, pd_upper, pd_special)));

    /*
     * Scan the existing item pointer array and mark as unused those that are
     * in our kill-list; make sure any non-interesting ones are marked unused
     * as well.
     */
    nline = PageGetMaxOffsetNumber(page);
    empty = true;
    nextitm = 0;
    for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
    {
        ItemId      lp;
        ItemLength  itemlen;
        ItemOffset  offset;

        lp = PageGetItemId(page, offnum);

        itemlen = ItemIdGetLength(lp);
        offset = ItemIdGetOffset(lp);

        if (ItemIdIsUsed(lp))
        {
            if (offset < pd_upper ||
                (offset + itemlen) > pd_special ||
                offset != MAXALIGN(offset))
                ereport(ERROR,
                        (errcode(ERRCODE_DATA_CORRUPTED),
                         errmsg("corrupted item pointer: offset = %u, length = %u",
                                offset, (unsigned int) itemlen)));

            if (nextitm < nitems && offnum == itemnos[nextitm])
            {
                /* this one is on our list to delete, so mark it unused */
                ItemIdSetUnused(lp);
                nextitm++;
            }
            else if (ItemIdHasStorage(lp))
            {
                /* This one's live -- must do the compaction dance */
                empty = false;
            }
            else
            {
                /* get rid of this one too */
                ItemIdSetUnused(lp);
            }
        }
    }

    /* this will catch invalid or out-of-order itemnos[] */
    if (nextitm != nitems)
        elog(ERROR, "incorrect index offsets supplied");

    if (empty)
    {
        /* Page is completely empty, so just reset it quickly */
        phdr->pd_lower = SizeOfPageHeaderData;
        phdr->pd_upper = pd_special;
    }
    else
    {
        /* There are live items: need to compact the page the hard way */
        itemIdSortData itemidbase[MaxOffsetNumber];
        itemIdSort  itemidptr;
        int         i;
        Size        totallen;

        /*
         * Scan the page taking note of each item that we need to preserve.
         * This includes both live items (those that contain data) and
         * interspersed unused ones.  It's critical to preserve these unused
         * items, because otherwise the offset numbers for later live items
         * would change, which is not acceptable.  Unused items might get used
         * again later; that is fine.
         */
        itemidptr = itemidbase;
        totallen = 0;
        PageClearHasFreeLinePointers(page);
        for (i = 0; i < nline; i++)
        {
            ItemId      lp;

            itemidptr->offsetindex = i;

            lp = PageGetItemId(page, i + 1);
            if (ItemIdHasStorage(lp))
            {
                itemidptr->itemoff = ItemIdGetOffset(lp);
                itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
                totallen += itemidptr->alignedlen;
                itemidptr++;
            }
            else
            {
                PageSetHasFreeLinePointers(page);
                ItemIdSetUnused(lp);
            }
        }
        nline = itemidptr - itemidbase;
        /* By here, there are exactly nline elements in itemidbase array */

        if (totallen > (Size) (pd_special - pd_lower))
            ereport(ERROR,
                    (errcode(ERRCODE_DATA_CORRUPTED),
                     errmsg("corrupted item lengths: total %u, available space %u",
                            (unsigned int) totallen, pd_special - pd_lower)));

        /*
         * Defragment the data areas of each tuple, being careful to preserve
         * each item's position in the linp array.
         */
        compactify_tuples(itemidbase, nline, page);
    }
}
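/*
 * Illustrative sketch (not from the source above): calling
 * PageIndexDeleteNoCompact() with a kill-list.  As the "incorrect index
 * offsets supplied" check above implies, itemnos[] must be sorted in
 * ascending offset order and contain no duplicates.  The offsets used here
 * and the function name are placeholders.
 */
static void
delete_items_no_compact_sketch(Page page)
{
    /* offsets must be ascending; e.g. remove items 2 and 5 */
    OffsetNumber kill_list[] = {2, 5};

    PageIndexDeleteNoCompact(page, kill_list, lengthof(kill_list));

    /* Surviving items keep their original offset numbers. */
}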