/* * Set value in given FSM page and slot. * * If minValue > 0, the updated page is also searched for a page with at * least minValue of free space. If one is found, its slot number is * returned, -1 otherwise. */ static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, uint8 newValue, uint8 minValue) { Buffer buf; Page page; int newslot = -1; buf = fsm_readbuf(rel, addr, true); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); if (fsm_set_avail(page, slot, newValue)) MarkBufferDirtyHint(buf, false); if (minValue != 0) { /* Search while we still hold the lock */ newslot = fsm_search_avail(buf, minValue, addr.level == FSM_BOTTOM_LEVEL, true); } UnlockReleaseBuffer(buf); return newslot; }
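For context, the main caller of fsm_set_and_search is RecordAndGetPageWithFreeSpace, which records the remaining space on the page a backend just filled and looks for a new target page in the same lock cycle. The sketch below is modeled on that function in freespace.c; the helpers it calls (fsm_space_avail_to_cat, fsm_space_needed_to_cat, fsm_get_heap_blk, fsm_search) are real FSM internals, but treat the exact shape as approximate.

/*
 * Sketch modeled on RecordAndGetPageWithFreeSpace in freespace.c: record the
 * new free space on oldPage and, while the FSM page is still locked, look
 * for another page with at least spaceNeeded bytes free.
 */
BlockNumber
RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
                              Size oldSpaceAvail, Size spaceNeeded)
{
    int         old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
    int         search_cat = fsm_space_needed_to_cat(spaceNeeded);
    FSMAddress  addr;
    uint16      slot;
    int         search_slot;

    /* Get the location of the FSM byte representing the heap block */
    addr = fsm_get_location(oldPage, &slot);

    search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);

    /*
     * If fsm_set_and_search found a suitable new block, return that.
     * Otherwise, search the whole FSM as usual.
     */
    if (search_slot != -1)
        return fsm_get_heap_blk(addr, search_slot);
    else
        return fsm_search(rel, search_cat);
}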
/* * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in * WAL replay */ void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, Size spaceAvail) { int new_cat = fsm_space_avail_to_cat(spaceAvail); FSMAddress addr; uint16 slot; BlockNumber blkno; Buffer buf; Page page; /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(heapBlk, &slot); blkno = fsm_logical_to_physical(addr); /* If the page doesn't exist already, extend */ buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); if (PageIsNew(page)) PageInit(page, BLCKSZ, 0); if (fsm_set_avail(page, slot, new_cat)) MarkBufferDirtyHint(buf, false); UnlockReleaseBuffer(buf); }
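The new_cat value above comes from quantizing the free-space byte count into one of 256 categories. A sketch of that mapping, based on fsm_space_avail_to_cat in freespace.c; the constants are shown here for illustration.

/*
 * Free space is quantized into FSM_CATEGORIES buckets of FSM_CAT_STEP bytes,
 * with category 255 reserved for "fits the largest possible request".
 */
#define FSM_CATEGORIES  256
#define FSM_CAT_STEP    (BLCKSZ / FSM_CATEGORIES)
#define MaxFSMRequestSize   MaxHeapTupleSize

static uint8
fsm_space_avail_to_cat(Size avail)
{
    int     cat;

    Assert(avail < BLCKSZ);

    if (avail >= MaxFSMRequestSize)
        return 255;

    cat = avail / FSM_CAT_STEP;

    /* category 255 is reserved for MaxFSMRequestSize bytes or more */
    Assert(cat < 255);

    return (uint8) cat;
}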
/* * Initiate page evacuation protocol. * * The page must be locked in exclusive mode by the caller. * * If the page is not yet initialized or empty, return false without doing * anything; it can be used for revmap without any further changes. If it * contains tuples, mark it for evacuation and return true. */ bool brin_start_evacuating_page(Relation idxRel, Buffer buf) { OffsetNumber off; OffsetNumber maxoff; Page page; page = BufferGetPage(buf); if (PageIsNew(page)) return false; maxoff = PageGetMaxOffsetNumber(page); for (off = FirstOffsetNumber; off <= maxoff; off++) { ItemId lp; lp = PageGetItemId(page, off); if (ItemIdIsUsed(lp)) { /* prevent other backends from adding more stuff to this page */ BrinPageFlags(page) |= BRIN_EVACUATE_PAGE; MarkBufferDirtyHint(buf, true); return true; } } return false; }
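Setting BRIN_EVACUATE_PAGE only helps because the insert path refuses to put new tuples on a page carrying that flag. The helper below is hypothetical (the real test is made inline in the BRIN insertion code), but it shows the kind of check an inserter is expected to make.

/*
 * Hypothetical helper, illustration only: a page marked for evacuation must
 * not receive any new index tuples, so the revmap can take it over once the
 * existing tuples have been moved elsewhere.
 */
static bool
brin_page_is_usable_for_insert(Page page)
{
    return (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) == 0;
}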
/*
 * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the FSM again.
 *
 * nblocks is the new size of the heap.
 */
void
FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
{
    BlockNumber new_nfsmblocks;
    FSMAddress  first_removed_address;
    uint16      first_removed_slot;
    Buffer      buf;

    RelationOpenSmgr(rel);

    /*
     * If no FSM has been created yet for this relation, there's nothing to
     * truncate.
     */
    if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
        return;

    /* Get the location in the FSM of the first removed heap block */
    first_removed_address = fsm_get_location(nblocks, &first_removed_slot);

    /*
     * Zero out the tail of the last remaining FSM page. If the slot
     * representing the first removed heap block is at a page boundary, as the
     * first slot on the FSM page that first_removed_address points to, we can
     * just truncate that page altogether.
     */
    if (first_removed_slot > 0)
    {
        buf = fsm_readbuf(rel, first_removed_address, false);
        if (!BufferIsValid(buf))
            return;             /* nothing to do; the FSM was already smaller */
        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
        fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
        MarkBufferDirtyHint(buf, false);
        UnlockReleaseBuffer(buf);

        new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
    }
    else
    {
        new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
        if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
            return;             /* nothing to do; the FSM was already smaller */
    }

    /* Truncate the unused FSM pages, and send smgr inval message */
    smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks);

    /*
     * We might as well update the local smgr_fsm_nblocks setting.
     * smgrtruncate sent an smgr cache inval message, which will cause other
     * backends to invalidate their copy of smgr_fsm_nblocks, and this one too
     * at the next command boundary. But this ensures it isn't outright wrong
     * until then.
     */
    if (rel->rd_smgr)
        rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks;
}
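The (address, slot) pair used above comes from simple division of the heap block number by the number of slots that fit on one FSM page; fsm_logical_to_physical then maps the logical page number to a physical block, accounting for the interleaved upper-level pages. A sketch of the bottom-level address computation, modeled on fsm_get_location in freespace.c:

/*
 * Each bottom-level FSM page covers SlotsPerFSMPage consecutive heap blocks,
 * so the mapping is a simple division/remainder.
 */
static FSMAddress
fsm_get_location(BlockNumber heapblk, uint16 *slot)
{
    FSMAddress  addr;

    addr.level = FSM_BOTTOM_LEVEL;
    addr.logpageno = heapblk / SlotsPerFSMPage;
    *slot = heapblk % SlotsPerFSMPage;

    return addr;
}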
/* * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in * WAL replay */ void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, Size spaceAvail) { int new_cat = fsm_space_avail_to_cat(spaceAvail); FSMAddress addr; uint16 slot; BlockNumber blkno; Buffer buf; Page page; bool write_to_fsm; /* This is meant to mirror the logic in fsm_allow_writes() */ if (heapBlk >= HEAP_FSM_CREATION_THRESHOLD) write_to_fsm = true; else { /* Open the relation at smgr level */ SMgrRelation smgr = smgropen(rnode, InvalidBackendId); if (smgrexists(smgr, FSM_FORKNUM)) write_to_fsm = true; else { BlockNumber heap_nblocks = smgrnblocks(smgr, MAIN_FORKNUM); if (heap_nblocks > HEAP_FSM_CREATION_THRESHOLD) write_to_fsm = true; else write_to_fsm = false; } } if (!write_to_fsm) return; /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(heapBlk, &slot); blkno = fsm_logical_to_physical(addr); /* If the page doesn't exist already, extend */ buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); if (PageIsNew(page)) PageInit(page, BLCKSZ, 0); if (fsm_set_avail(page, slot, new_cat)) MarkBufferDirtyHint(buf, false); UnlockReleaseBuffer(buf); }
/* * Given an opened sequence relation, lock the page buffer and find the tuple * * *buf receives the reference to the pinned-and-ex-locked buffer * *seqtuple receives the reference to the sequence tuple proper * (this arg should point to a local variable of type HeapTupleData) * * Function's return value points to the data payload of the tuple */ static Form_pg_sequence read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple) { Page page; ItemId lp; sequence_magic *sm; Form_pg_sequence seq; *buf = ReadBuffer(rel, 0); LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(*buf); sm = (sequence_magic *) PageGetSpecialPointer(page); if (sm->magic != SEQ_MAGIC) elog(ERROR, "bad magic number in sequence \"%s\": %08X", RelationGetRelationName(rel), sm->magic); lp = PageGetItemId(page, FirstOffsetNumber); Assert(ItemIdIsNormal(lp)); /* Note we currently only bother to set these two fields of *seqtuple */ seqtuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); seqtuple->t_len = ItemIdGetLength(lp); /* * Previous releases of Postgres neglected to prevent SELECT FOR UPDATE on * a sequence, which would leave a non-frozen XID in the sequence tuple's * xmax, which eventually leads to clog access failures or worse. If we * see this has happened, clean up after it. We treat this like a hint * bit update, ie, don't bother to WAL-log it, since we can certainly do * this again if the update gets lost. */ Assert(!(seqtuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)); if (HeapTupleHeaderGetRawXmax(seqtuple->t_data) != InvalidTransactionId) { HeapTupleHeaderSetXmax(seqtuple->t_data, InvalidTransactionId); seqtuple->t_data->t_infomask &= ~HEAP_XMAX_COMMITTED; seqtuple->t_data->t_infomask |= HEAP_XMAX_INVALID; MarkBufferDirtyHint(*buf, true); } seq = (Form_pg_sequence) GETSTRUCT(seqtuple); /* this is a handy place to update our copy of the increment */ elm->increment = seq->increment_by; return seq; }
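A minimal, hypothetical usage sketch, simplified from callers such as nextval_internal and do_setval in commands/sequence.c: the caller provides a local HeapTupleData, reads what it needs while the buffer lock taken by read_seq_tuple is still held, and then releases the buffer. The helper name and the pre-PG10 Form_pg_sequence layout (last_value stored in the sequence tuple) are assumptions here.

/*
 * Hypothetical helper; simplified from real callers.  Assumes the catalog
 * layout where last_value lives in the sequence tuple itself.
 */
static int64
peek_sequence_last_value(SeqTable elm, Relation seqrel)
{
    Buffer          buf;
    HeapTupleData   seqtuple;
    Form_pg_sequence seq;
    int64           result;

    /* read_seq_tuple returns with the buffer pinned and exclusively locked */
    seq = read_seq_tuple(elm, seqrel, &buf, &seqtuple);
    result = seq->last_value;
    UnlockReleaseBuffer(buf);

    return result;
}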
/* * SetHintBits() * * Set commit/abort hint bits on a tuple, if appropriate at this time. * * It is only safe to set a transaction-committed hint bit if we know the * transaction's commit record has been flushed to disk, or if the table is * temporary or unlogged and will be obliterated by a crash anyway. We * cannot change the LSN of the page here because we may hold only a share * lock on the buffer, so we can't use the LSN to interlock this; we have to * just refrain from setting the hint bit until some future re-examination * of the tuple. * * We can always set hint bits when marking a transaction aborted. (Some * code in heapam.c relies on that!) * * Also, if we are cleaning up HEAP_MOVED_IN or HEAP_MOVED_OFF entries, then * we can always set the hint bits, since pre-9.0 VACUUM FULL always used * synchronous commits and didn't move tuples that weren't previously * hinted. (This is not known by this subroutine, but is applied by its * callers.) Note: old-style VACUUM FULL is gone, but we have to keep this * module's support for MOVED_OFF/MOVED_IN flag bits for as long as we * support in-place update from pre-9.0 databases. * * Normal commits may be asynchronous, so for those we need to get the LSN * of the transaction and then check whether this is flushed. * * The caller should pass xid as the XID of the transaction to check, or * InvalidTransactionId if no check is needed. */ static inline void SetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid) { if (TransactionIdIsValid(xid)) { /* NB: xid must be known committed here! */ XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); if (XLogNeedsFlush(commitLSN) && BufferIsPermanent(buffer)) return; /* not flushed yet, so don't set hint */ } tuple->t_infomask |= infomask; MarkBufferDirtyHint(buffer, true); }
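A paraphrased example of how the visibility routines use this, assuming the transaction is already known to be no longer in progress; the helper name is hypothetical, but the flag names and the convention of passing the XID only for the committed case follow the tqual.c callers.

/*
 * Hypothetical helper paraphrasing the tqual.c call sites.  Assumes the
 * caller has already established that xmin is not still in progress.
 */
static bool
hint_xmin_if_possible(HeapTupleHeader tuple, Buffer buffer)
{
    TransactionId xmin = HeapTupleHeaderGetRawXmin(tuple);

    if (TransactionIdDidCommit(xmin))
    {
        /* pass the XID so SetHintBits can verify the commit LSN is flushed */
        SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, xmin);
        return true;
    }

    /* it must have aborted or crashed; that can always be hinted */
    SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, InvalidTransactionId);
    return false;
}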
/* * btvacuumpage --- VACUUM one page * * This processes a single page for btvacuumscan(). In some cases we * must go back and re-examine previously-scanned pages; this routine * recurses when necessary to handle that case. * * blkno is the page to process. orig_blkno is the highest block number * reached by the outer btvacuumscan loop (the same as blkno, unless we * are recursing to re-examine a previous page). */ static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) { IndexVacuumInfo *info = vstate->info; IndexBulkDeleteResult *stats = vstate->stats; IndexBulkDeleteCallback callback = vstate->callback; void *callback_state = vstate->callback_state; Relation rel = info->index; bool delete_now; BlockNumber recurse_to; Buffer buf; Page page; BTPageOpaque opaque = NULL; restart: delete_now = false; recurse_to = P_NONE; /* call vacuum_delay_point while not holding any buffer lock */ vacuum_delay_point(); /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. We want to * recycle all-zero pages, not fail. Also, we want to use a nondefault * buffer access strategy. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buf, BT_READ); page = BufferGetPage(buf); if (!PageIsNew(page)) { _bt_checkpage(rel, buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } /* * If we are recursing, the only case we want to do anything with is a * live leaf page having the current vacuum cycle ID. Any other state * implies we already saw the page (eg, deleted it as being empty). */ if (blkno != orig_blkno) { if (_bt_page_recyclable(page) || P_IGNORE(opaque) || !P_ISLEAF(opaque) || opaque->btpo_cycleid != vstate->cycleid) { _bt_relbuf(rel, buf); return; } } /* Page is valid, see what to do with it */ if (_bt_page_recyclable(page)) { /* Okay to recycle this page */ RecordFreeIndexPage(rel, blkno); vstate->totFreePages++; stats->pages_deleted++; } else if (P_ISDELETED(opaque)) { /* Already deleted, but can't recycle yet */ stats->pages_deleted++; } else if (P_ISHALFDEAD(opaque)) { /* Half-dead, try to delete */ delete_now = true; } else if (P_ISLEAF(opaque)) { OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; OffsetNumber offnum, minoff, maxoff; /* * Trade in the initial read lock for a super-exclusive write lock on * this page. We must get such a lock on every leaf page over the * course of the vacuum scan, whether or not it actually contains any * deletable tuples --- see nbtree/README. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); /* * Remember highest leaf page number we've taken cleanup lock on; see * notes in btvacuumscan */ if (blkno > vstate->lastBlockLocked) vstate->lastBlockLocked = blkno; /* * Check whether we need to recurse back to earlier pages. What we * are concerned about is a page split that happened since we started * the vacuum scan. If the split moved some tuples to a lower page * then we might have missed 'em. If so, set up for tail recursion. * (Must do this before possibly clearing btpo_cycleid below!) */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid && !(opaque->btpo_flags & BTP_SPLIT_END) && !P_RIGHTMOST(opaque) && opaque->btpo_next < orig_blkno) recurse_to = opaque->btpo_next; /* * Scan over all items to see which ones need deleted according to the * callback function. 
*/ ndeletable = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) { for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); htup = &(itup->t_tid); /* * During Hot Standby we currently assume that * XLOG_BTREE_VACUUM records do not produce conflicts. That is * only true as long as the callback function depends only * upon whether the index tuple refers to heap tuples removed * in the initial heap scan. When vacuum starts it derives a * value of OldestXmin. Backends taking later snapshots could * have a RecentGlobalXmin with a later xid than the vacuum's * OldestXmin, so it is possible that row versions deleted * after OldestXmin could be marked as killed by other * backends. The callback function *could* look at the index * tuple state in isolation and decide to delete the index * tuple, though currently it does not. If it ever did, we * would need to reconsider whether XLOG_BTREE_VACUUM records * should cause conflicts. If they did cause conflicts they * would be fairly harsh conflicts, since we haven't yet * worked out a way to pass a useful value for * latestRemovedXid on the XLOG_BTREE_VACUUM records. This * applies to *any* type of index that marks index tuples as * killed. */ if (callback(htup, callback_state)) deletable[ndeletable++] = offnum; } } /* * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ if (ndeletable > 0) { /* * Notice that the issued XLOG_BTREE_VACUUM WAL record includes * all information to the replay code to allow it to get a cleanup * lock on all pages between the previous lastBlockVacuumed and * this page. This ensures that WAL replay locks all leaf pages at * some point, which is important should non-MVCC scans be * requested. This is currently unused on standby, but we record * it anyway, so that the WAL contains the required information. * * Since we can visit leaf pages out-of-order when recursing, * replay might end up locking such pages an extra time, but it * doesn't seem worth the amount of bookkeeping it'd take to avoid * that. */ _bt_delitems_vacuum(rel, buf, deletable, ndeletable, vstate->lastBlockVacuumed); /* * Remember highest leaf page number we've issued a * XLOG_BTREE_VACUUM WAL record for. */ if (blkno > vstate->lastBlockVacuumed) vstate->lastBlockVacuumed = blkno; stats->tuples_removed += ndeletable; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } else { /* * If the page has been split during this vacuum cycle, it seems * worth expending a write to clear btpo_cycleid even if we don't * have any deletions to do. (If we do, _bt_delitems_vacuum takes * care of this.) This ensures we won't process the page again. * * We treat this like a hint-bit update because there's no need to * WAL-log it. */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { opaque->btpo_cycleid = 0; MarkBufferDirtyHint(buf, true); } } /* * If it's now empty, try to delete; else count the live tuples. We * don't delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). 
*/ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else stats->num_index_tuples += maxoff - minoff + 1; } if (delete_now) { MemoryContext oldcontext; int ndel; /* Run pagedel in a temp context to avoid memory leakage */ MemoryContextReset(vstate->pagedelcontext); oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); ndel = _bt_pagedel(rel, buf); /* count only this page, else may double-count parent */ if (ndel) stats->pages_deleted++; MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ } else _bt_relbuf(rel, buf); /* * This is really tail recursion, but if the compiler is too stupid to * optimize it as such, we'd eat an uncomfortably large amount of stack * space per recursion level (due to the deletable[] array). A failure is * improbable since the number of levels isn't likely to be large ... but * just in case, let's hand-optimize into a loop. */ if (recurse_to != P_NONE) { blkno = recurse_to; goto restart; } }
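For reference, the outer loop that drives this function passes the same block number for blkno and orig_blkno, so recursion only happens when btvacuumpage itself decides to revisit an earlier page after a concurrent split. A condensed, hypothetical rendering of that loop from btvacuumscan (extension locking, stats accounting, and cycle-id setup omitted):

/*
 * Hypothetical condensation of the driving loop in btvacuumscan.
 */
static void
btvacuumscan_pages(BTVacState *vstate, Relation rel)
{
    BlockNumber num_pages;
    BlockNumber blkno = BTREE_METAPAGE + 1;

    for (;;)
    {
        /* Re-check the length each pass, in case the index has grown */
        num_pages = RelationGetNumberOfBlocks(rel);
        if (blkno >= num_pages)
            break;
        for (; blkno < num_pages; blkno++)
            btvacuumpage(vstate, blkno, blkno);
    }
}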
/* * hashgettuple() -- Get the next tuple in the scan. */ bool hashgettuple(IndexScanDesc scan, ScanDirection dir) { HashScanOpaque so = (HashScanOpaque) scan->opaque; Relation rel = scan->indexRelation; Buffer buf; Page page; OffsetNumber offnum; ItemPointer current; bool res; /* Hash indexes are always lossy since we store only the hash code */ scan->xs_recheck = true; /* * We hold pin but not lock on current buffer while outside the hash AM. * Reacquire the read lock here. */ if (BufferIsValid(so->hashso_curbuf)) _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ); /* * If we've already initialized this scan, we can just advance it in the * appropriate direction. If we haven't done so yet, we call a routine to * get the first item in the scan. */ current = &(so->hashso_curpos); if (ItemPointerIsValid(current)) { /* * An insertion into the current index page could have happened while * we didn't have read lock on it. Re-find our position by looking * for the TID we previously returned. (Because we hold share lock on * the bucket, no deletions or splits could have occurred; therefore * we can expect that the TID still exists in the current index page, * at an offset >= where we were.) */ OffsetNumber maxoffnum; buf = so->hashso_curbuf; Assert(BufferIsValid(buf)); page = BufferGetPage(buf); TestForOldSnapshot(scan->xs_snapshot, rel, page); maxoffnum = PageGetMaxOffsetNumber(page); for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid))) break; } if (offnum > maxoffnum) elog(ERROR, "failed to re-find scan position within index \"%s\"", RelationGetRelationName(rel)); ItemPointerSetOffsetNumber(current, offnum); /* * Check to see if we should kill the previously-fetched tuple. */ if (scan->kill_prior_tuple) { /* * Yes, so mark it by setting the LP_DEAD state in the item flags. */ ItemIdMarkDead(PageGetItemId(page, offnum)); /* * Since this can be redone later if needed, mark as a hint. */ MarkBufferDirtyHint(buf, true); } /* * Now continue the scan. */ res = _hash_next(scan, dir); } else res = _hash_first(scan, dir); /* * Skip killed tuples if asked to. */ if (scan->ignore_killed_tuples) { while (res) { offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(so->hashso_curbuf); if (!ItemIdIsDead(PageGetItemId(page, offnum))) break; res = _hash_next(scan, dir); } } /* Release read lock on current buffer, but keep it pinned */ if (BufferIsValid(so->hashso_curbuf)) _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK); /* Return current heap TID on success */ scan->xs_ctup.t_self = so->hashso_heappos; return res; }
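The LP_DEAD hint set above is nothing more than the line pointer's flag bits; for reference, the relevant macros from itemid.h look roughly like this:

/* Definitions from itemid.h, reproduced for reference */
#define ItemIdIsDead(itemId) \
    ((itemId)->lp_flags == LP_DEAD)

#define ItemIdMarkDead(itemId) \
( \
    (itemId)->lp_flags = LP_DEAD \
)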
/* * Recursive guts of FreeSpaceMapVacuum */ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p) { Buffer buf; Page page; uint8 max_avail; /* Read the page if it exists, or return EOF */ buf = fsm_readbuf(rel, addr, false); if (!BufferIsValid(buf)) { *eof_p = true; return 0; } else *eof_p = false; page = BufferGetPage(buf); /* * Recurse into children, and fix the information stored about them at * this level. */ if (addr.level > FSM_BOTTOM_LEVEL) { int slot; bool eof = false; for (slot = 0; slot < SlotsPerFSMPage; slot++) { int child_avail; CHECK_FOR_INTERRUPTS(); /* After we hit end-of-file, just clear the rest of the slots */ if (!eof) child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof); else child_avail = 0; /* Update information about the child */ if (fsm_get_avail(page, slot) != child_avail) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); fsm_set_avail(BufferGetPage(buf), slot, child_avail); MarkBufferDirtyHint(buf, false); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } } } max_avail = fsm_get_max_avail(BufferGetPage(buf)); /* * Reset the next slot pointer. This encourages the use of low-numbered * pages, increasing the chances that a later vacuum can truncate the * relation. */ ((FSMPage) PageGetContents(page))->fp_next_slot = 0; ReleaseBuffer(buf); return max_avail; }
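The entry point that kicks off this recursion simply starts at the root and discards the EOF flag at the top level; a sketch modeled on FreeSpaceMapVacuum in freespace.c:

/*
 * Sketch modeled on FreeSpaceMapVacuum: start the recursion at the root
 * address and ignore the EOF indication at the top level.
 */
void
FreeSpaceMapVacuum(Relation rel)
{
    bool    dummy;

    (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy);
}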
/* * Prune and repair fragmentation in the specified page. * * Caller must have pin and buffer cleanup lock on the page. * * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). * * If redirect_move is set, we remove redirecting line pointers by * updating the root line pointer to point directly to the first non-dead * tuple in the chain. NOTE: eliminating the redirect changes the first * tuple's effective CTID, and is therefore unsafe except within VACUUM FULL. * The only reason we support this capability at all is that by using it, * VACUUM FULL need not cope with LP_REDIRECT items at all; which seems a * good thing since VACUUM FULL is overly complicated already. * * If report_stats is true then we send the number of reclaimed heap-only * tuples to pgstats. (This must be FALSE during vacuum, since vacuum will * send its own new total to pgstats, and we don't want this delta applied * on top of that.) * * Returns the number of tuples deleted from the page. */ int heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, bool redirect_move, bool report_stats) { int ndeleted = 0; Page page = BufferGetPage(buffer); OffsetNumber offnum, maxoff; PruneState prstate; /* * Our strategy is to scan the page and make lists of items to change, * then apply the changes within a critical section. This keeps as much * logic as possible out of the critical section, and also ensures that * WAL replay will work the same as the normal case. * * First, inform inval.c that upcoming CacheInvalidateHeapTuple calls are * nontransactional. */ if (redirect_move) BeginNonTransactionalInvalidation(); /* * Initialize the new pd_prune_xid value to zero (indicating no prunable * tuples). If we find any tuples which may soon become prunable, we will * save the lowest relevant XID in new_prune_xid. Also initialize the rest * of our working state. */ prstate.new_prune_xid = InvalidTransactionId; prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); /* Scan the page */ maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; /* Ignore items already processed as part of an earlier chain */ if (prstate.marked[offnum]) continue; /* Nothing to do if slot is empty or already dead */ itemid = PageGetItemId(page, offnum); if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) continue; /* Process this item or chain of items */ ndeleted += heap_prune_chain(relation, buffer, offnum, OldestXmin, &prstate, redirect_move); } /* * Send invalidation messages for any tuples we are about to move. It is * safe to do this now, even though we could theoretically still fail * before making the actual page update, because a useless cache * invalidation doesn't hurt anything. Also, no one else can reload the * tuples while we have exclusive buffer lock, so it's not too early to * send the invals. This avoids sending the invals while inside the * critical section, which is a good thing for robustness. */ if (redirect_move) EndNonTransactionalInvalidation(); /* Any error while applying the changes is critical */ START_CRIT_SECTION(); /* Have we found any prunable items? */ if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0) { /* * Apply the planned item changes, then repair page fragmentation, and * update the page's hint bit about whether it has free line pointers. 
     */
        heap_page_prune_execute(buffer,
                                prstate.redirected, prstate.nredirected,
                                prstate.nowdead, prstate.ndead,
                                prstate.nowunused, prstate.nunused,
                                redirect_move);

        /*
         * Update the page's pd_prune_xid field to either zero, or the lowest
         * XID of any soon-prunable tuple.
         */
        ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

        /*
         * Also clear the "page is full" flag, since there's no point in
         * repeating the prune/defrag process until something else happens to
         * the page.
         */
        PageClearFull(page);

        MarkBufferDirty(buffer);

        /*
         * Emit a WAL HEAP_CLEAN or HEAP_CLEAN_MOVE record showing what we did
         */
        if (!relation->rd_istemp)
        {
            XLogRecPtr  recptr;

            recptr = log_heap_clean(relation, buffer,
                                    prstate.redirected, prstate.nredirected,
                                    prstate.nowdead, prstate.ndead,
                                    prstate.nowunused, prstate.nunused,
                                    redirect_move);
            PageSetLSN(BufferGetPage(buffer), recptr);
        }
    }
    else
    {
        /*
         * If we didn't prune anything, but have found a new value for the
         * pd_prune_xid field, update it and mark the buffer dirty. This is
         * treated as a non-WAL-logged hint.
         *
         * Also clear the "page is full" flag if it is set, since there's no
         * point in repeating the prune/defrag process until something else
         * happens to the page.
         */
        if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
            PageIsFull(page))
        {
            ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
            PageClearFull(page);
            MarkBufferDirtyHint(buffer, true);
        }
    }

    END_CRIT_SECTION();

    /*
     * If requested, report the number of tuples reclaimed to pgstats. This is
     * ndeleted minus ndead, because we don't want to count a now-DEAD root
     * item as a deletion for this purpose.
     */
    if (report_stats && ndeleted > prstate.ndead)
        pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);

    /*
     * XXX Should we update the FSM information of this page ?
     *
     * There are two schools of thought here. We may not want to update FSM
     * information so that the page is not used for unrelated UPDATEs/INSERTs
     * and any free space in this page will remain available for further
     * UPDATEs in *this* page, thus improving chances for doing HOT updates.
     *
     * But for a large table and where a page does not receive further UPDATEs
     * for a long time, we might waste this space by not updating the FSM
     * information. The relation may get extended and fragmented further.
     *
     * One possibility is to leave "fillfactor" worth of space in this page
     * and update FSM with the remaining space.
     *
     * In any case, the current FSM implementation doesn't accept
     * one-page-at-a-time updates, so this is all academic for now.
     */

    return ndeleted;
}
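Besides VACUUM, pruning is triggered opportunistically during normal page access. The sketch below is condensed from heap_page_prune_opt in pruneheap.c (thresholds and locking details vary by version): it prunes only when the page advertises prunable tuples, looks short on free space, and a cleanup lock can be had without waiting.

/*
 * Condensed sketch of the opportunistic caller; treat the details as
 * approximate.
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
{
    Page    page = BufferGetPage(buffer);
    Size    minfree;

    /* nothing to do if no tuple on the page can possibly be pruned yet */
    if (!PageIsPrunable(page, OldestXmin))
        return;

    /* prune when the page is marked full or free space is below threshold */
    minfree = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR);
    minfree = Max(minfree, BLCKSZ / 10);

    if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
    {
        /* only prune if we can get the cleanup lock without waiting */
        if (!ConditionalLockBufferForCleanup(buffer))
            return;

        if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
            heap_page_prune(relation, buffer, OldestXmin, false, true);

        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    }
}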
/* * gistkillitems() -- set LP_DEAD state for items an indexscan caller has * told us were killed. * * We re-read page here, so it's important to check page LSN. If the page * has been modified since the last read (as determined by LSN), we cannot * flag any entries because it is possible that the old entry was vacuumed * away and the TID was re-used by a completely different heap tuple. */ static void gistkillitems(IndexScanDesc scan) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; Buffer buffer; Page page; OffsetNumber offnum; ItemId iid; int i; bool killedsomething = false; Assert(so->curBlkno != InvalidBlockNumber); Assert(!XLogRecPtrIsInvalid(so->curPageLSN)); Assert(so->killedItems != NULL); buffer = ReadBuffer(scan->indexRelation, so->curBlkno); if (!BufferIsValid(buffer)) return; LockBuffer(buffer, GIST_SHARE); gistcheckpage(scan->indexRelation, buffer); page = BufferGetPage(buffer); /* * If page LSN differs it means that the page was modified since the last * read. killedItems could be not valid so LP_DEAD hints applying is not * safe. */ if (BufferGetLSNAtomic(buffer) != so->curPageLSN) { UnlockReleaseBuffer(buffer); so->numKilled = 0; /* reset counter */ return; } Assert(GistPageIsLeaf(page)); /* * Mark all killedItems as dead. We need no additional recheck, because, * if page was modified, pageLSN must have changed. */ for (i = 0; i < so->numKilled; i++) { offnum = so->killedItems[i]; iid = PageGetItemId(page, offnum); ItemIdMarkDead(iid); killedsomething = true; } if (killedsomething) { GistMarkPageHasGarbage(page); MarkBufferDirtyHint(buffer, true); } UnlockReleaseBuffer(buffer); /* * Always reset the scan state, so we don't look for same items on other * pages. */ so->numKilled = 0; }
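The killedItems array consulted above is filled while the scan returns tuples: when the executor reports kill_prior_tuple, the scan merely remembers the offset of the item it last returned, and the LSN check in gistkillitems is what later makes applying the hints safe. Roughly paraphrased from gistgettuple in gistget.c; the allocation context handling and exact field names may differ.

    /* Roughly paraphrased from gistgettuple; field names are approximate */
    if (scan->kill_prior_tuple && so->curPageData > 0)
    {
        if (so->killedItems == NULL)
            so->killedItems = (OffsetNumber *)
                palloc(MaxIndexTuplesPerPage * sizeof(OffsetNumber));

        /* remember the offset of the tuple we returned most recently */
        if (so->numKilled < MaxIndexTuplesPerPage)
            so->killedItems[so->numKilled++] =
                so->pageData[so->curPageData - 1].offnum;
    }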
/* * Recursive guts of FreeSpaceMapVacuum * * Examine the FSM page indicated by addr, as well as its children, updating * upper-level nodes that cover the heap block range from start to end-1. * (It's okay if end is beyond the actual end of the map.) * Return the maximum freespace value on this page. * * If addr is past the end of the FSM, set *eof_p to true and return 0. * * This traverses the tree in depth-first order. The tree is stored * physically in depth-first order, so this should be pretty I/O efficient. */ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, BlockNumber start, BlockNumber end, bool *eof_p) { Buffer buf; Page page; uint8 max_avail; /* Read the page if it exists, or return EOF */ buf = fsm_readbuf(rel, addr, false); if (!BufferIsValid(buf)) { *eof_p = true; return 0; } else *eof_p = false; page = BufferGetPage(buf); /* * If we're above the bottom level, recurse into children, and fix the * information stored about them at this level. */ if (addr.level > FSM_BOTTOM_LEVEL) { FSMAddress fsm_start, fsm_end; uint16 fsm_start_slot, fsm_end_slot; int slot, start_slot, end_slot; bool eof = false; /* * Compute the range of slots we need to update on this page, given * the requested range of heap blocks to consider. The first slot to * update is the one covering the "start" block, and the last slot is * the one covering "end - 1". (Some of this work will be duplicated * in each recursive call, but it's cheap enough to not worry about.) */ fsm_start = fsm_get_location(start, &fsm_start_slot); fsm_end = fsm_get_location(end - 1, &fsm_end_slot); while (fsm_start.level < addr.level) { fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot); fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot); } Assert(fsm_start.level == addr.level); if (fsm_start.logpageno == addr.logpageno) start_slot = fsm_start_slot; else if (fsm_start.logpageno > addr.logpageno) start_slot = SlotsPerFSMPage; /* shouldn't get here... */ else start_slot = 0; if (fsm_end.logpageno == addr.logpageno) end_slot = fsm_end_slot; else if (fsm_end.logpageno > addr.logpageno) end_slot = SlotsPerFSMPage - 1; else end_slot = -1; /* shouldn't get here... */ for (slot = start_slot; slot <= end_slot; slot++) { int child_avail; CHECK_FOR_INTERRUPTS(); /* After we hit end-of-file, just clear the rest of the slots */ if (!eof) child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), start, end, &eof); else child_avail = 0; /* Update information about the child */ if (fsm_get_avail(page, slot) != child_avail) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); fsm_set_avail(page, slot, child_avail); MarkBufferDirtyHint(buf, false); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } } } /* Now get the maximum value on the page, to return to caller */ max_avail = fsm_get_max_avail(page); /* * Reset the next slot pointer. This encourages the use of low-numbered * pages, increasing the chances that a later vacuum can truncate the * relation. We don't bother with a lock here, nor with marking the page * dirty if it wasn't already, since this is just a hint. */ ((FSMPage) PageGetContents(page))->fp_next_slot = 0; ReleaseBuffer(buf); return max_avail; }
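A sketch of the range-based entry point that starts this recursion at the root, modeled on FreeSpaceMapVacuumRange in freespace.c:

/*
 * Sketch modeled on FreeSpaceMapVacuumRange: vacuum the parts of the FSM
 * covering heap blocks start..end-1, starting at the root.
 */
void
FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
{
    bool    dummy;

    if (end > start)
        (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
}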
/* * Searches for a slot with category at least minvalue. * Returns slot number, or -1 if none found. * * The caller must hold at least a shared lock on the page, and this * function can unlock and lock the page again in exclusive mode if it * needs to be updated. exclusive_lock_held should be set to true if the * caller is already holding an exclusive lock, to avoid extra work. * * If advancenext is false, fp_next_slot is set to point to the returned * slot, and if it's true, to the slot after the returned slot. */ int fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, bool exclusive_lock_held) { Page page = BufferGetPage(buf); FSMPage fsmpage = (FSMPage) PageGetContents(page); int nodeno; int target; uint16 slot; restart: /* * Check the root first, and exit quickly if there's no leaf with enough * free space */ if (fsmpage->fp_nodes[0] < minvalue) return -1; /* * Start search using fp_next_slot. It's just a hint, so check that it's * sane. (This also handles wrapping around when the prior call returned * the last slot on the page.) */ target = fsmpage->fp_next_slot; if (target < 0 || target >= LeafNodesPerPage) target = 0; target += NonLeafNodesPerPage; /*---------- * Start the search from the target slot. At every step, move one * node to the right, then climb up to the parent. Stop when we reach * a node with enough free space (as we must, since the root has enough * space). * * The idea is to gradually expand our "search triangle", that is, all * nodes covered by the current node, and to be sure we search to the * right from the start point. At the first step, only the target slot * is examined. When we move up from a left child to its parent, we are * adding the right-hand subtree of that parent to the search triangle. * When we move right then up from a right child, we are dropping the * current search triangle (which we know doesn't contain any suitable * page) and instead looking at the next-larger-size triangle to its * right. So we never look left from our original start point, and at * each step the size of the search triangle doubles, ensuring it takes * only log2(N) work to search N pages. * * The "move right" operation will wrap around if it hits the right edge * of the tree, so the behavior is still good if we start near the right. * Note also that the move-and-climb behavior ensures that we can't end * up on one of the missing nodes at the right of the leaf level. * * For example, consider this tree: * * 7 * 7 6 * 5 7 6 5 * 4 5 5 7 2 6 5 2 * T * * Assume that the target node is the node indicated by the letter T, * and we're searching for a node with value of 6 or higher. The search * begins at T. At the first iteration, we move to the right, then to the * parent, arriving at the rightmost 5. At the second iteration, we move * to the right, wrapping around, then climb up, arriving at the 7 on the * third level. 7 satisfies our search, so we descend down to the bottom, * following the path of sevens. This is in fact the first suitable page * to the right of (allowing for wraparound) our start point. *---------- */ nodeno = target; while (nodeno > 0) { if (fsmpage->fp_nodes[nodeno] >= minvalue) break; /* * Move to the right, wrapping around on same level if necessary, then * climb up. */ nodeno = parentof(rightneighbor(nodeno)); } /* * We're now at a node with enough free space, somewhere in the middle of * the tree. Descend to the bottom, following a path with enough free * space, preferring to move left if there's a choice. 
*/ while (nodeno < NonLeafNodesPerPage) { int childnodeno = leftchild(nodeno); if (childnodeno < NodesPerPage && fsmpage->fp_nodes[childnodeno] >= minvalue) { nodeno = childnodeno; continue; } childnodeno++; /* point to right child */ if (childnodeno < NodesPerPage && fsmpage->fp_nodes[childnodeno] >= minvalue) { nodeno = childnodeno; } else { /* * Oops. The parent node promised that either left or right child * has enough space, but neither actually did. This can happen in * case of a "torn page", IOW if we crashed earlier while writing * the page to disk, and only part of the page made it to disk. * * Fix the corruption and restart. */ RelFileNode rnode; ForkNumber forknum; BlockNumber blknum; BufferGetTag(buf, &rnode, &forknum, &blknum); elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", blknum, rnode.spcNode, rnode.dbNode, rnode.relNode); /* make sure we hold an exclusive lock */ if (!exclusive_lock_held) { LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); exclusive_lock_held = true; } fsm_rebuild_page(page); MarkBufferDirtyHint(buf, false); goto restart; } } /* We're now at the bottom level, at a node with enough space. */ slot = nodeno - NonLeafNodesPerPage; /* * Update the next-target pointer. Note that we do this even if we're only * holding a shared lock, on the grounds that it's better to use a shared * lock and get a garbled next pointer every now and then, than take the * concurrency hit of an exclusive lock. * * Wrap-around is handled at the beginning of this function. */ fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); return slot; }
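The tree arithmetic used above (leftchild, parentof, rightneighbor) is ordinary binary-heap addressing within a single FSM page. A condensed rendering of those helpers from fsmpage.c, shown here for reference; node counts such as NonLeafNodesPerPage depend on BLCKSZ.

/* Binary-heap style addressing within one FSM page (from fsmpage.c) */
#define leftchild(x)    (2 * (x) + 1)
#define rightchild(x)   (2 * (x) + 2)
#define parentof(x)     (((x) - 1) / 2)

/*
 * Return the node to the right of x on the same level, wrapping around to
 * the leftmost node of the level if x is the rightmost.
 */
static int
rightneighbor(int x)
{
    /* move right; this might step to the leftmost node of the next level */
    x++;

    /*
     * The leftmost node on each level is numbered 2^level - 1, so if (x + 1)
     * is a power of two we wrapped past the edge; step back up to the parent,
     * which is the leftmost node of the original level.
     */
    if (((x + 1) & x) == 0)
        x = parentof(x);

    return x;
}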