static void
killtuple(Relation r, GISTScanOpaque so, ItemPointer iptr)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	Page		p;
	OffsetNumber offset;

	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	LockBuffer(so->curbuf, GIST_SHARE);
	gistcheckpage(r, so->curbuf);
	p = (Page) BufferGetPage(so->curbuf);

	if (XLByteEQ(so->stack->lsn, PageGetLSN(p)))
	{
		/* page unchanged, so all is simple */
		offset = ItemPointerGetOffsetNumber(iptr);
		ItemIdMarkDead(PageGetItemId(p, offset));
		SetBufferCommitInfoNeedsSave(so->curbuf);
	}
	else
	{
		OffsetNumber maxoff = PageGetMaxOffsetNumber(p);

		for (offset = FirstOffsetNumber;
			 offset <= maxoff;
			 offset = OffsetNumberNext(offset))
		{
			IndexTuple	ituple = (IndexTuple) PageGetItem(p, PageGetItemId(p, offset));

			if (ItemPointerEquals(&(ituple->t_tid), iptr))
			{
				/* found */
				ItemIdMarkDead(PageGetItemId(p, offset));
				SetBufferCommitInfoNeedsSave(so->curbuf);
				break;
			}
		}
	}

	LockBuffer(so->curbuf, GIST_UNLOCK);

	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------
}
/*
 * SetHintBits()
 *
 * Set commit/abort hint bits on a tuple, if appropriate at this time.
 *
 * It is only safe to set a transaction-committed hint bit if we know the
 * transaction's commit record has been flushed to disk.  We cannot change
 * the LSN of the page here because we may hold only a share lock on the
 * buffer, so we can't use the LSN to interlock this; we have to just refrain
 * from setting the hint bit until some future re-examination of the tuple.
 *
 * We can always set hint bits when marking a transaction aborted.  (Some
 * code in heapam.c relies on that!)
 *
 * Also, if we are cleaning up HEAP_MOVED_IN or HEAP_MOVED_OFF entries, then
 * we can always set the hint bits, since VACUUM FULL always uses synchronous
 * commits and doesn't move tuples that weren't previously hinted.  (This is
 * not known by this subroutine, but is applied by its callers.)
 *
 * Normal commits may be asynchronous, so for those we need to get the LSN
 * of the transaction and then check whether this is flushed.
 *
 * The caller should pass xid as the XID of the transaction to check, or
 * InvalidTransactionId if no check is needed.
 */
static inline void
SetHintBits(HeapTupleHeader tuple, Buffer buffer,
			uint16 infomask, TransactionId xid)
{
	if (TransactionIdIsValid(xid))
	{
		/* NB: xid must be known committed here! */
		XLogRecPtr	commitLSN = TransactionIdGetCommitLSN(xid);

		if (XLogNeedsFlush(commitLSN))
			return;				/* not flushed yet, so don't set hint */
	}

	tuple->t_infomask |= infomask;
	SetBufferCommitInfoNeedsSave(buffer);
}
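/*
 * [Editor's illustration, not PostgreSQL code] The flush gate above is the
 * essential idea: a "transaction committed" hint bit may only be set once
 * the commit record is known to be on disk.  The stand-alone sketch below
 * models that rule with invented stand-ins (flushed_lsn, set_commit_hint,
 * HINT_XMIN_COMMITTED) for XLogNeedsFlush(), SetHintBits(), and
 * HEAP_XMIN_COMMITTED; it is a simplified sketch, not backend code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HINT_XMIN_COMMITTED 0x0100		/* stand-in for HEAP_XMIN_COMMITTED */

typedef struct
{
	uint16_t	infomask;
} FakeTupleHeader;

static uint64_t flushed_lsn = 500;		/* pretend WAL is durable up to here */

/* Set the "inserter committed" hint only if its commit record is flushed. */
static bool
set_commit_hint(FakeTupleHeader *tup, uint64_t commit_lsn)
{
	if (commit_lsn > flushed_lsn)
		return false;			/* not flushed yet: refrain, retry later */
	tup->infomask |= HINT_XMIN_COMMITTED;
	return true;				/* caller would now mark the buffer dirty */
}

int
main(void)
{
	FakeTupleHeader t = {0};

	printf("async commit at LSN 800 hinted now? %d\n", set_commit_hint(&t, 800));
	printf("commit at LSN 450 hinted now?      %d\n", set_commit_hint(&t, 450));
	printf("infomask = 0x%04x\n", (unsigned) t.infomask);
	return 0;
}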
Datum
rtgettuple(PG_FUNCTION_ARGS)
{
	IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
	RTreeScanOpaque so = (RTreeScanOpaque) s->opaque;
	Page		page;
	OffsetNumber offnum;

	/*
	 * If we've already produced a tuple and the executor has informed us that
	 * it should be marked "killed", do so now.
	 */
	if (s->kill_prior_tuple && ItemPointerIsValid(&(s->currentItemData)))
	{
		offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
		page = BufferGetPage(so->curbuf);
		PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
		SetBufferCommitInfoNeedsSave(so->curbuf);
	}

	/*
	 * Get the next tuple that matches the search key; if asked to skip killed
	 * tuples, find the first non-killed tuple that matches.  Return as soon
	 * as we've run out of matches or we've found an acceptable match.
	 */
	for (;;)
	{
		bool		res = rtnext(s, dir);

		if (res && s->ignore_killed_tuples)
		{
			offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
			page = BufferGetPage(so->curbuf);
			if (ItemIdDeleted(PageGetItemId(page, offnum)))
				continue;
		}

		PG_RETURN_BOOL(res);
	}
}
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. 
*/ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. */ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); if (!PageIsAllVisible(page)) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. 
*/ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* Update the all-visible flag on the page */ if (!PageIsAllVisible(page) && all_visible) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. 
* * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. */ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); SetBufferCommitInfoNeedsSave(buf); /* * Normally, we would drop the lock on the heap page before * updating the visibility map, but since this case shouldn't * happen anyway, don't worry about that. */ visibilitymap_clear(onerel, blkno); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm && all_visible) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Release the pin on the visibility map page */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
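/*
 * [Editor's illustration, not PostgreSQL code] The SKIP_PAGES_THRESHOLD
 * hysteresis in lazy_scan_heap above is easier to follow in isolation.  The
 * stand-alone sketch below replays the same decision over a fake all-visible
 * bitmap; vm[], NBLOCKS and SKIP_THRESHOLD are invented stand-ins for the
 * visibility map and SKIP_PAGES_THRESHOLD, and the scan_all and vmbuffer
 * handling of the real code is omitted.
 */
#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 16
#define SKIP_THRESHOLD 4

int
main(void)
{
	/* true = page is marked all-visible in the (fake) visibility map */
	bool		vm[NBLOCKS] = {1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1};
	int			next_not_all_visible,
				blkno;
	bool		skipping;

	/* establish the invariant before the main loop */
	for (next_not_all_visible = 0;
		 next_not_all_visible < NBLOCKS && vm[next_not_all_visible];
		 next_not_all_visible++)
		;
	skipping = (next_not_all_visible >= SKIP_THRESHOLD);

	for (blkno = 0; blkno < NBLOCKS; blkno++)
	{
		if (blkno == next_not_all_visible)
		{
			/* advance to the next not-all-visible block */
			for (next_not_all_visible++;
				 next_not_all_visible < NBLOCKS && vm[next_not_all_visible];
				 next_not_all_visible++)
				;
			/* only start skipping if the upcoming run is long enough */
			skipping = (next_not_all_visible - blkno > SKIP_THRESHOLD);
		}
		else if (skipping)
		{
			printf("block %2d: skipped (all-visible run)\n", blkno);
			continue;
		}
		printf("block %2d: scanned\n", blkno);
	}
	return 0;
}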
/* * btvacuumpage --- VACUUM one page * * This processes a single page for btvacuumscan(). In some cases we * must go back and re-examine previously-scanned pages; this routine * recurses when necessary to handle that case. * * blkno is the page to process. orig_blkno is the highest block number * reached by the outer btvacuumscan loop (the same as blkno, unless we * are recursing to re-examine a previous page). */ static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) { MIRROREDLOCK_BUFMGR_DECLARE; IndexVacuumInfo *info = vstate->info; IndexBulkDeleteResult *stats = vstate->stats; IndexBulkDeleteCallback callback = vstate->callback; void *callback_state = vstate->callback_state; Relation rel = info->index; bool delete_now; BlockNumber recurse_to; Buffer buf; Page page; BTPageOpaque opaque; restart: delete_now = false; recurse_to = P_NONE; /* call vacuum_delay_point while not holding any buffer lock */ vacuum_delay_point(); /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. We want to * recycle all-zero pages, not fail. Also, we want to use a nondefault * buffer access strategy. */ // -------- MirroredLock ---------- MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBufferWithStrategy(rel, blkno, info->strategy); LockBuffer(buf, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!PageIsNew(page)) _bt_checkpage(rel, buf); /* * If we are recursing, the only case we want to do anything with is a * live leaf page having the current vacuum cycle ID. Any other state * implies we already saw the page (eg, deleted it as being empty). In * particular, we don't want to risk adding it to freePages twice. */ if (blkno != orig_blkno) { if (_bt_page_recyclable(page) || P_IGNORE(opaque) || !P_ISLEAF(opaque) || opaque->btpo_cycleid != vstate->cycleid) { _bt_relbuf(rel, buf); MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- return; } } /* Page is valid, see what to do with it */ if (_bt_page_recyclable(page)) { /* Okay to recycle this page */ if (vstate->nFreePages < vstate->maxFreePages) vstate->freePages[vstate->nFreePages++] = blkno; vstate->totFreePages++; stats->pages_deleted++; } else if (P_ISDELETED(opaque)) { /* Already deleted, but can't recycle yet */ stats->pages_deleted++; } else if (P_ISHALFDEAD(opaque)) { /* Half-dead, try to delete */ delete_now = true; } else if (P_ISLEAF(opaque)) { OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; OffsetNumber offnum, minoff, maxoff; /* * Trade in the initial read lock for a super-exclusive write lock on * this page. We must get such a lock on every leaf page over the * course of the vacuum scan, whether or not it actually contains any * deletable tuples --- see nbtree/README. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); /* * Check whether we need to recurse back to earlier pages. What we * are concerned about is a page split that happened since we started * the vacuum scan. If the split moved some tuples to a lower page * then we might have missed 'em. If so, set up for tail recursion. * (Must do this before possibly clearing btpo_cycleid below!) */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid && !(opaque->btpo_flags & BTP_SPLIT_END) && !P_RIGHTMOST(opaque) && opaque->btpo_next < orig_blkno) recurse_to = opaque->btpo_next; /* * Scan over all items to see which ones need deleted according to the * callback function. 
*/ ndeletable = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) { for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); htup = &(itup->t_tid); if (callback(htup, callback_state)) deletable[ndeletable++] = offnum; } } /* * Apply any needed deletes. We issue just one _bt_delitems() call * per page, so as to minimize WAL traffic. */ if (ndeletable > 0) { _bt_delitems(rel, buf, deletable, ndeletable, true); stats->tuples_removed += ndeletable; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } else { /* * If the page has been split during this vacuum cycle, it seems * worth expending a write to clear btpo_cycleid even if we don't * have any deletions to do. (If we do, _bt_delitems takes care * of this.) This ensures we won't process the page again. * * We treat this like a hint-bit update because there's no need to * WAL-log it. */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { opaque->btpo_cycleid = 0; SetBufferCommitInfoNeedsSave(buf); } } /* * If it's now empty, try to delete; else count the live tuples. We * don't delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). */ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else stats->num_index_tuples += maxoff - minoff + 1; } if (delete_now) { MemoryContext oldcontext; int ndel; /* Run pagedel in a temp context to avoid memory leakage */ MemoryContextReset(vstate->pagedelcontext); oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full); /* count only this page, else may double-count parent */ if (ndel) stats->pages_deleted++; /* * During VACUUM FULL it's okay to recycle deleted pages immediately, * since there can be no other transactions scanning the index. Note * that we will only recycle the current page and not any parent pages * that _bt_pagedel might have recursed to; this seems reasonable in * the name of simplicity. (Trying to do otherwise would mean we'd * have to sort the list of recyclable pages we're building.) */ if (ndel && info->vacuum_full) { if (vstate->nFreePages < vstate->maxFreePages) vstate->freePages[vstate->nFreePages++] = blkno; vstate->totFreePages++; } MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ } else _bt_relbuf(rel, buf); MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- /* * This is really tail recursion, but if the compiler is too stupid to * optimize it as such, we'd eat an uncomfortably large amount of stack * space per recursion level (due to the deletable[] array). A failure is * improbable since the number of levels isn't likely to be large ... but * just in case, let's hand-optimize into a loop. */ if (recurse_to != P_NONE) { blkno = recurse_to; goto restart; } }
/*
 * _bt_killitems - set LP_DEAD state for items an indexscan caller has
 * told us were killed
 *
 * scan->so contains information about the current page and killed tuples
 * thereon (generally, this should only be called if so->numKilled > 0).
 *
 * The caller must have pin on so->currPos.buf, but may or may not have
 * read-lock, as indicated by haveLock.  Note that we assume read-lock
 * is sufficient for setting LP_DEAD status (which is only a hint).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.  We cope with cases where items have moved right due to insertions.
 * If an item has moved off the current page due to a split, we'll fail to
 * find it and do nothing (this is not an error case --- we assume the item
 * will eventually get marked in a future indexscan).  Note that because we
 * hold pin on the target page continuously from initially reading the items
 * until applying this function, VACUUM cannot have deleted any items from
 * the page, and so there is no need to search left from the recorded offset.
 * (This observation also guarantees that the item is still the right one
 * to delete, which might otherwise be questionable since heap TIDs can get
 * recycled.)
 */
void
_bt_killitems(IndexScanDesc scan, bool haveLock)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	int			i;
	bool		killedsomething = false;

	Assert(BufferIsValid(so->currPos.buf));

	if (!haveLock)
		LockBuffer(so->currPos.buf, BT_READ);

	page = BufferGetPage(so->currPos.buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	for (i = 0; i < so->numKilled; i++)
	{
		int			itemIndex = so->killedItems[i];
		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
		OffsetNumber offnum = kitem->indexOffset;

		Assert(itemIndex >= so->currPos.firstItem &&
			   itemIndex <= so->currPos.lastItem);
		if (offnum < minoff)
			continue;			/* pure paranoia */

		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, it's treated the same as a
	 * commit-hint-bit status update for heap tuples: we mark the buffer dirty
	 * but don't make a WAL log entry.
	 *
	 * Whenever we mark anything LP_DEAD, we also set the page's
	 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->btpo_flags |= BTP_HAS_GARBAGE;
		SetBufferCommitInfoNeedsSave(so->currPos.buf);
	}

	if (!haveLock)
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

	/*
	 * Always reset the scan state, so we don't look for same items on other
	 * pages.
	 */
	so->numKilled = 0;
}
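/*
 * [Editor's illustration, not PostgreSQL code] _bt_killitems above matches
 * killed items by heap TID, searching only rightward from the remembered
 * offset (insertions can move items right within a page, and the held pin
 * guarantees nothing was removed, so searching left is unnecessary).  A
 * minimal stand-alone model of that search, with invented types, is below.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct
{
	int			blk;
	int			off;
}			Tid;				/* fake heap TID */

typedef struct
{
	Tid			heap_tid;
	bool		dead;
}			Item;				/* fake index line pointer + tuple */

static bool
tid_eq(Tid a, Tid b)
{
	return a.blk == b.blk && a.off == b.off;
}

/* Mark dead the item whose heap TID matches, searching right from 'start'. */
static bool
kill_item(Item *page, int nitems, int start, Tid target)
{
	for (int off = start; off < nitems; off++)
	{
		if (tid_eq(page[off].heap_tid, target))
		{
			page[off].dead = true;	/* like ItemIdMarkDead: a hint only */
			return true;
		}
	}
	return false;				/* moved off-page by a split: do nothing */
}

int
main(void)
{
	/* An insertion pushed the remembered item one slot to the right. */
	Item		page[4] = {{{1, 1}, false}, {{9, 9}, false}, {{1, 2}, false}, {{1, 3}, false}};
	bool		found = kill_item(page, 4, 1, (Tid) {1, 2});

	printf("found=%d, item 2 dead=%d\n", found, page[2].dead);
	return 0;
}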
/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */
Datum
hashgettuple(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	Buffer		buf;
	Page		page;
	OffsetNumber offnum;
	ItemPointer current;
	bool		res;

	/* Hash indexes are always lossy since we store only the hash code */
	scan->xs_recheck = true;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine to
	 * get the first item in the scan.
	 */
	current = &(so->hashso_curpos);
	if (ItemPointerIsValid(current))
	{
		/*
		 * An insertion into the current index page could have happened while
		 * we didn't have read lock on it.  Re-find our position by looking
		 * for the TID we previously returned.  (Because we hold share lock on
		 * the bucket, no deletions or splits could have occurred; therefore
		 * we can expect that the TID still exists in the current index page,
		 * at an offset >= where we were.)
		 */
		OffsetNumber maxoffnum;

		buf = so->hashso_curbuf;
		Assert(BufferIsValid(buf));
		page = BufferGetPage(buf);
		maxoffnum = PageGetMaxOffsetNumber(page);
		for (offnum = ItemPointerGetOffsetNumber(current);
			 offnum <= maxoffnum;
			 offnum = OffsetNumberNext(offnum))
		{
			IndexTuple	itup;

			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
			if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
				break;
		}
		if (offnum > maxoffnum)
			elog(ERROR, "failed to re-find scan position within index \"%s\"",
				 RelationGetRelationName(rel));
		ItemPointerSetOffsetNumber(current, offnum);

		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so mark it by setting the LP_DEAD state in the item flags.
			 */
			ItemIdMarkDead(PageGetItemId(page, offnum));

			/*
			 * Since this can be redone later if needed, it's treated the same
			 * as a commit-hint-bit status update for heap tuples: we mark the
			 * buffer dirty but don't make a WAL log entry.
			 */
			SetBufferCommitInfoNeedsSave(buf);
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}
	else
		res = _hash_first(scan, dir);

	/*
	 * Skip killed tuples if asked to.
	 */
	if (scan->ignore_killed_tuples)
	{
		while (res)
		{
			offnum = ItemPointerGetOffsetNumber(current);
			page = BufferGetPage(so->hashso_curbuf);
			if (!ItemIdIsDead(PageGetItemId(page, offnum)))
				break;
			res = _hash_next(scan, dir);
		}
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

	/* Return current heap TID on success */
	scan->xs_ctup.t_self = so->hashso_heappos;

	PG_RETURN_BOOL(res);
}
/* * btvacuumpage --- VACUUM one page * * This processes a single page for btvacuumscan(). In some cases we * must go back and re-examine previously-scanned pages; this routine * recurses when necessary to handle that case. * * blkno is the page to process. orig_blkno is the highest block number * reached by the outer btvacuumscan loop (the same as blkno, unless we * are recursing to re-examine a previous page). */ static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) { IndexVacuumInfo *info = vstate->info; IndexBulkDeleteResult *stats = vstate->stats; IndexBulkDeleteCallback callback = vstate->callback; void *callback_state = vstate->callback_state; Relation rel = info->index; bool delete_now; BlockNumber recurse_to; Buffer buf; Page page; BTPageOpaque opaque; restart: delete_now = false; recurse_to = P_NONE; /* call vacuum_delay_point while not holding any buffer lock */ vacuum_delay_point(); /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. We want to * recycle all-zero pages, not fail. Also, we want to use a nondefault * buffer access strategy. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buf, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!PageIsNew(page)) _bt_checkpage(rel, buf); /* * If we are recursing, the only case we want to do anything with is a * live leaf page having the current vacuum cycle ID. Any other state * implies we already saw the page (eg, deleted it as being empty). */ if (blkno != orig_blkno) { if (_bt_page_recyclable(page) || P_IGNORE(opaque) || !P_ISLEAF(opaque) || opaque->btpo_cycleid != vstate->cycleid) { _bt_relbuf(rel, buf); return; } } /* If the page is in use, update lastUsedPage */ if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno) vstate->lastUsedPage = blkno; /* Page is valid, see what to do with it */ if (_bt_page_recyclable(page)) { /* Okay to recycle this page */ RecordFreeIndexPage(rel, blkno); vstate->totFreePages++; stats->pages_deleted++; } else if (P_ISDELETED(opaque)) { /* Already deleted, but can't recycle yet */ stats->pages_deleted++; } else if (P_ISHALFDEAD(opaque)) { /* Half-dead, try to delete */ delete_now = true; } else if (P_ISLEAF(opaque)) { OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; OffsetNumber offnum, minoff, maxoff; /* * Trade in the initial read lock for a super-exclusive write lock on * this page. We must get such a lock on every leaf page over the * course of the vacuum scan, whether or not it actually contains any * deletable tuples --- see nbtree/README. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); /* * Check whether we need to recurse back to earlier pages. What we * are concerned about is a page split that happened since we started * the vacuum scan. If the split moved some tuples to a lower page * then we might have missed 'em. If so, set up for tail recursion. * (Must do this before possibly clearing btpo_cycleid below!) */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid && !(opaque->btpo_flags & BTP_SPLIT_END) && !P_RIGHTMOST(opaque) && opaque->btpo_next < orig_blkno) recurse_to = opaque->btpo_next; /* * Scan over all items to see which ones need deleted according to the * callback function. 
*/ ndeletable = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) { for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); htup = &(itup->t_tid); /* * During Hot Standby we currently assume that * XLOG_BTREE_VACUUM records do not produce conflicts. That is * only true as long as the callback function depends only * upon whether the index tuple refers to heap tuples removed * in the initial heap scan. When vacuum starts it derives a * value of OldestXmin. Backends taking later snapshots could * have a RecentGlobalXmin with a later xid than the vacuum's * OldestXmin, so it is possible that row versions deleted * after OldestXmin could be marked as killed by other * backends. The callback function *could* look at the index * tuple state in isolation and decide to delete the index * tuple, though currently it does not. If it ever did, we * would need to reconsider whether XLOG_BTREE_VACUUM records * should cause conflicts. If they did cause conflicts they * would be fairly harsh conflicts, since we haven't yet * worked out a way to pass a useful value for * latestRemovedXid on the XLOG_BTREE_VACUUM records. This * applies to *any* type of index that marks index tuples as * killed. */ if (callback(htup, callback_state)) deletable[ndeletable++] = offnum; } } /* * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ if (ndeletable > 0) { BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf); _bt_delitems_vacuum(rel, buf, deletable, ndeletable, vstate->lastBlockVacuumed); /* * Keep track of the block number of the lastBlockVacuumed, so we * can scan those blocks as well during WAL replay. This then * provides concurrency protection and allows btrees to be used * while in recovery. */ if (lastBlockVacuumed > vstate->lastBlockVacuumed) vstate->lastBlockVacuumed = lastBlockVacuumed; stats->tuples_removed += ndeletable; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } else { /* * If the page has been split during this vacuum cycle, it seems * worth expending a write to clear btpo_cycleid even if we don't * have any deletions to do. (If we do, _bt_delitems_vacuum takes * care of this.) This ensures we won't process the page again. * * We treat this like a hint-bit update because there's no need to * WAL-log it. */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { opaque->btpo_cycleid = 0; SetBufferCommitInfoNeedsSave(buf); } } /* * If it's now empty, try to delete; else count the live tuples. We * don't delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). 
*/ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else stats->num_index_tuples += maxoff - minoff + 1; } if (delete_now) { MemoryContext oldcontext; int ndel; /* Run pagedel in a temp context to avoid memory leakage */ MemoryContextReset(vstate->pagedelcontext); oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); ndel = _bt_pagedel(rel, buf, NULL); /* count only this page, else may double-count parent */ if (ndel) stats->pages_deleted++; MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ } else _bt_relbuf(rel, buf); /* * This is really tail recursion, but if the compiler is too stupid to * optimize it as such, we'd eat an uncomfortably large amount of stack * space per recursion level (due to the deletable[] array). A failure is * improbable since the number of levels isn't likely to be large ... but * just in case, let's hand-optimize into a loop. */ if (recurse_to != P_NONE) { blkno = recurse_to; goto restart; } }
/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */
Datum
hashgettuple(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	Page		page;
	OffsetNumber offnum;
	bool		res;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine to
	 * get the first item in the scan.
	 */
	if (ItemPointerIsValid(&(scan->currentItemData)))
	{
		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so mark it by setting the LP_DELETE bit in the item flags.
			 */
			offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
			page = BufferGetPage(so->hashso_curbuf);
			PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;

			/*
			 * Since this can be redone later if needed, it's treated the same
			 * as a commit-hint-bit status update for heap tuples: we mark the
			 * buffer dirty but don't make a WAL log entry.
			 */
			SetBufferCommitInfoNeedsSave(so->hashso_curbuf);
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}
	else
		res = _hash_first(scan, dir);

	/*
	 * Skip killed tuples if asked to.
	 */
	if (scan->ignore_killed_tuples)
	{
		while (res)
		{
			offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
			page = BufferGetPage(so->hashso_curbuf);
			if (!ItemIdDeleted(PageGetItemId(page, offnum)))
				break;
			res = _hash_next(scan, dir);
		}
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

	PG_RETURN_BOOL(res);
}
/*
 * pgstattuple_real
 *
 * The real work occurs here
 */
static Datum
pgstattuple_real(Relation rel)
{
	HeapScanDesc scan;
	HeapTuple	tuple;
	BlockNumber nblocks;
	BlockNumber block = 0;		/* next block to count free space in */
	BlockNumber tupblock;
	Buffer		buffer;
	uint64		table_len;
	uint64		tuple_len = 0;
	uint64		dead_tuple_len = 0;
	uint64		tuple_count = 0;
	uint64		dead_tuple_count = 0;
	double		tuple_percent;
	double		dead_tuple_percent;
	uint64		free_space = 0; /* free/reusable space in bytes */
	double		free_percent;	/* free/reusable space in % */
	TupleDesc	tupdesc;
	TupleTableSlot *slot;
	AttInMetadata *attinmeta;
	char	  **values;
	int			i;
	Datum		result;

	/*
	 * Build a tuple description for a pgstattuple_type tuple
	 */
	tupdesc = RelationNameGetTupleDesc(DUMMY_TUPLE);

	/* allocate a slot for a tuple with this tupdesc */
	slot = TupleDescGetSlot(tupdesc);

	/*
	 * Generate attribute metadata needed later to produce tuples from raw C
	 * strings
	 */
	attinmeta = TupleDescGetAttInMetadata(tupdesc);

	nblocks = RelationGetNumberOfBlocks(rel);
	scan = heap_beginscan(rel, SnapshotAny, 0, NULL);

	/* scan the relation */
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		uint16		sv_infomask;

		sv_infomask = tuple->t_data->t_infomask;
		if (HeapTupleSatisfiesNow(tuple->t_data))
		{
			tuple_len += tuple->t_len;
			tuple_count++;
		}
		else
		{
			dead_tuple_len += tuple->t_len;
			dead_tuple_count++;
		}
		if (sv_infomask != tuple->t_data->t_infomask)
			SetBufferCommitInfoNeedsSave(scan->rs_cbuf);

		/*
		 * To avoid physically reading the table twice, try to do the
		 * free-space scan in parallel with the heap scan.  However,
		 * heap_getnext may find no tuples on a given page, so we cannot
		 * simply examine the pages returned by the heap scan.
		 */
		tupblock = BlockIdGetBlockNumber(&tuple->t_self.ip_blkid);

		while (block <= tupblock)
		{
			buffer = ReadBuffer(rel, block);
			free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
			ReleaseBuffer(buffer);
			block++;
		}
	}
	heap_endscan(scan);

	while (block < nblocks)
	{
		buffer = ReadBuffer(rel, block);
		free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
		ReleaseBuffer(buffer);
		block++;
	}

	heap_close(rel, AccessShareLock);

	table_len = (uint64) nblocks * BLCKSZ;

	if (nblocks == 0)
	{
		tuple_percent = 0.0;
		dead_tuple_percent = 0.0;
		free_percent = 0.0;
	}
	else
	{
		tuple_percent = (double) tuple_len * 100.0 / table_len;
		dead_tuple_percent = (double) dead_tuple_len * 100.0 / table_len;
		free_percent = (double) free_space * 100.0 / table_len;
	}

	/*
	 * Prepare a values array for storage in our slot.  This should be an
	 * array of C strings which will be processed later by the appropriate
	 * "in" functions.
	 */
	values = (char **) palloc(NCOLUMNS * sizeof(char *));
	for (i = 0; i < NCOLUMNS; i++)
		values[i] = (char *) palloc(NCHARS * sizeof(char));
	i = 0;
	snprintf(values[i++], NCHARS, INT64_FORMAT, table_len);
	snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_count);
	snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_len);
	snprintf(values[i++], NCHARS, "%.2f", tuple_percent);
	snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_count);
	snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_len);
	snprintf(values[i++], NCHARS, "%.2f", dead_tuple_percent);
	snprintf(values[i++], NCHARS, INT64_FORMAT, free_space);
	snprintf(values[i++], NCHARS, "%.2f", free_percent);

	/* build a tuple */
	tuple = BuildTupleFromCStrings(attinmeta, values);

	/* make the tuple into a datum */
	result = TupleGetDatum(slot, tuple);

	/* Clean up */
	for (i = 0; i < NCOLUMNS; i++)
		pfree(values[i]);
	pfree(values);

	return (result);
}
/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 *
 * If report_stats is true then we send the number of reclaimed heap-only
 * tuples to pgstats.  (This must be FALSE during vacuum, since vacuum will
 * send its own new total to pgstats, and we don't want this delta applied
 * on top of that.)
 *
 * Returns the number of tuples deleted from the page and sets
 * latestRemovedXid.
 */
int
heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
				bool report_stats, TransactionId *latestRemovedXid)
{
	int			ndeleted = 0;
	Page		page = BufferGetPage(buffer);
	OffsetNumber offnum,
				maxoff;
	PruneState	prstate;

	/*
	 * Our strategy is to scan the page and make lists of items to change,
	 * then apply the changes within a critical section.  This keeps as much
	 * logic as possible out of the critical section, and also ensures that
	 * WAL replay will work the same as the normal case.
	 *
	 * First, initialize the new pd_prune_xid value to zero (indicating no
	 * prunable tuples).  If we find any tuples which may soon become
	 * prunable, we will save the lowest relevant XID in new_prune_xid. Also
	 * initialize the rest of our working state.
	 */
	prstate.new_prune_xid = InvalidTransactionId;
	prstate.latestRemovedXid = *latestRemovedXid;
	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
	memset(prstate.marked, 0, sizeof(prstate.marked));

	/* Scan the page */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		/* Ignore items already processed as part of an earlier chain */
		if (prstate.marked[offnum])
			continue;

		/* Nothing to do if slot is empty or already dead */
		itemid = PageGetItemId(page, offnum);
		if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
			continue;

		/* Process this item or chain of items */
		ndeleted += heap_prune_chain(relation, buffer, offnum,
									 OldestXmin, &prstate);
	}

	/* Any error while applying the changes is critical */
	START_CRIT_SECTION();

	/* Have we found any prunable items? */
	if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
	{
		/*
		 * Apply the planned item changes, then repair page fragmentation, and
		 * update the page's hint bit about whether it has free line pointers.
		 */
		heap_page_prune_execute(buffer,
								prstate.redirected, prstate.nredirected,
								prstate.nowdead, prstate.ndead,
								prstate.nowunused, prstate.nunused);

		/*
		 * Update the page's pd_prune_xid field to either zero, or the lowest
		 * XID of any soon-prunable tuple.
		 */
		((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

		/*
		 * Also clear the "page is full" flag, since there's no point in
		 * repeating the prune/defrag process until something else happens to
		 * the page.
		 */
		PageClearFull(page);

		MarkBufferDirty(buffer);

		/*
		 * Emit a WAL HEAP_CLEAN record showing what we did
		 */
		if (RelationNeedsWAL(relation))
		{
			XLogRecPtr	recptr;

			recptr = log_heap_clean(relation, buffer,
									prstate.redirected, prstate.nredirected,
									prstate.nowdead, prstate.ndead,
									prstate.nowunused, prstate.nunused,
									prstate.latestRemovedXid);

			PageSetLSN(BufferGetPage(buffer), recptr);
			PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
		}
	}
	else
	{
		/*
		 * If we didn't prune anything, but have found a new value for the
		 * pd_prune_xid field, update it and mark the buffer dirty.  This is
		 * treated as a non-WAL-logged hint.
		 *
		 * Also clear the "page is full" flag if it is set, since there's no
		 * point in repeating the prune/defrag process until something else
		 * happens to the page.
		 */
		if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
			PageIsFull(page))
		{
			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
			PageClearFull(page);
			SetBufferCommitInfoNeedsSave(buffer);
		}
	}

	END_CRIT_SECTION();

	/*
	 * If requested, report the number of tuples reclaimed to pgstats. This is
	 * ndeleted minus ndead, because we don't want to count a now-DEAD root
	 * item as a deletion for this purpose.
	 */
	if (report_stats && ndeleted > prstate.ndead)
		pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);

	*latestRemovedXid = prstate.latestRemovedXid;

	/*
	 * XXX Should we update the FSM information of this page ?
	 *
	 * There are two schools of thought here.  We may not want to update FSM
	 * information so that the page is not used for unrelated UPDATEs/INSERTs
	 * and any free space in this page will remain available for further
	 * UPDATEs in *this* page, thus improving chances for doing HOT updates.
	 *
	 * But for a large table and where a page does not receive further UPDATEs
	 * for a long time, we might waste this space by not updating the FSM
	 * information.  The relation may get extended and fragmented further.
	 *
	 * One possibility is to leave "fillfactor" worth of space in this page
	 * and update FSM with the remaining space.
	 *
	 * In any case, the current FSM implementation doesn't accept
	 * one-page-at-a-time updates, so this is all academic for now.
	 */
	return ndeleted;
}
/* ---------------- * index_getnext - get the next heap tuple from a scan * * The result is the next heap tuple satisfying the scan keys and the * snapshot, or NULL if no more matching tuples exist. On success, * the buffer containing the heap tuple is pinned (the pin will be dropped * at the next index_getnext or index_endscan). The index TID corresponding * to the heap tuple can be obtained if needed from scan->currentItemData. * ---------------- */ HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction) { HeapTuple heapTuple = &scan->xs_ctup; SCAN_CHECKS; /* Release any previously held pin */ if (BufferIsValid(scan->xs_cbuf)) { ReleaseBuffer(scan->xs_cbuf); scan->xs_cbuf = InvalidBuffer; } /* * If we already got a tuple and it must be unique, there's no need to * make the index AM look through any additional tuples. (This can * save a useful amount of work in scenarios where there are many dead * tuples due to heavy update activity.) * * To do this we must keep track of the logical scan position * (before/on/after tuple). Also, we have to be sure to release scan * resources before returning NULL; if we fail to do so then a * multi-index scan can easily run the system out of free buffers. We * can release index-level resources fairly cheaply by calling * index_rescan. This means there are two persistent states as far as * the index AM is concerned: on-tuple and rescanned. If we are * actually asked to re-fetch the single tuple, we have to go through * a fresh indexscan startup, which penalizes that (infrequent) case. */ if (scan->keys_are_unique && scan->got_tuple) { int new_tuple_pos = scan->unique_tuple_pos; if (ScanDirectionIsForward(direction)) { if (new_tuple_pos <= 0) new_tuple_pos++; } else { if (new_tuple_pos >= 0) new_tuple_pos--; } if (new_tuple_pos == 0) { /* * We are moving onto the unique tuple from having been off * it. We just fall through and let the index AM do the work. * Note we should get the right answer regardless of scan * direction. */ scan->unique_tuple_pos = 0; /* need to update position */ } else { /* * Moving off the tuple; must do amrescan to release * index-level pins before we return NULL. Since index_rescan * will reset my state, must save and restore... */ int unique_tuple_mark = scan->unique_tuple_mark; index_rescan(scan, NULL /* no change to key */ ); scan->keys_are_unique = true; scan->got_tuple = true; scan->unique_tuple_pos = new_tuple_pos; scan->unique_tuple_mark = unique_tuple_mark; return NULL; } } /* just make sure this is false... */ scan->kill_prior_tuple = false; for (;;) { bool found; uint16 sv_infomask; pgstat_count_index_scan(&scan->xs_pgstat_info); /* * The AM's gettuple proc finds the next tuple matching the scan * keys. index_beginscan already set up fn_getnext. */ found = DatumGetBool(FunctionCall2(&scan->fn_getnext, PointerGetDatum(scan), Int32GetDatum(direction))); /* Reset kill flag immediately for safety */ scan->kill_prior_tuple = false; if (!found) return NULL; /* failure exit */ /* * Fetch the heap tuple and see if it matches the snapshot. */ if (heap_fetch(scan->heapRelation, scan->xs_snapshot, heapTuple, &scan->xs_cbuf, true, &scan->xs_pgstat_info)) break; /* Skip if no tuple at this location */ if (heapTuple->t_data == NULL) continue; /* should we raise an error instead? */ /* * If we can't see it, maybe no one else can either. Check to see * if the tuple is dead to all transactions. If so, signal the * index AM to not return it on future indexscans. 
* * We told heap_fetch to keep a pin on the buffer, so we can * re-access the tuple here. But we must re-lock the buffer * first. Also, it's just barely possible for an update of hint * bits to occur here. */ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); sv_infomask = heapTuple->t_data->t_infomask; if (HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin) == HEAPTUPLE_DEAD) scan->kill_prior_tuple = true; if (sv_infomask != heapTuple->t_data->t_infomask) SetBufferCommitInfoNeedsSave(scan->xs_cbuf); LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(scan->xs_cbuf); scan->xs_cbuf = InvalidBuffer; } /* Success exit */ scan->got_tuple = true; /* * If we just fetched a known-unique tuple, then subsequent calls will * go through the short-circuit code above. unique_tuple_pos has been * initialized to 0, which is the correct state ("on row"). */ pgstat_count_index_getnext(&scan->xs_pgstat_info); return heapTuple; }
/* * IndexBuildHeapScan - scan the heap relation to find tuples to be indexed * * This is called back from an access-method-specific index build procedure * after the AM has done whatever setup it needs. The parent heap relation * is scanned to find tuples that should be entered into the index. Each * such tuple is passed to the AM's callback routine, which does the right * things to add it to the new index. After we return, the AM's index * build procedure does whatever cleanup is needed; in particular, it should * close the heap and index relations. * * The total count of heap tuples is returned. This is for updating pg_class * statistics. (It's annoying not to be able to do that here, but we can't * do it until after the relation is closed.) Note that the index AM itself * must keep track of the number of index tuples; we don't do so here because * the AM might reject some of the tuples for its own reasons, such as being * unable to store NULLs. */ double IndexBuildHeapScan(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, IndexBuildCallback callback, void *callback_state) { HeapScanDesc scan; HeapTuple heapTuple; TupleDesc heapDescriptor; Datum attdata[INDEX_MAX_KEYS]; char nulls[INDEX_MAX_KEYS]; double reltuples; List *predicate; TupleTable tupleTable; TupleTableSlot *slot; EState *estate; ExprContext *econtext; Snapshot snapshot; TransactionId OldestXmin; /* * sanity checks */ Assert(OidIsValid(indexRelation->rd_rel->relam)); heapDescriptor = RelationGetDescr(heapRelation); /* * Need an EState for evaluation of index expressions and * partial-index predicates. */ estate = CreateExecutorState(); econtext = GetPerTupleExprContext(estate); /* * If this is a predicate (partial) index, we will need to evaluate * the predicate using ExecQual, which requires the current tuple to * be in a slot of a TupleTable. Likewise if there are any * expressions. */ if (indexInfo->ii_Predicate != NIL || indexInfo->ii_Expressions != NIL) { tupleTable = ExecCreateTupleTable(1); slot = ExecAllocTableSlot(tupleTable); ExecSetSlotDescriptor(slot, heapDescriptor, false); /* Arrange for econtext's scan tuple to be the tuple under test */ econtext->ecxt_scantuple = slot; /* Set up execution state for predicate. */ predicate = (List *) ExecPrepareExpr((Expr *) indexInfo->ii_Predicate, estate); } else { tupleTable = NULL; slot = NULL; predicate = NIL; } /* * Ok, begin our scan of the base relation. We use SnapshotAny * because we must retrieve all tuples and do our own time qual * checks. */ if (IsBootstrapProcessingMode()) { snapshot = SnapshotNow; OldestXmin = InvalidTransactionId; } else { snapshot = SnapshotAny; OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared); } scan = heap_beginscan(heapRelation, /* relation */ snapshot, /* seeself */ 0, /* number of keys */ (ScanKey) NULL); /* scan key */ reltuples = 0; /* * Scan all tuples in the base relation. */ while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) { bool tupleIsAlive; CHECK_FOR_INTERRUPTS(); if (snapshot == SnapshotAny) { /* do our own time qual check */ bool indexIt; uint16 sv_infomask; /* * HeapTupleSatisfiesVacuum may update tuple's hint status * bits. We could possibly get away with not locking the * buffer here, since caller should hold ShareLock on the * relation, but let's be conservative about it. 
*/ LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE); sv_infomask = heapTuple->t_data->t_infomask; switch (HeapTupleSatisfiesVacuum(heapTuple->t_data, OldestXmin)) { case HEAPTUPLE_DEAD: indexIt = false; tupleIsAlive = false; break; case HEAPTUPLE_LIVE: indexIt = true; tupleIsAlive = true; break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must index it * anyway to keep VACUUM from complaining. */ indexIt = true; tupleIsAlive = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* * Since caller should hold ShareLock or better, we * should not see any tuples inserted by open * transactions --- unless it's our own transaction. * (Consider INSERT followed by CREATE INDEX within a * transaction.) An exception occurs when reindexing * a system catalog, because we often release lock on * system catalogs before committing. */ if (!TransactionIdIsCurrentTransactionId( HeapTupleHeaderGetXmin(heapTuple->t_data)) && !IsSystemRelation(heapRelation)) elog(ERROR, "concurrent insert in progress"); indexIt = true; tupleIsAlive = true; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* * Since caller should hold ShareLock or better, we * should not see any tuples deleted by open * transactions --- unless it's our own transaction. * (Consider DELETE followed by CREATE INDEX within a * transaction.) An exception occurs when reindexing * a system catalog, because we often release lock on * system catalogs before committing. */ if (!TransactionIdIsCurrentTransactionId( HeapTupleHeaderGetXmax(heapTuple->t_data)) && !IsSystemRelation(heapRelation)) elog(ERROR, "concurrent delete in progress"); indexIt = true; tupleIsAlive = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); indexIt = tupleIsAlive = false; /* keep compiler quiet */ break; } /* check for hint-bit update by HeapTupleSatisfiesVacuum */ if (sv_infomask != heapTuple->t_data->t_infomask) SetBufferCommitInfoNeedsSave(scan->rs_cbuf); LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK); if (!indexIt) continue; } else { /* heap_getnext did the time qual check */ tupleIsAlive = true; } reltuples += 1; MemoryContextReset(econtext->ecxt_per_tuple_memory); /* Set up for predicate or expression evaluation */ if (slot) ExecStoreTuple(heapTuple, slot, InvalidBuffer, false); /* * In a partial index, discard tuples that don't satisfy the * predicate. We can also discard recently-dead tuples, since * VACUUM doesn't complain about tuple count mismatch for partial * indexes. */ if (predicate != NIL) { if (!tupleIsAlive) continue; if (!ExecQual(predicate, econtext, false)) continue; } /* * For the current heap tuple, extract all the attributes we use * in this index, and note which are null. This also performs * evaluation of any expressions needed. */ FormIndexDatum(indexInfo, heapTuple, heapDescriptor, estate, attdata, nulls); /* * You'd think we should go ahead and build the index tuple here, * but some index AMs want to do further processing on the data * first. So pass the attdata and nulls arrays, instead. */ /* Call the AM's callback routine to process the tuple */ callback(indexRelation, heapTuple, attdata, nulls, tupleIsAlive, callback_state); } heap_endscan(scan); if (tupleTable) ExecDropTupleTable(tupleTable, true); FreeExecutorState(estate); /* These may have been pointing to the now-gone estate */ indexInfo->ii_ExpressionsState = NIL; indexInfo->ii_PredicateState = NIL; return reltuples; }
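/*
 * [Editor's illustration, not PostgreSQL code] pgstattuple_real,
 * index_getnext and IndexBuildHeapScan above all use the same idiom:
 * snapshot t_infomask before a visibility check, and call
 * SetBufferCommitInfoNeedsSave only if the check set a hint bit as a side
 * effect.  The stand-alone sketch below models that comparison with
 * invented names (FakeTuple, tuple_is_visible, HINT_XMIN_COMMITTED); it is
 * an illustration of the pattern, not backend code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HINT_XMIN_COMMITTED 0x0100

typedef struct
{
	uint16_t	infomask;
	bool		xmin_known_committed;
} FakeTuple;

/* A visibility check that may set a hint bit as a side effect. */
static bool
tuple_is_visible(FakeTuple *tup)
{
	if (tup->xmin_known_committed)
		tup->infomask |= HINT_XMIN_COMMITTED;	/* hint-bit side effect */
	return tup->xmin_known_committed;
}

int
main(void)
{
	FakeTuple	tup = {0, true};
	uint16_t	sv_infomask = tup.infomask; /* snapshot before the check */
	bool		visible = tuple_is_visible(&tup);

	/* only report the page dirty if a hint bit actually changed */
	if (sv_infomask != tup.infomask)
		printf("hint bit set: buffer would be marked dirty (no WAL entry)\n");
	printf("visible=%d infomask=0x%04x\n", visible, (unsigned) tup.infomask);
	return 0;
}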