/*
 * SpGistNewBuffer -- obtain a page for new index data.
 *
 * A recyclable page advertised by the free space map is preferred; when
 * none of the advertised pages is usable, the index file is extended.
 *
 * The buffer handed back is pinned and exclusive-locked.  Initializing the
 * page (via SpGistInitBuffer) is the caller's job.
 */
Buffer
SpGistNewBuffer(Relation index)
{
    Buffer      newbuf;
    BlockNumber candidate;
    bool        extensionLock;

    /* Drain FSM candidates until one is usable or the FSM has nothing left */
    while ((candidate = GetFreeIndexPage(index)) != InvalidBlockNumber)
    {
        /*
         * Fixed pages are not expected to be listed in the FSM; if one is,
         * just skip it.
         */
        if (SpGistBlockIsFixed(candidate))
            continue;

        newbuf = ReadBuffer(index, candidate);

        /*
         * Somebody else may already have recycled this page, in which case
         * the buffer may be locked; only look at it if the lock is free.
         */
        if (ConditionalLockBuffer(newbuf))
        {
            Page        pg = BufferGetPage(newbuf);

            /*
             * Usable if never initialized, or if deleted or empty.  The
             * PageIsNew test is evaluated first, as in the original code.
             */
            if (PageIsNew(pg) || SpGistPageIsDeleted(pg) || PageIsEmpty(pg))
                return newbuf;

            LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
        }

        /* Not usable: drop the pin and ask the FSM again */
        ReleaseBuffer(newbuf);
    }

    /* Nothing recyclable, so the file must be extended */
    extensionLock = !RELATION_IS_LOCAL(index);

    if (extensionLock)
        LockRelationForExtension(index, ExclusiveLock);

    newbuf = ReadBuffer(index, P_NEW);
    LockBuffer(newbuf, BUFFER_LOCK_EXCLUSIVE);

    if (extensionLock)
        UnlockRelationForExtension(index, ExclusiveLock);

    return newbuf;
}
/*
 * _rtdump() -- debugging aid: print the contents of every page of an
 *              rtree index to stdout.
 *
 * For each block the page number, its max offset and whether it is a leaf
 * or internal page are printed, followed by one line per index tuple
 * (offset, tuple size, heap TID, and the key rendered through box_out).
 *
 * Block numbers are unsigned, so they are printed with %u; the tuple size
 * is cast to int to match its %d conversion.
 *
 * NOTE(review): pages are pinned but no buffer content lock is taken here;
 * presumably acceptable for a debug-only routine -- confirm before using
 * this on an index that is being modified concurrently.
 */
void
_rtdump(Relation r)
{
    Buffer      buf;
    Page        page;
    OffsetNumber offnum,
                maxoff;
    BlockNumber blkno;
    BlockNumber nblocks;
    RTreePageOpaque po;
    IndexTuple  itup;
    BlockNumber itblkno;
    OffsetNumber itoffno;
    Datum       datum;
    char       *itkey;

    nblocks = RelationGetNumberOfBlocks(r);
    for (blkno = 0; blkno < nblocks; blkno++)
    {
        buf = ReadBuffer(r, blkno);
        page = BufferGetPage(buf);
        po = (RTreePageOpaque) PageGetSpecialPointer(page);
        maxoff = PageGetMaxOffsetNumber(page);
        printf("Page %u maxoff %d <%s>\n", (unsigned int) blkno, (int) maxoff,
               (po->flags & F_LEAF ? "LEAF" : "INTERNAL"));

        /* Nothing more to print for an empty page; just drop the pin */
        if (PageIsEmpty(page))
        {
            ReleaseBuffer(buf);
            continue;
        }

        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
            itblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
            itoffno = ItemPointerGetOffsetNumber(&(itup->t_tid));
            datum = IndexTupleGetDatum(itup);
            itkey = DatumGetCString(DirectFunctionCall1(box_out, datum));
            printf("\t[%d] size %d heap <%u,%d> key:%s\n",
                   (int) offnum, (int) IndexTupleSize(itup),
                   (unsigned int) itblkno, (int) itoffno, itkey);
            /* box_out's result string is freed once printed */
            pfree(itkey);
        }

        ReleaseBuffer(buf);
    }
}
/*
 * Shift every ItemId on the page down by one slot, so that the entry at
 * P_FIRSTKEY lands on (and overwrites) P_HIKEY, and shrink the line pointer
 * array by one entry.  This is needed when an ItemId array was laid out for
 * a page that turned out to be P_RIGHTMOST.
 *
 * An empty page is left untouched.
 */
static void
_bt_slideleft(Page page)
{
    OffsetNumber src;
    OffsetNumber last;
    ItemId      dest;

    if (PageIsEmpty(page))
        return;

    last = PageGetMaxOffsetNumber(page);
    dest = PageGetItemId(page, P_HIKEY);
    src = P_FIRSTKEY;

    while (src <= last)
    {
        ItemId      from = PageGetItemId(page, src);

        /* copy this slot into its predecessor, then advance both */
        *dest = *from;
        dest = from;
        src = OffsetNumberNext(src);
    }

    /* one line pointer fewer is now in use */
    ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
}
/*
 * Add the 'len' index tuples in 'itup' to 'page' at consecutive offsets
 * starting from 'off'.  Passing InvalidOffsetNumber for 'off' means "append
 * after the current last item" (or at FirstOffsetNumber on an empty page).
 *
 * No free-space check is made here; a failed PageAddItem raises an ERROR.
 */
void
gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off)
{
    int         idx;

    if (off == InvalidOffsetNumber)
    {
        if (PageIsEmpty(page))
            off = FirstOffsetNumber;
        else
            off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
    }

    for (idx = 0; idx < len; idx++, off++)
    {
        IndexTuple  tuple = itup[idx];
        Size        tupsz = IndexTupleSize(tuple);
        OffsetNumber placed;

        placed = PageAddItem(page, (Item) tuple, tupsz, off, false, false);
        if (placed == InvalidOffsetNumber)
            elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes",
                 idx, len, (int) tupsz);
    }
}
/*
 * Add the 'len' index tuples in 'itup' to 'page' at consecutive offsets
 * starting from 'off'.  Passing InvalidOffsetNumber for 'off' means "append
 * after the current last item" (or at FirstOffsetNumber on an empty page).
 *
 * No free-space check is made here; a failed PageAddItem raises an ERROR
 * naming relation 'r'.
 *
 * Returns the offset at which the last tuple was placed, or
 * InvalidOffsetNumber when no tuple was added.
 */
OffsetNumber
gistfillbuffer(Relation r, Page page, IndexTuple *itup,
               int len, OffsetNumber off)
{
    OffsetNumber lastPlaced = InvalidOffsetNumber;
    int         idx = 0;

    if (off == InvalidOffsetNumber)
    {
        if (PageIsEmpty(page))
            off = FirstOffsetNumber;
        else
            off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
    }

    while (idx < len)
    {
        lastPlaced = PageAddItem(page, (Item) itup[idx],
                                 IndexTupleSize(itup[idx]), off, LP_USED);
        if (lastPlaced == InvalidOffsetNumber)
            elog(ERROR, "failed to add item to index page in \"%s\"",
                 RelationGetRelationName(r));

        idx++;
        off++;
    }

    return lastPlaced;
}
/*
 *  lazy_check_needs_freeze() -- look through a page for any tuple that
 *                               must be frozen to avoid wraparound
 *
 * Returns true as soon as one normal item's tuple is reported by
 * heap_tuple_needs_freeze() against FreezeLimit, meaning the page has to be
 * vacuumed under a cleanup lock; false otherwise (including for new or
 * empty pages).
 */
static bool
lazy_check_needs_freeze(Buffer buf)
{
    Page        page = BufferGetPage(buf);
    OffsetNumber cur;
    OffsetNumber lastoff;

    /* PageIsNew probably shouldn't happen, but neither case has tuples */
    if (PageIsNew(page) || PageIsEmpty(page))
        return false;

    lastoff = PageGetMaxOffsetNumber(page);

    for (cur = FirstOffsetNumber; cur <= lastoff; cur = OffsetNumberNext(cur))
    {
        ItemId      lp = PageGetItemId(page, cur);

        /* only normal line pointers are examined */
        if (ItemIdIsNormal(lp) &&
            heap_tuple_needs_freeze((HeapTupleHeader) PageGetItem(page, lp),
                                    FreezeLimit, buf))
            return true;
    }

    return false;
}
/*
 *  _bt_step() -- Step one item in the requested direction in a scan on
 *                the tree.
 *
 * *bufP is the current buffer (read-locked and pinned).  If we change
 * pages, it's updated appropriately.
 *
 * The starting position is taken from scan->currentItemData.
 *
 * If successful, update scan's currentItemData (and so->btso_curbuf) to the
 * new position and return true.
 * If no adjacent record exists in the requested direction,
 * release buffer pin/locks, invalidate currentItemData, set *bufP and
 * so->btso_curbuf to InvalidBuffer, and return false.
 */
bool
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
    Relation    rel = scan->indexRelation;
    ItemPointer current = &(scan->currentItemData);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber offnum,
                maxoff;
    BlockNumber blkno;

    /*
     * Don't use ItemPointerGetOffsetNumber or you risk getting an assertion
     * failure, because ip_posid is allowed to be 0 here.
     */
    offnum = current->ip_posid;

    page = BufferGetPage(*bufP);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    if (ScanDirectionIsForward(dir))
    {
        /* Easy case: another item exists to the right on this page */
        if (!PageIsEmpty(page) && offnum < maxoff)
            offnum = OffsetNumberNext(offnum);
        else
        {
            /* Walk right to the next page with data */
            for (;;)
            {
                /* if we're at end of scan, release the buffer and return */
                if (P_RIGHTMOST(opaque))
                {
                    _bt_relbuf(rel, *bufP);
                    ItemPointerSetInvalid(current);
                    *bufP = so->btso_curbuf = InvalidBuffer;
                    return false;
                }
                /*
                 * step right one page: remember the right-link, release the
                 * current buffer, then read-lock the sibling
                 */
                blkno = opaque->btpo_next;
                _bt_relbuf(rel, *bufP);
                *bufP = _bt_getbuf(rel, blkno, BT_READ);
                page = BufferGetPage(*bufP);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                /* pages flagged by P_IGNORE are skipped; keep walking */
                if (!P_IGNORE(opaque))
                {
                    maxoff = PageGetMaxOffsetNumber(page);
                    /* done if it's not empty */
                    offnum = P_FIRSTDATAKEY(opaque);
                    if (!PageIsEmpty(page) && offnum <= maxoff)
                        break;
                }
            }
        }
    }
    else    /* backwards scan */
    {
        /* Easy case: another data item exists to the left on this page */
        if (offnum > P_FIRSTDATAKEY(opaque))
            offnum = OffsetNumberPrev(offnum);
        else
        {
            /*
             * Walk left to the next page with data.  This is much more
             * complex than the walk-right case because of the possibility
             * that the page to our left splits while we are in flight to
             * it, plus the possibility that the page we were on gets
             * deleted after we leave it.  See nbtree/README for details.
             */
            for (;;)
            {
                *bufP = _bt_walk_left(rel, *bufP);

                /*
                 * if we're at end of scan, return failure.  No buffer is
                 * released here; presumably _bt_walk_left already dropped
                 * the old one when it returns InvalidBuffer -- confirm
                 * against _bt_walk_left.
                 */
                if (*bufP == InvalidBuffer)
                {
                    ItemPointerSetInvalid(current);
                    so->btso_curbuf = InvalidBuffer;
                    return false;
                }
                page = BufferGetPage(*bufP);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                /*
                 * Okay, we managed to move left to a non-deleted page.
                 * Done if it's not half-dead and not empty.  Else loop
                 * back and do it all again.
                 */
                if (!P_IGNORE(opaque))
                {
                    maxoff = PageGetMaxOffsetNumber(page);
                    /* position on the last item of the page */
                    offnum = maxoff;
                    if (!PageIsEmpty(page) &&
                        maxoff >= P_FIRSTDATAKEY(opaque))
                        break;
                }
            }
        }
    }

    /* Update scan state */
    so->btso_curbuf = *bufP;
    blkno = BufferGetBlockNumber(*bufP);
    ItemPointerSet(current, blkno, offnum);

    return true;
}
/*
 * Replay an update of the GIN metapage, together with whichever change to
 * the pending list accompanied it: either tuples appended to the tail page
 * (ntuples > 0), or a new right-link set on the previous tail page
 * (prevTail valid).  Block reference 0 is the metapage, block reference 1
 * the affected list page.
 */
static void
ginRedoUpdateMetapage(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    ginxlogUpdateMeta *xlrec = (ginxlogUpdateMeta *) XLogRecGetData(record);
    Buffer      metabuf;
    Page        metapg;
    Buffer      listbuf;

    /*
     * The metapage is rewritten unconditionally, without consulting its LSN:
     * the record effectively carries a full image of it, and this avoids
     * torn-page hazards.
     */
    metabuf = XLogInitBufferForRedo(record, 0);
    Assert(BufferGetBlockNumber(metabuf) == GIN_METAPAGE_BLKNO);
    metapg = BufferGetPage(metabuf);

    memcpy(GinPageGetMeta(metapg), &xlrec->metadata, sizeof(GinMetaPageData));
    PageSetLSN(metapg, lsn);
    MarkBufferDirty(metabuf);

    if (xlrec->ntuples > 0)
    {
        /* Tuples were appended to the tail page */
        if (XLogReadBufferForRedo(record, 1, &listbuf) == BLK_NEEDS_REDO)
        {
            Page        tailpg = BufferGetPage(listbuf);
            Size        payloadLen;
            char       *payload;
            IndexTuple  itup;
            OffsetNumber off;
            int         n;

            payload = XLogRecGetBlockData(record, 1, &payloadLen);
            itup = (IndexTuple) payload;

            off = PageIsEmpty(tailpg)
                ? FirstOffsetNumber
                : OffsetNumberNext(PageGetMaxOffsetNumber(tailpg));

            for (n = 0; n < xlrec->ntuples; n++)
            {
                Size        itemsz = IndexTupleSize(itup);

                if (PageAddItem(tailpg, (Item) itup, itemsz, off,
                                false, false) == InvalidOffsetNumber)
                    elog(ERROR, "failed to add item to index page");

                /* tuples are packed back to back in the block data */
                itup = (IndexTuple) (((char *) itup) + itemsz);
                off++;
            }
            Assert(payload + payloadLen == (char *) itup);

            /* Bump the page's heap-tuple counter */
            GinPageGetOpaque(tailpg)->maxoff++;

            PageSetLSN(tailpg, lsn);
            MarkBufferDirty(listbuf);
        }
        if (BufferIsValid(listbuf))
            UnlockReleaseBuffer(listbuf);
    }
    else if (xlrec->prevTail != InvalidBlockNumber)
    {
        /* A sublist was chained on: fix the old tail's right-link */
        if (XLogReadBufferForRedo(record, 1, &listbuf) == BLK_NEEDS_REDO)
        {
            Page        prevpg = BufferGetPage(listbuf);

            GinPageGetOpaque(prevpg)->rightlink = xlrec->newRightlink;

            PageSetLSN(prevpg, lsn);
            MarkBufferDirty(listbuf);
        }
        if (BufferIsValid(listbuf))
            UnlockReleaseBuffer(listbuf);
    }

    UnlockReleaseBuffer(metabuf);
}
/*
 * This function takes an already open relation and scans its pages,
 * skipping those that have the corresponding visibility map bit set.
 * For pages we skip, we find the free space from the free space map
 * and approximate tuple_len on that basis. For the others, we count
 * the exact number of dead tuples etc.
 *
 * This scan is loosely based on vacuumlazy.c:lazy_scan_heap(), but
 * we do not try to avoid skipping single pages.
 *
 * Results are accumulated into *stat: the length and count fields are added
 * to, while table_len, tuple_count and the percentage fields are assigned
 * at the end.  The percentage fields are only assigned when the relation
 * has at least one block.
 */
static void
statapprox_heap(Relation rel, output_type *stat)
{
    BlockNumber scanned,
                nblocks,
                blkno;
    Buffer      vmbuffer = InvalidBuffer;
    BufferAccessStrategy bstrategy;
    TransactionId OldestXmin;
    uint64      misc_count = 0;

    OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM);
    bstrategy = GetAccessStrategy(BAS_BULKREAD);

    nblocks = RelationGetNumberOfBlocks(rel);
    scanned = 0;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf;
        Page        page;
        OffsetNumber offnum,
                    maxoff;
        Size        freespace;

        CHECK_FOR_INTERRUPTS();

        /*
         * If the page has only visible tuples, then we can find out the free
         * space from the FSM and move on.
         */
        if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
        {
            freespace = GetRecordedFreeSpace(rel, blkno);
            stat->tuple_len += BLCKSZ - freespace;
            stat->free_space += freespace;
            continue;
        }

        buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                 RBM_NORMAL, bstrategy);

        LockBuffer(buf, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buf);

        /*
         * It's not safe to call PageGetHeapFreeSpace() on new pages, so we
         * treat them as being free space for our purposes.
         */
        if (!PageIsNew(page))
            stat->free_space += PageGetHeapFreeSpace(page);
        else
            stat->free_space += BLCKSZ - SizeOfPageHeaderData;

        /* New and empty pages hold no tuples and don't count as scanned */
        if (PageIsNew(page) || PageIsEmpty(page))
        {
            UnlockReleaseBuffer(buf);
            continue;
        }

        scanned++;

        /*
         * Look at each tuple on the page and decide whether it's live or
         * dead, then count it and its size. Unlike lazy_scan_heap, we can
         * afford to ignore problems and special cases.
         */
        maxoff = PageGetMaxOffsetNumber(page);

        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid;
            HeapTupleData tuple;

            itemid = PageGetItemId(page, offnum);

            if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) ||
                ItemIdIsDead(itemid))
            {
                continue;
            }

            Assert(ItemIdIsNormal(itemid));

            ItemPointerSet(&(tuple.t_self), blkno, offnum);

            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple.t_len = ItemIdGetLength(itemid);
            tuple.t_tableOid = RelationGetRelid(rel);

            /*
             * We count live and dead tuples, but we also need to add up
             * others in order to feed vac_estimate_reltuples.
             */
            switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
            {
                case HEAPTUPLE_RECENTLY_DEAD:
                    misc_count++;
                    /* Fall through */
                case HEAPTUPLE_DEAD:
                    stat->dead_tuple_len += tuple.t_len;
                    stat->dead_tuple_count++;
                    break;
                case HEAPTUPLE_LIVE:
                    stat->tuple_len += tuple.t_len;
                    stat->tuple_count++;
                    break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:
                case HEAPTUPLE_DELETE_IN_PROGRESS:
                    misc_count++;
                    break;
                default:
                    elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                    break;
            }
        }

        UnlockReleaseBuffer(buf);
    }

    stat->table_len = (uint64) nblocks * BLCKSZ;

    stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned,
                                               stat->tuple_count + misc_count);

    /*
     * Calculate percentages if the relation has one or more pages.
     *
     * All four are computed in floating point.  In particular the scanned
     * percentage must not be computed as "100 * scanned / nblocks" in
     * BlockNumber (unsigned integer) arithmetic: the multiplication can wrap
     * around for a sufficiently large relation, and the integer division
     * would discard the fractional part that the other percentages keep.
     */
    if (nblocks != 0)
    {
        stat->scanned_percent = 100.0 * scanned / nblocks;
        stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
        stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
        stat->free_percent = 100.0 * stat->free_space / stat->table_len;
    }

    /* Drop the visibility map pin, if VM_ALL_VISIBLE left us holding one */
    if (BufferIsValid(vmbuffer))
    {
        ReleaseBuffer(vmbuffer);
        vmbuffer = InvalidBuffer;
    }
}
/*
 * Replay an update of the GIN metapage, together with whichever change to
 * the pending list accompanied it: either tuples appended to the tail page
 * (data->ntuples > 0), or a new right-link set on the previous tail page
 * (data->prevTail valid).
 *
 * lsn    - LSN to stamp on every page modified here
 * record - the WAL record; its main data starts with a ginxlogUpdateMeta,
 *          followed (in the ntuples > 0 case) by the appended index tuples
 *
 * If the metapage cannot be read, nothing at all is replayed.
 */
static void
ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record)
{
    ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record);
    Buffer      metabuffer;
    Page        metapage;
    Buffer      buffer;

    /*
     * Restore the metapage. This is essentially the same as a full-page
     * image, so restore the metapage unconditionally without looking at the
     * LSN, to avoid torn page hazards.
     */
    metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false);
    if (!BufferIsValid(metabuffer))
        return;                 /* assume index was deleted, nothing to do */
    metapage = BufferGetPage(metabuffer);

    memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData));
    PageSetLSN(metapage, lsn);
    MarkBufferDirty(metabuffer);

    if (data->ntuples > 0)
    {
        /*
         * insert into tail page
         *
         * If the record carries a backup block for it, restoring that image
         * is all that is needed; otherwise redo the insertion by hand.
         */
        if (record->xl_info & XLR_BKP_BLOCK(0))
            (void) RestoreBackupBlock(lsn, record, 0, false, false);
        else
        {
            buffer = XLogReadBuffer(data->node, data->metadata.tail, false);
            if (BufferIsValid(buffer))
            {
                Page        page = BufferGetPage(buffer);

                /* apply only if the page doesn't already reflect this record */
                if (lsn > PageGetLSN(page))
                {
                    /* append after the last existing item, if any */
                    OffsetNumber l,
                                off = (PageIsEmpty(page)) ? FirstOffsetNumber :
                    OffsetNumberNext(PageGetMaxOffsetNumber(page));
                    int         i,
                                tupsize;
                    /* the tuples follow the fixed-size header in the record */
                    IndexTuple  tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta));

                    for (i = 0; i < data->ntuples; i++)
                    {
                        tupsize = IndexTupleSize(tuples);

                        l = PageAddItem(page, (Item) tuples, tupsize, off, false, false);

                        if (l == InvalidOffsetNumber)
                            elog(ERROR, "failed to add item to index page");

                        /* step to the next tuple, stored immediately after */
                        tuples = (IndexTuple) (((char *) tuples) + tupsize);

                        off++;
                    }

                    /*
                     * Increase counter of heap tuples
                     */
                    GinPageGetOpaque(page)->maxoff++;

                    PageSetLSN(page, lsn);
                    MarkBufferDirty(buffer);
                }
                UnlockReleaseBuffer(buffer);
            }
        }
    }
    else if (data->prevTail != InvalidBlockNumber)
    {
        /*
         * New tail: point the previous tail page's right-link at the new
         * sublist, again preferring a backup block image if one is present.
         */
        if (record->xl_info & XLR_BKP_BLOCK(0))
            (void) RestoreBackupBlock(lsn, record, 0, false, false);
        else
        {
            buffer = XLogReadBuffer(data->node, data->prevTail, false);
            if (BufferIsValid(buffer))
            {
                Page        page = BufferGetPage(buffer);

                /* apply only if the page doesn't already reflect this record */
                if (lsn > PageGetLSN(page))
                {
                    GinPageGetOpaque(page)->rightlink = data->newRightlink;

                    PageSetLSN(page, lsn);
                    MarkBufferDirty(buffer);
                }
                UnlockReleaseBuffer(buffer);
            }
        }
    }

    UnlockReleaseBuffer(metabuffer);
}
/*
 * Write the index tuples contained in *collector into the index's
 * pending list.
 *
 * Function guarantees that all these tuples will be inserted consecutively,
 * preserving order
 *
 * Three cases are handled:
 *   1. the tuples fit in the free space of the current tail page: they are
 *      added to that page directly;
 *   2. they don't fit (or the pending list is empty): a separate sublist is
 *      built with makeSublist() and then either becomes the main list, or
 *   3. is linked after the current tail page.
 *
 * The metapage is updated in every case, and a single
 * XLOG_GIN_UPDATE_META_PAGE record is emitted when the relation needs WAL.
 * Afterwards ginInsertCleanup() is invoked if the pending list has grown
 * past the work_mem-based threshold.
 *
 * Does nothing if the collector holds no tuples.
 */
void
ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
{
    Relation    index = ginstate->index;
    Buffer      metabuffer;
    Page        metapage;
    GinMetaPageData *metadata = NULL;
    XLogRecData rdata[2];
    Buffer      buffer = InvalidBuffer;
    Page        page = NULL;
    ginxlogUpdateMeta data;
    bool        separateList = false;
    bool        needCleanup = false;

    if (collector->ntuples == 0)
        return;

    /* Fixed part of the WAL record; the variable fields are filled in below */
    data.node = index->rd_node;
    data.ntuples = 0;
    data.newRightlink = data.prevTail = InvalidBlockNumber;

    rdata[0].buffer = InvalidBuffer;
    rdata[0].data = (char *) &data;
    rdata[0].len = sizeof(ginxlogUpdateMeta);
    rdata[0].next = NULL;

    metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
    metapage = BufferGetPage(metabuffer);

    /* space needed is the tuple bytes plus one line pointer per tuple */
    if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize)
    {
        /*
         * Total size is greater than one page => make sublist
         *
         * (The metapage has not been locked on this path.)
         */
        separateList = true;
    }
    else
    {
        LockBuffer(metabuffer, GIN_EXCLUSIVE);
        metadata = GinPageGetMeta(metapage);

        if (metadata->head == InvalidBlockNumber ||
            collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize)
        {
            /*
             * Pending list is empty or total size is greater than freespace
             * on tail page => make sublist
             *
             * We unlock metabuffer to keep high concurrency
             */
            separateList = true;
            LockBuffer(metabuffer, GIN_UNLOCK);
        }
    }

    if (separateList)
    {
        /*
         * We should make sublist separately and append it to the tail
         */
        GinMetaPageData sublist;

        memset(&sublist, 0, sizeof(GinMetaPageData));
        makeSublist(index, collector->tuples, collector->ntuples, &sublist);

        /*
         * metapage was unlocked, see above
         *
         * The metadata is re-read after relocking, so the head/tail tests
         * below see the current state of the list.
         */
        LockBuffer(metabuffer, GIN_EXCLUSIVE);
        metadata = GinPageGetMeta(metapage);

        if (metadata->head == InvalidBlockNumber)
        {
            /*
             * Main list is empty, so just insert sublist as main list
             *
             * No list page is touched here, so 'buffer' stays InvalidBuffer
             * and the WAL record consists of rdata[0] only.
             */
            START_CRIT_SECTION();

            metadata->head = sublist.head;
            metadata->tail = sublist.tail;
            metadata->tailFreeSize = sublist.tailFreeSize;

            metadata->nPendingPages = sublist.nPendingPages;
            metadata->nPendingHeapTuples = sublist.nPendingHeapTuples;
        }
        else
        {
            /*
             * Merge lists
             */
            data.prevTail = metadata->tail;
            data.newRightlink = sublist.head;

            buffer = ReadBuffer(index, metadata->tail);
            LockBuffer(buffer, GIN_EXCLUSIVE);
            page = BufferGetPage(buffer);

            /* register the old tail page with the WAL record, without data */
            rdata[0].next = rdata + 1;

            rdata[1].buffer = buffer;
            rdata[1].buffer_std = true;
            rdata[1].data = NULL;
            rdata[1].len = 0;
            rdata[1].next = NULL;

            Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber);

            START_CRIT_SECTION();

            GinPageGetOpaque(page)->rightlink = sublist.head;

            MarkBufferDirty(buffer);

            metadata->tail = sublist.tail;
            metadata->tailFreeSize = sublist.tailFreeSize;

            metadata->nPendingPages += sublist.nPendingPages;
            metadata->nPendingHeapTuples += sublist.nPendingHeapTuples;
        }
    }
    else
    {
        /*
         * Insert into tail page.  Metapage is already locked
         */
        OffsetNumber l,
                    off;
        int         i,
                    tupsize;
        char       *ptr;

        buffer = ReadBuffer(index, metadata->tail);
        LockBuffer(buffer, GIN_EXCLUSIVE);
        page = BufferGetPage(buffer);

        /* append after the last existing item, if any */
        off = (PageIsEmpty(page)) ? FirstOffsetNumber :
            OffsetNumberNext(PageGetMaxOffsetNumber(page));

        /*
         * The tuples are also copied into a palloc'd buffer that becomes the
         * data of rdata[1].  The allocation happens before the critical
         * section starts.  NOTE(review): no pfree of this buffer is visible
         * in this function; presumably memory-context cleanup handles it.
         */
        rdata[0].next = rdata + 1;

        rdata[1].buffer = buffer;
        rdata[1].buffer_std = true;
        ptr = rdata[1].data = (char *) palloc(collector->sumsize);
        rdata[1].len = collector->sumsize;
        rdata[1].next = NULL;

        data.ntuples = collector->ntuples;

        START_CRIT_SECTION();

        /*
         * Increase counter of heap tuples
         */
        Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples);
        GinPageGetOpaque(page)->maxoff++;
        metadata->nPendingHeapTuples++;

        for (i = 0; i < collector->ntuples; i++)
        {
            tupsize = IndexTupleSize(collector->tuples[i]);
            l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false);

            /*
             * NOTE(review): this elog(ERROR) is raised after
             * START_CRIT_SECTION(); confirm that escalation inside a
             * critical section is the intended outcome.
             */
            if (l == InvalidOffsetNumber)
                elog(ERROR, "failed to add item to index page in \"%s\"",
                     RelationGetRelationName(index));

            memcpy(ptr, collector->tuples[i], tupsize);
            ptr += tupsize;

            off++;
        }

        Assert((ptr - rdata[1].data) <= collector->sumsize);

        metadata->tailFreeSize = PageGetExactFreeSpace(page);

        MarkBufferDirty(buffer);
    }

    /*
     * Write metabuffer, make xlog entry
     *
     * Every path above has entered a critical section by this point.
     */
    MarkBufferDirty(metabuffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;

        /* the record carries a copy of the final metapage contents */
        memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));

        recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, rdata);
        PageSetLSN(metapage, recptr);
        PageSetTLI(metapage, ThisTimeLineID);

        if (buffer != InvalidBuffer)
        {
            PageSetLSN(page, recptr);
            PageSetTLI(page, ThisTimeLineID);
        }
    }

    if (buffer != InvalidBuffer)
        UnlockReleaseBuffer(buffer);

    /*
     * Force pending list cleanup when it becomes too long. And,
     * ginInsertCleanup could take significant amount of time, so we prefer to
     * call it when it can do all the work in a single collection cycle. In
     * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it
     * while pending list is still small enough to fit into work_mem.
     *
     * ginInsertCleanup() should not be called inside our CRIT_SECTION.
     *
     * The decision is taken here, while the metapage is still locked, and
     * acted upon only after the lock is dropped and the critical section
     * has ended.
     */
    if (metadata->nPendingPages * GIN_PAGE_FREESIZE > work_mem * 1024L)
        needCleanup = true;

    UnlockReleaseBuffer(metabuffer);

    END_CRIT_SECTION();

    if (needCleanup)
        ginInsertCleanup(ginstate, false, NULL);
}
/*
 * Replay a GiST page update (anything except a page split).
 *
 * Block reference 0 is the page being changed.  Its block data consists of
 * xldata->ntodelete offset numbers to delete, followed by the index tuples
 * to append.  If block reference 1 is present, the follow-right flag of that
 * (left child) page is cleared as well.
 */
static void
gistRedoPageUpdateRecord(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    gistxlogPageUpdate *xlrec = (gistxlogPageUpdate *) XLogRecGetData(record);
    Buffer      buffer;

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        Size        payloadLen;
        char       *payload = XLogRecGetBlockData(record, 0, &payloadLen);
        char       *cursor = payload;
        Page        page = (Page) BufferGetPage(buffer);
        int         nadded = 0;

        /* First remove the tuples listed at the front of the payload */
        if (xlrec->ntodelete > 0)
        {
            OffsetNumber *victims = (OffsetNumber *) cursor;
            int         k;

            cursor += sizeof(OffsetNumber) * xlrec->ntodelete;

            for (k = 0; k < xlrec->ntodelete; k++)
                PageIndexTupleDelete(page, victims[k]);

            if (GistPageIsLeaf(page))
                GistMarkTuplesDeleted(page);
        }

        /* Whatever remains in the payload is tuples to append */
        if (cursor - payload < payloadLen)
        {
            OffsetNumber off;

            if (PageIsEmpty(page))
                off = FirstOffsetNumber;
            else
                off = OffsetNumberNext(PageGetMaxOffsetNumber(page));

            for (; cursor - payload < payloadLen; off++, nadded++)
            {
                IndexTuple  itup = (IndexTuple) cursor;
                Size        itemsz = IndexTupleSize(itup);

                cursor += itemsz;

                if (PageAddItem(page, (Item) itup, itemsz, off,
                                false, false) == InvalidOffsetNumber)
                    elog(ERROR, "failed to add item to GiST index page, size %d bytes",
                         (int) itemsz);
            }
        }

        Assert(nadded == xlrec->ntoinsert);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }

    /*
     * Clear follow-right on the left child page.
     *
     * This has to happen while the target page's lock is still held.  It is
     * attempted even when the target page itself needed no redo.
     */
    if (XLogRecHasBlockRef(record, 1))
        gistRedoClearFollowRight(record, 1);

    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}
/*
 * Vacuum a single index page as part of a bulkdelete scan.
 *
 * The page is read with the caller's buffer access strategy and processed
 * under an exclusive lock.  Non-root pages that are (or become) deleted are
 * reported to the FSM and counted; other non-root pages update
 * bds->lastFilledBlock.
 */
static void
spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
{
    Relation    index = bds->info->index;
    Buffer      buf;
    Page        pg;

    /* the delay point must be reached while no buffer lock is held */
    vacuum_delay_point();

    buf = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
                             RBM_NORMAL, bds->info->strategy);
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    pg = (Page) BufferGetPage(buf);

    if (PageIsNew(pg))
    {
        /*
         * An all-zero page; this can be left behind by a crash right after
         * the file was extended.  Initialize it and make it recyclable.
         */
        SpGistInitBuffer(buf, 0);
        SpGistPageSetDeleted(pg);
        /* Not WAL-logged: trivially redone */
        MarkBufferDirty(buf);
    }
    else if (!SpGistPageIsDeleted(pg))
    {
        /* already-deleted pages need no work; everything else does */
        if (!SpGistPageIsLeaf(pg))
        {
            /* inner page */
            vacuumRedirectAndPlaceholder(index, buf);
        }
        else if (SpGistBlockIsRoot(blkno))
        {
            /* leaf root: vacuumRedirectAndPlaceholder is not needed */
            vacuumLeafRoot(bds, index, buf);
        }
        else
        {
            vacuumLeafPage(bds, index, buf, false);
            vacuumRedirectAndPlaceholder(index, buf);
        }
    }

    /*
     * Root pages are never deleted and never advertised in the FSM, since
     * they must not be handed out as a place for a new tuple.  For any other
     * page, detect emptiness and keep the FSM informed.
     */
    if (!SpGistBlockIsRoot(blkno))
    {
        /* A page that has become empty gets marked deleted */
        if (!SpGistPageIsDeleted(pg) && PageIsEmpty(pg))
        {
            SpGistPageSetDeleted(pg);
            /* Not WAL-logged: trivially redone */
            MarkBufferDirty(buf);
        }

        if (!SpGistPageIsDeleted(pg))
            bds->lastFilledBlock = blkno;
        else
        {
            RecordFreeIndexPage(index, blkno);
            bds->stats->pages_deleted++;
        }
    }

    SpGistSetLastUsedPage(index, buf);

    UnlockReleaseBuffer(buf);
}
/*
 * redo any page update (except page split)
 *
 * Block reference 0 is the page being changed.  Its block data holds
 * xldata->ntodelete offset numbers followed by the new index tuples.  When
 * exactly one tuple is deleted and one inserted, the old tuple is
 * overwritten in place; otherwise the listed tuples are deleted and the new
 * ones appended at the end of the page.  If block reference 1 is present,
 * the follow-right flag on that (left child) page is cleared too.
 */
static void
gistRedoPageUpdateRecord(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record);
    Buffer      buffer;
    Page        page;

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        char       *begin;
        char       *data;       /* read cursor within the block data */
        Size        datalen;
        int         ninserted = 0;

        data = begin = XLogRecGetBlockData(record, 0, &datalen);

        page = (Page) BufferGetPage(buffer);

        if (xldata->ntodelete == 1 && xldata->ntoinsert == 1)
        {
            /*
             * When replacing one tuple with one other tuple, we must use
             * PageIndexTupleOverwrite for consistency with gistplacetopage.
             */
            OffsetNumber offnum = *((OffsetNumber *) data);
            IndexTuple  itup;
            Size        itupsize;

            /* the replacement tuple follows the single offset number */
            data += sizeof(OffsetNumber);
            itup = (IndexTuple) data;
            itupsize = IndexTupleSize(itup);
            if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itupsize))
                elog(ERROR, "failed to add item to GiST index page, size %d bytes",
                     (int) itupsize);
            data += itupsize;
            /* should be nothing left after consuming 1 tuple */
            Assert(data - begin == datalen);
            /* update insertion count for assert check below */
            ninserted++;
        }
        else if (xldata->ntodelete > 0)
        {
            /* Otherwise, delete old tuples if any */
            OffsetNumber *todelete = (OffsetNumber *) data;

            /* skip past the offset array; tuples to add, if any, follow */
            data += sizeof(OffsetNumber) * xldata->ntodelete;

            PageIndexMultiDelete(page, todelete, xldata->ntodelete);
            if (GistPageIsLeaf(page))
                GistMarkTuplesDeleted(page);
        }

        /*
         * Add new tuples if any.  (After the overwrite case above the
         * cursor is already at the end of the data, so this is skipped.)
         */
        if (data - begin < datalen)
        {
            /* append after the last existing item, if any */
            OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber :
            OffsetNumberNext(PageGetMaxOffsetNumber(page));

            while (data - begin < datalen)
            {
                IndexTuple  itup = (IndexTuple) data;
                Size        sz = IndexTupleSize(itup);
                OffsetNumber l;

                /* advance the cursor before adding; itup still points at it */
                data += sz;

                l = PageAddItem(page, (Item) itup, sz, off, false, false);
                if (l == InvalidOffsetNumber)
                    elog(ERROR, "failed to add item to GiST index page, size %d bytes",
                         (int) sz);
                off++;
                ninserted++;
            }
        }

        /* Check that XLOG record contained expected number of tuples */
        Assert(ninserted == xldata->ntoinsert);

        PageSetLSN(page, lsn);
        MarkBufferDirty(buffer);
    }

    /*
     * Fix follow-right data on left child page
     *
     * This must be done while still holding the lock on the target page. Note
     * that even if the target page no longer exists, we still attempt to
     * replay the change on the child page.
     */
    if (XLogRecHasBlockRef(record, 1))
        gistRedoClearFollowRight(record, 1);

    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}
/*
 * Re-examine the trailing pages of the relation, from the end backwards, to
 * confirm that they still contain no tuples.
 *
 * The result is the number of pages that must be kept, i.e. the block
 * number of the last page found to hold an item, plus one.  If every page
 * beyond vacrelstats->nonempty_pages is still empty, that value is returned
 * unchanged.
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
    BlockNumber remaining = vacrelstats->rel_pages;

    /*
     * 'remaining' is unsigned, so the loop tests before decrementing rather
     * than counting down to a negative value.
     */
    while (remaining > vacrelstats->nonempty_pages)
    {
        Buffer      buf;
        Page        page;
        bool        keepPage = false;

        /*
         * No vacuum delay point here: an exclusive lock on the table is
         * held and should be kept as briefly as possible.  Interrupts are
         * still honoured, though.
         */
        CHECK_FOR_INTERRUPTS();

        remaining--;

        buf = ReadBufferExtended(onerel, MAIN_FORKNUM, remaining,
                                 RBM_NORMAL, vac_strategy);

        /* Shared access to the buffer is sufficient for this check */
        LockBuffer(buf, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buf);

        /* New (probably shouldn't happen) and empty pages hold nothing */
        if (!PageIsNew(page) && !PageIsEmpty(page))
        {
            OffsetNumber lastoff = PageGetMaxOffsetNumber(page);
            OffsetNumber cur = FirstOffsetNumber;

            while (cur <= lastoff)
            {
                /*
                 * Any item that is not unused is a reason to keep the page.
                 * DEAD tuples count as well: their index entries have not
                 * been cleaned out, so they cannot simply be discarded.
                 */
                if (ItemIdIsUsed(PageGetItemId(page, cur)))
                {
                    keepPage = true;
                    break;
                }
                cur = OffsetNumberNext(cur);
            }
        }

        UnlockReleaseBuffer(buf);

        /* The first page (from the end) with an item ends the scan */
        if (keepPage)
            return remaining + 1;
    }

    /*
     * Every page previously believed empty still is; the last known
     * nonempty page itself need not be examined.
     */
    return vacrelstats->nonempty_pages;
}
/*
 * gistlayerinsert -- recursively insert the tuple vector *itup into the
 * subtree rooted at block blkno.
 *
 * On an internal page the function picks a child with gistchoose(), recurses
 * into it, and then fixes up this page's downlink: the old key is deleted
 * and the adjusted key (or the keys of the split halves) is inserted here.
 *
 * itup/len are in-out: on return they hold what the caller (the parent
 * level) has to insert -- either the tuples produced by a split of this
 * page, or a single union tuple pointing at this page.
 *
 * The return value is a bit mask: INSERTED is set when something was written
 * at this level, SPLITED is additionally set when this page had to be split.
 * 0x00 is returned when the child inserted nothing, or when the child did
 * not split and gistgetadjusted() reported that this page's key needs no
 * change.
 */
static int
gistlayerinsert(Relation r, BlockNumber blkno,
				IndexTuple **itup,	/* in - out, has compressed entry */
				int *len,		/* in - out */
				InsertIndexResult *res, /* out */
				GISTSTATE *giststate)
{
	Buffer		buffer;
	Page		page;
	OffsetNumber child;
	int			ret;
	GISTPageOpaque opaque;

	buffer = ReadBuffer(r, blkno);
	page = (Page) BufferGetPage(buffer);
	opaque = (GISTPageOpaque) PageGetSpecialPointer(page);

	if (!(opaque->flags & F_LEAF))
	{
		/* internal page, so we must walk down the tree */
		/* on entry to this branch *len is 1 */
		ItemId		iid;
		BlockNumber nblkno;
		ItemPointerData oldtid;
		IndexTuple	oldtup;

		/* choose the downlink to follow and fetch the child's block number */
		child = gistchoose(r, page, *(*itup), giststate);
		iid = PageGetItemId(page, child);
		oldtup = (IndexTuple) PageGetItem(page, iid);
		nblkno = ItemPointerGetBlockNumber(&(oldtup->t_tid));

		/*
		 * After this call:
		 * 1. if the child page was split, itup contains one key per
		 *    resulting page;
		 * 2. if the child page was not split, itup contains the tuple to
		 *    use for adjusting the current key.
		 */
		ret = gistlayerinsert(r, nblkno, itup, len, res, giststate);

		/* nothing inserted in child */
		if (!(ret & INSERTED))
		{
			ReleaseBuffer(buffer);
			return 0x00;
		}

		/* child was not split */
		if (!(ret & SPLITED))
		{
			IndexTuple	newtup = gistgetadjusted(r, oldtup, (*itup)[0], giststate);

			if (!newtup)
			{
				/* no need to update the key on this page */
				ReleaseBuffer(buffer);
				return 0x00;
			}

			/* replace the child's tuple by the adjusted key for this level */
			pfree((*itup)[0]);	/* !!! */
			(*itup)[0] = newtup;
		}

		/* key is modified, so old version must be deleted */
		ItemPointerSet(&oldtid, blkno, child);
		gistdelete(r, &oldtid);

		/*
		 * if the child was split, the new keys for it will be appended at
		 * the end of this page, so we must tell any scans that the page has
		 * changed starting from the 'child' offset
		 */
		if (ret & SPLITED)
			gistadjscans(r, GISTOP_SPLIT, blkno, child);
	}

	/* from here on, ret describes what happens on *this* page */
	ret = INSERTED;

	if (gistnospace(page, (*itup), *len))
	{
		/* no space for insertion: split this page */
		IndexTuple *itvec,
				   *newitup;
		int			tlen,
					oldlen;

		ret |= SPLITED;

		/* build one vector holding the page's tuples plus the new ones */
		itvec = gistreadbuffer(buffer, &tlen);
		itvec = gistjoinvector(itvec, &tlen, (*itup), *len);
		oldlen = *len;
		newitup = gistSplit(r, buffer, itvec, &tlen, giststate,
							(opaque->flags & F_LEAF) ? res : NULL);		/* res only for
																		 * inserting in leaf */
		ReleaseBuffer(buffer);

		/* free the incoming tuples (last to first) and both vectors */
		do
			pfree((*itup)[oldlen - 1]);
		while ((--oldlen) > 0);
		pfree((*itup));
		pfree(itvec);

		/* hand the split result up to the caller */
		*itup = newitup;
		*len = tlen;			/* now tlen >= 2 */
	}
	else
	{
		/* enough space */
		OffsetNumber off,
					l;

		/* append after the current last item (or at slot 1 if page is empty) */
		off = (PageIsEmpty(page)) ?
			FirstOffsetNumber
			:
			OffsetNumberNext(PageGetMaxOffsetNumber(page));
		l = gistwritebuffer(r, page, (*itup), *len, off);
		WriteBuffer(buffer);

		/*
		 * set res if insert into leaf page, in this case, len = 1 always
		 */
		if (res && (opaque->flags & F_LEAF))
			ItemPointerSet(&((*res)->pointerData), blkno, l);

		if (*len > 1)
		{						/* previous insert ret & SPLITED != 0 */
			int			i;

			/*
			 * child was split, so we must form the union of its keys for
			 * insertion in the parent
			 */
			IndexTuple	newtup = gistunion(r, (*itup), *len, giststate);

			/* the union tuple is the downlink to this page */
			ItemPointerSet(&(newtup->t_tid), blkno, 1);

			/* collapse the vector to that single tuple */
			for (i = 0; i < *len; i++)
				pfree((*itup)[i]);
			(*itup)[0] = newtup;
			*len = 1;
		}
	}

	return ret;
}
/*
 * PageIndexTupleDelete
 *
 * Remove one tuple from an index page.
 *
 * In contrast to heap pages, the line pointer of the removed tuple is
 * squeezed out of the line pointer array as well, so the offset numbers of
 * all later items shift down by one.
 */
void
PageIndexTupleDelete(Page page, OffsetNumber offnum)
{
	PageHeader	hdr = (PageHeader) page;
	ItemId		victim;
	Size		rawlen;
	Size		freed;
	unsigned	tupoff;
	char	   *linp_tail;
	char	   *data_start;
	int			tailbytes;
	int			nitems;
	int			slot;

	/*
	 * Sanity-check the page header before touching anything, as
	 * PageRepairFragmentation does.
	 */
	if (hdr->pd_lower < SizeOfPageHeaderData ||
		hdr->pd_lower > hdr->pd_upper ||
		hdr->pd_upper > hdr->pd_special ||
		hdr->pd_special > BLCKSZ ||
		hdr->pd_special != MAXALIGN(hdr->pd_special))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
						hdr->pd_lower, hdr->pd_upper, hdr->pd_special)));

	nitems = PageGetMaxOffsetNumber(page);
	if ((int) offnum <= 0 || (int) offnum > nitems)
		elog(ERROR, "invalid index offnum: %u", offnum);

	/* Look up the line pointer being removed and what it points at */
	victim = PageGetItemId(page, offnum);
	Assert(ItemIdHasStorage(victim));
	rawlen = ItemIdGetLength(victim);
	tupoff = ItemIdGetOffset(victim);

	if (tupoff < hdr->pd_upper ||
		(tupoff + rawlen) > hdr->pd_special ||
		tupoff != MAXALIGN(tupoff))
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("corrupted item pointer: offset = %u, size = %u",
						tupoff, (unsigned int) rawlen)));

	/* Space actually occupied by the tuple, including alignment padding */
	freed = MAXALIGN(rawlen);

	/*
	 * Step 1: close the gap in the line pointer array.  Everything after
	 * the victim's slot (array index offnum - 1) slides back by one entry.
	 * This works on the raw array, not on individual line pointers.
	 */
	linp_tail = (char *) &hdr->pd_linp[offnum];
	tailbytes = hdr->pd_lower - (linp_tail - (char *) hdr);

	if (tailbytes > 0)
		memmove((char *) &(hdr->pd_linp[offnum - 1]), linp_tail, tailbytes);

	/*
	 * Step 2: slide the tuple data lying between the start of tuple space
	 * and the deleted tuple forward over it, so the free space stays in the
	 * middle of the page.  Nothing to move if the victim was the first
	 * tuple in tuple space.
	 */
	data_start = (char *) page + hdr->pd_upper;

	if (tupoff > hdr->pd_upper)
		memmove(data_start + freed, data_start, tupoff - hdr->pd_upper);

	/* Step 3: publish the new free-space boundaries */
	hdr->pd_upper += freed;
	hdr->pd_lower -= sizeof(ItemIdData);

	/* An empty page has no line pointers left to fix */
	if (PageIsEmpty(page))
		return;

	/*
	 * Step 4: every surviving tuple whose data sat at or before the deleted
	 * tuple was moved forward by "freed" bytes; patch its line pointer.
	 * There is now one item fewer than when we started.
	 */
	for (slot = 1; slot <= nitems - 1; slot++)
	{
		ItemId		lp = PageGetItemId(page, slot);

		Assert(ItemIdHasStorage(lp));
		if (ItemIdGetOffset(lp) <= tupoff)
			lp->lp_off += freed;
	}
}
/*
 * Replay a GiST page-update WAL record (anything except a page split).
 *
 * The record payload is a gistxlogPageUpdate header, followed by
 * "ntodelete" offset numbers to remove, followed by the index tuples to
 * append.
 */
static void
gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record)
{
	char	   *payload = XLogRecGetData(record);
	gistxlogPageUpdate *hdr = (gistxlogPageUpdate *) payload;
	char	   *cursor;
	Buffer		buf;
	Page		pg;

	/* The left child's follow-right state is handled first, if there is one */
	if (BlockNumberIsValid(hdr->leftchild))
		gistRedoClearFollowRight(hdr->node, lsn, hdr->leftchild);

	/* A backed-up page needs nothing more (and gives us no info to redo) */
	if (record->xl_info & XLR_BKP_BLOCK_1)
		return;

	buf = XLogReadBuffer(hdr->node, hdr->blkno, false);
	if (!BufferIsValid(buf))
		return;

	pg = (Page) BufferGetPage(buf);
	if (XLByteLE(lsn, PageGetLSN(pg)))
	{
		/* page LSN is already at or past this record */
		UnlockReleaseBuffer(buf);
		return;
	}

	/* Variable-length part starts right behind the fixed header */
	cursor = payload + sizeof(gistxlogPageUpdate);

	/* Phase 1: remove the listed tuples */
	if (hdr->ntodelete > 0)
	{
		OffsetNumber *victims = (OffsetNumber *) cursor;
		int			k = 0;

		while (k < hdr->ntodelete)
		{
			PageIndexTupleDelete(pg, victims[k]);
			k++;
		}

		if (GistPageIsLeaf(pg))
			GistMarkTuplesDeleted(pg);

		/* step over the offset array to reach the tuple data */
		cursor += sizeof(OffsetNumber) * hdr->ntodelete;
	}

	/* Phase 2: append whatever tuples remain in the record */
	if (cursor - payload < record->xl_len)
	{
		OffsetNumber slot;

		if (PageIsEmpty(pg))
			slot = FirstOffsetNumber;
		else
			slot = OffsetNumberNext(PageGetMaxOffsetNumber(pg));

		do
		{
			IndexTuple	tuple = (IndexTuple) cursor;
			Size		tuplen = IndexTupleSize(tuple);

			cursor += tuplen;

			if (PageAddItem(pg, (Item) tuple, tuplen, slot, false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add item to GiST index page, size %d bytes",
					 (int) tuplen);
			slot++;
		} while (cursor - payload < record->xl_len);
	}
	else if (GistPageIsLeaf(pg) && hdr->ntodelete == 0)
	{
		/*
		 * Special case: leaf page with nothing to insert and nothing to
		 * delete -- this is how vacuum marks the page.
		 */
		GistClearTuplesDeleted(pg);
	}

	/*
	 * If vacuum full removed every link from a non-leaf root page, the root
	 * turns into a leaf.
	 */
	if (hdr->blkno == GIST_ROOT_BLKNO &&
		!GistPageIsLeaf(pg) &&
		PageGetMaxOffsetNumber(pg) == InvalidOffsetNumber)
		GistPageSetLeaf(pg);

	GistPageGetOpaque(pg)->rightlink = InvalidBlockNumber;
	PageSetLSN(pg, lsn);
	PageSetTLI(pg, ThisTimeLineID);
	MarkBufferDirty(buf);
	UnlockReleaseBuffer(buf);
}
/*
 * Return a buffer whose page matches the type/parity encoded in "flags" and
 * has at least "needSpace" bytes free.  The lastUsedPages cache is consulted
 * so that repeated requests tend to land on the same page.
 *
 * The buffer comes back pinned and exclusive-locked.
 *
 * *isNew reports whether the page was initialized here (true) or was
 * already a valid page of the right kind (false).
 */
Buffer
SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew)
{
	SpGistCache *cache = spgGetCache(index);
	SpGistLastUsedPage *slot;
	Buffer		buf;
	Page		pg;

	/* A request that cannot fit even on an empty page is an error */
	if (needSpace > SPGIST_PAGE_CAPACITY)
		elog(ERROR, "desired SPGiST tuple size is too big");

	/*
	 * Pad the request by the fillfactor reserve where possible, so pages
	 * receiving unrelated tuples keep 100-fillfactor% for related ones.
	 * The padding is clamped so that it can never turn a legal request into
	 * a failing one.
	 */
	needSpace += RelationGetTargetPageFreeSpace(index,
												SPGIST_DEFAULT_FILLFACTOR);
	needSpace = Min(needSpace, SPGIST_PAGE_CAPACITY);

	/* Cache slot for this combination of flags */
	slot = GET_LUP(cache, flags);

	/* Nothing cached: hand the job to allocNewBuffer */
	if (slot->blkno == InvalidBlockNumber)
	{
		*isNew = true;
		return allocNewBuffer(index, flags);
	}

	/* fixed pages should never be in cache */
	Assert(!SpGistBlockIsFixed(slot->blkno));

	/* Cached free space already too small: skip reading the page */
	if (slot->freeSpace < needSpace)
	{
		*isNew = true;
		return allocNewBuffer(index, flags);
	}

	buf = ReadBuffer(index, slot->blkno);

	if (!ConditionalLockBuffer(buf))
	{
		/* somebody else holds the lock; don't wait, take a new buffer */
		ReleaseBuffer(buf);
		*isNew = true;
		return allocNewBuffer(index, flags);
	}

	pg = BufferGetPage(buf);

	if (PageIsNew(pg) || SpGistPageIsDeleted(pg) || PageIsEmpty(pg))
	{
		/* Page holds nothing of value, so it may be (re)initialized */
		uint16		pageflags;

		pageflags = (GBUF_REQ_LEAF(flags) ? SPGIST_LEAF : 0) |
			(GBUF_REQ_NULLS(flags) ? SPGIST_NULLS : 0);

		SpGistInitBuffer(buf, pageflags);
		slot->freeSpace = PageGetExactFreeSpace(pg) - needSpace;
		*isNew = true;
		return buf;
	}

	/*
	 * The cache may be stale, so verify that the page really has the
	 * requested leaf-ness and nulls-ness and still has room.
	 */
	if ((GBUF_REQ_LEAF(flags) != 0) == (SpGistPageIsLeaf(pg) != 0) &&
		(GBUF_REQ_NULLS(flags) != 0) == (SpGistPageStoresNulls(pg) != 0))
	{
		int			avail = PageGetExactFreeSpace(pg);

		if (avail >= needSpace)
		{
			/* Usable: record what will be left and return it */
			slot->freeSpace = avail - needSpace;
			*isNew = false;
			return buf;
		}
	}

	/* Cached page did not work out; fall back to a new buffer */
	UnlockReleaseBuffer(buf);

	*isNew = true;
	return allocNewBuffer(index, flags);
}
/*
 *	_hash_squeezebucket(rel, bucket)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages. When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last page in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this guy clean it up.
 *
 *	Caller must acquire cleanup lock on the primary page of the target
 *	bucket to exclude any scans that are in progress, which could easily
 *	be confused into returning the same tuple more than once or some tuples
 *	not at all by the rearrangement we are performing here.  To prevent
 *	any concurrent scan from crossing the squeeze scan we use lock chaining
 *	similar to hashbucketcleanup.  Refer to the comments atop
 *	hashbucketcleanup.
 *
 *	We need to retain a pin on the primary bucket to ensure that no concurrent
 *	split can start.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 *
 *	On every return path the lock on the primary bucket page is released
 *	with LockBuffer(..., BUFFER_LOCK_UNLOCK) rather than _hash_relbuf, which
 *	matches the stated intent of leaving the pin with the caller; overflow
 *	page buffers are released with _hash_relbuf.
 */
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno,
					Buffer bucket_buf,
					BufferAccessStrategy bstrategy)
{
	BlockNumber wblkno;			/* block number of current "write" page */
	BlockNumber rblkno;			/* block number of current "read" page */
	Buffer		wbuf;
	Buffer		rbuf;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;

	/*
	 * start squeezing into the primary bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = bucket_buf;
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze. caller
	 * is responsible for releasing the pin on primary bucket page.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
		return;
	}

	/*
	 * Find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.  Note: we assume that a hash bucket chain is
	 * usually smaller than the buffer ring being used by VACUUM, else using
	 * the access strategy here would be counterproductive.
	 */
	rbuf = InvalidBuffer;
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		/* drop the previous overflow page before fetching the next one */
		if (rbuf != InvalidBuffer)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 */
	for (;;)
	{
		OffsetNumber roffnum;
		OffsetNumber maxroffnum;
		OffsetNumber deletable[MaxOffsetNumber];	/* read-page offsets to
													 * delete */
		IndexTuple	itups[MaxIndexTuplesPerPage];	/* copies of tuples to
													 * move */
		Size		tups_size[MaxIndexTuplesPerPage];	/* MAXALIGN'd size of
														 * each itups[] entry */
		OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
		uint16		ndeletable = 0;
		uint16		nitups = 0;
		Size		all_tups_size = 0;	/* sum of tups_size[0..nitups-1] */
		int			i;
		bool		retain_pin = false;

		/*
		 * The label sits after the declarations above, so jumping back here
		 * does not re-run their initializers; the code that jumps resets the
		 * counters by hand first.
		 */
readpage:
		/* Scan each tuple in "read" page */
		maxroffnum = PageGetMaxOffsetNumber(rpage);
		for (roffnum = FirstOffsetNumber;
			 roffnum <= maxroffnum;
			 roffnum = OffsetNumberNext(roffnum))
		{
			IndexTuple	itup;
			Size		itemsz;

			/* skip dead tuples */
			if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
				continue;

			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			itemsz = IndexTupleDSize(*itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item and all other accumulated items.  Exit if we reach
			 * the read page.
			 */
			while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
			{
				Buffer		next_wbuf = InvalidBuffer;
				bool		tups_moved = false;

				Assert(!PageIsEmpty(wpage));

				/* leaving the primary bucket page: keep its pin, drop lock */
				if (wblkno == bucket_blkno)
					retain_pin = true;

				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				/* don't need to move to next page if we reached the read page */
				if (wblkno != rblkno)
					next_wbuf = _hash_getbuf_with_strategy(rel,
														   wblkno,
														   HASH_WRITE,
														   LH_OVERFLOW_PAGE,
														   bstrategy);

				/* flush the tuples accumulated so far onto the write page */
				if (nitups > 0)
				{
					Assert(nitups == ndeletable);

					/*
					 * This operation needs to log multiple tuples, prepare
					 * WAL for that.
					 */
					if (RelationNeedsWAL(rel))
						XLogEnsureRecordSpace(0, 3 + nitups);

					START_CRIT_SECTION();

					/*
					 * we have to insert tuples on the "write" page, being
					 * careful to preserve hashkey ordering.  (If we insert
					 * many tuples into the same "write" page it would be
					 * worth qsort'ing them).
					 */
					_hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
					MarkBufferDirty(wbuf);

					/* Delete tuples we already moved off read page */
					PageIndexMultiDelete(rpage, deletable, ndeletable);
					MarkBufferDirty(rbuf);

					/* XLOG stuff */
					if (RelationNeedsWAL(rel))
					{
						XLogRecPtr	recptr;
						xl_hash_move_page_contents xlrec;

						xlrec.ntups = nitups;
						xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false;

						XLogBeginInsert();
						XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);

						/*
						 * bucket buffer needs to be registered to ensure that
						 * we can acquire a cleanup lock on it during replay.
						 */
						if (!xlrec.is_prim_bucket_same_wrt)
							XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

						/* block 1: write page, with target offsets and tuples */
						XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
						XLogRegisterBufData(1, (char *) itup_offsets,
											nitups * sizeof(OffsetNumber));
						for (i = 0; i < nitups; i++)
							XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);

						/* block 2: read page, with the offsets deleted */
						XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
						XLogRegisterBufData(2, (char *) deletable,
											ndeletable * sizeof(OffsetNumber));

						recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);

						PageSetLSN(BufferGetPage(wbuf), recptr);
						PageSetLSN(BufferGetPage(rbuf), recptr);
					}

					END_CRIT_SECTION();

					tups_moved = true;
				}

				/*
				 * release the lock on previous page after acquiring the lock
				 * on next page
				 */
				if (retain_pin)
					LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
				else
					_hash_relbuf(rel, wbuf);

				/* nothing more to do if we reached the read page */
				if (rblkno == wblkno)
				{
					_hash_relbuf(rel, rbuf);
					return;
				}

				/* advance the write side to the page fetched above */
				wbuf = next_wbuf;
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
				retain_pin = false;

				/* be tidy */
				for (i = 0; i < nitups; i++)
					pfree(itups[i]);
				nitups = 0;
				all_tups_size = 0;
				ndeletable = 0;

				/*
				 * after moving the tuples, rpage would have been compacted,
				 * so we need to rescan it.
				 */
				if (tups_moved)
					goto readpage;
			}

			/* remember tuple for deletion from "read" page */
			deletable[ndeletable++] = roffnum;

			/*
			 * we need a copy of index tuples as they can be freed as part of
			 * overflow page, however we need them to write a WAL record in
			 * _hash_freeovflpage.
			 */
			itups[nitups] = CopyIndexTuple(itup);
			tups_size[nitups++] = itemsz;
			all_tups_size += itemsz;
		}

		/*
		 * If we reach here, there are no live tuples on the "read" page ---
		 * it was empty when we got to it, or we moved them all.  So we can
		 * just free the page without bothering with deleting tuples
		 * individually.  Then advance to the previous "read" page.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  In that case, we don't need to lock it again.
		 */
		rblkno = ropaque->hasho_prevblkno;
		Assert(BlockNumberIsValid(rblkno));

		/* free this overflow page (releases rbuf) */
		_hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
						   tups_size, nitups, bstrategy);

		/* be tidy */
		for (i = 0; i < nitups; i++)
			pfree(itups[i]);

		/* are we freeing the page adjacent to wbuf? */
		if (rblkno == wblkno)
		{
			/* retain the pin on primary bucket page till end of bucket scan */
			if (wblkno == bucket_blkno)
				LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
			else
				_hash_relbuf(rel, wbuf);
			return;
		}

		/* step the read side back to the previous page in the chain */
		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	}

	/* NOTREACHED */
}
/*
 * Walk backwards over the tail of the relation, re-checking that the pages
 * previously believed empty still hold no line pointers in use.
 *
 * The result is the number of pages that must be kept, i.e. the block
 * number of the last page holding a used item, plus one.  If another backend
 * is found waiting for a conflicting lock, the scan is abandoned:
 * vacrelstats->lock_waiter_detected is set and the number of pages not yet
 * verified-empty is returned.
 */
static BlockNumber
count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
{
	BlockNumber npages;
	instr_time	last_check;

	/* Reference point for the periodic lock-waiter check */
	INSTR_TIME_SET_CURRENT(last_check);

	/*
	 * "npages" is a page count rather than a block number: the page examined
	 * in each iteration is block npages - 1.  Counting this way avoids
	 * underflow trouble with the unsigned BlockNumber type.
	 */
	for (npages = vacrelstats->rel_pages;
		 npages > vacrelstats->nonempty_pages;
		 npages--)
	{
		BlockNumber target = npages - 1;
		Buffer		vbuf;
		Page		vpage;
		bool		in_use = false;

		/*
		 * See whether someone is queued behind our AccessExclusiveLock.  To
		 * keep system calls and shared lock table lookups rare, the clock is
		 * read only once every 32 blocks, and the lock table is consulted
		 * only once the check interval has elapsed.
		 */
		if ((npages % 32) == 0)
		{
			instr_time	now;
			instr_time	delta;

			INSTR_TIME_SET_CURRENT(now);
			delta = now;
			INSTR_TIME_SUBTRACT(delta, last_check);

			if ((INSTR_TIME_GET_MICROSEC(delta) / 1000)
				>= AUTOVACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
			{
				if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
				{
					ereport(elevel,
							(errmsg("\"%s\": suspending truncate "
									"due to conflicting lock request",
									RelationGetRelationName(onerel))));

					vacrelstats->lock_waiter_detected = true;
					return npages;
				}
				last_check = now;
			}
		}

		/*
		 * No vacuum delay point here: the exclusive table lock should be
		 * held as briefly as possible.  Interrupts must still be serviced.
		 */
		CHECK_FOR_INTERRUPTS();

		vbuf = ReadBufferExtended(onerel, MAIN_FORKNUM, target,
								  RBM_NORMAL, vac_strategy);

		/* Shared access is sufficient for a read-only inspection */
		LockBuffer(vbuf, BUFFER_LOCK_SHARE);
		vpage = BufferGetPage(vbuf);

		/* New (probably shouldn't happen) and empty pages have no items */
		if (!PageIsNew(vpage) && !PageIsEmpty(vpage))
		{
			OffsetNumber lastoff = PageGetMaxOffsetNumber(vpage);
			OffsetNumber cur = FirstOffsetNumber;

			/*
			 * Any non-unused item is a reason to keep the page.  DEAD items
			 * count too, since their index entries have not been cleaned
			 * out.  Stop at the first hit.
			 */
			while (cur <= lastoff && !in_use)
			{
				if (ItemIdIsUsed(PageGetItemId(vpage, cur)))
					in_use = true;
				else
					cur = OffsetNumberNext(cur);
			}
		}

		UnlockReleaseBuffer(vbuf);

		/* First page with a used item ends the scan */
		if (in_use)
			return npages;
	}

	/*
	 * Every page past the last known-nonempty one is still empty; there is
	 * no need to look at the last known-nonempty page itself.
	 */
	return vacrelstats->nonempty_pages;
}
/*
 * Fetch tuples that match the search key; this can be invoked either to
 * fetch the first such tuples or subsequent matching tuples.
 *
 * Up to maxtids heap TIDs are stored into tids[].  The return value is the
 * number of TIDs stored; 0 means no (more) matching tuples were found.
 * When ignore_killed_tuples is true, leaf items whose line pointer is marked
 * dead are skipped.
 *
 * Matches found on a leaf page are first collected in so->pageData[] and
 * then copied out; entries that did not fit into tids[] stay there and are
 * handed out at the start of the next call.
 *
 * The MirroredLock taken with MIRROREDLOCK_BUFMGR_LOCK is released on every
 * return path.
 */
static int
gistnext(IndexScanDesc scan, ScanDirection dir, ItemPointer tids, int maxtids, bool ignore_killed_tuples)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	Page		p;
	OffsetNumber n;
	GISTScanOpaque so;
	GISTSearchStack *stk;
	IndexTuple	it;
	GISTPageOpaque opaque;
	int			ntids = 0;

	so = (GISTScanOpaque) scan->opaque;

	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;

	if ( so->qual_ok == false )
	{
		/*
		 * The scan opaque is flagged as not OK to search: report no
		 * matches.  The lock taken just above must be dropped here, like on
		 * every other exit; returning with it held would leak it.
		 */
		MIRROREDLOCK_BUFMGR_UNLOCK;
		// -------- MirroredLock ----------

		return 0;
	}

	if (ItemPointerIsValid(&so->curpos) == false)
	{
		/* Being asked to fetch the first entry, so start at the root */
		Assert(so->curbuf == InvalidBuffer);
		Assert(so->stack == NULL);

		so->curbuf = ReadBuffer(scan->indexRelation, GIST_ROOT_BLKNO);

		/* palloc0 leaves lsn and parentlsn of the root entry zeroed */
		stk = so->stack = (GISTSearchStack *) palloc0(sizeof(GISTSearchStack));

		stk->next = NULL;
		stk->block = GIST_ROOT_BLKNO;

		pgstat_count_index_scan(scan->indexRelation);
	}
	else if (so->curbuf == InvalidBuffer)
	{
		/* scan was already run to completion */
		MIRROREDLOCK_BUFMGR_UNLOCK;
		// -------- MirroredLock ----------

		return 0;
	}

	/*
	 * check stored pointers from last visit
	 */
	if ( so->nPageData > 0 )
	{
		while( ntids < maxtids && so->curPageData < so->nPageData )
		{
			tids[ ntids ] = scan->xs_ctup.t_self = so->pageData[ so->curPageData ].heapPtr;
			ItemPointerSet(&(so->curpos),
						   BufferGetBlockNumber(so->curbuf),
						   so->pageData[ so->curPageData ].pageOffset);

			so->curPageData ++;
			ntids++;
		}

		if ( ntids == maxtids )
		{
			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return ntids;
		}

		/*
		 * Go to the next page
		 */
		stk = so->stack->next;
		pfree(so->stack);
		so->stack = stk;

		/* If we're out of stack entries, we're done */
		if (so->stack == NULL)
		{
			ReleaseBuffer(so->curbuf);
			so->curbuf = InvalidBuffer;

			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return ntids;
		}

		so->curbuf = ReleaseAndReadBuffer(so->curbuf,
										  scan->indexRelation,
										  stk->block);
	}

	for (;;)
	{
		/* First of all, we need to lock the buffer */
		Assert(so->curbuf != InvalidBuffer);
		LockBuffer(so->curbuf, GIST_SHARE);
		gistcheckpage(scan->indexRelation, so->curbuf);
		p = BufferGetPage(so->curbuf);
		opaque = GistPageGetOpaque(p);

		/* remember lsn to identify page changed for tuple's killing */
		so->stack->lsn = PageGetLSN(p);

		/* check for a page split since the last visit or visit to parent */
		if (!XLogRecPtrIsInvalid(so->stack->parentlsn) &&
			XLByteLT(so->stack->parentlsn, opaque->nsn) &&
			opaque->rightlink != InvalidBlockNumber /* sanity check */ &&
			(so->stack->next == NULL || so->stack->next->block != opaque->rightlink)		/* check if already
					added */ )
		{
			/* detect page split, follow right link to add pages */

			stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));
			stk->next = so->stack->next;
			stk->block = opaque->rightlink;
			stk->parentlsn = so->stack->parentlsn;
			memset(&(stk->lsn), 0, sizeof(GistNSN));
			so->stack->next = stk;
		}

		/* if page is empty, then just skip it */
		if (PageIsEmpty(p))
		{
			LockBuffer(so->curbuf, GIST_UNLOCK);
			stk = so->stack->next;
			pfree(so->stack);
			so->stack = stk;

			if (so->stack == NULL)
			{
				ReleaseBuffer(so->curbuf);
				so->curbuf = InvalidBuffer;

				MIRROREDLOCK_BUFMGR_UNLOCK;
				// -------- MirroredLock ----------

				return ntids;
			}

			so->curbuf = ReleaseAndReadBuffer(so->curbuf, scan->indexRelation,
											  stk->block);
			continue;
		}

		if (ScanDirectionIsBackward(dir))
			n = PageGetMaxOffsetNumber(p);
		else
			n = FirstOffsetNumber;

		/* wonderful, we can look at page */
		so->nPageData = so->curPageData = 0;

		for (;;)
		{
			n = gistfindnext(scan, n, dir);

			if (!OffsetNumberIsValid(n))
			{
				/* page exhausted: hand out what was collected from it */
				while( ntids < maxtids && so->curPageData < so->nPageData )
				{
					tids[ ntids ] = scan->xs_ctup.t_self = so->pageData[ so->curPageData ].heapPtr;
					ItemPointerSet(&(so->curpos),
								   BufferGetBlockNumber(so->curbuf),
								   so->pageData[ so->curPageData ].pageOffset);

					so->curPageData ++;
					ntids++;
				}

				if ( ntids == maxtids )
				{
					LockBuffer(so->curbuf, GIST_UNLOCK);

					MIRROREDLOCK_BUFMGR_UNLOCK;
					// -------- MirroredLock ----------

					return ntids;
				}

				/*
				 * We ran out of matching index entries on the current page,
				 * so pop the top stack entry and use it to continue the
				 * search.
				 */
				LockBuffer(so->curbuf, GIST_UNLOCK);
				stk = so->stack->next;
				pfree(so->stack);
				so->stack = stk;

				/* If we're out of stack entries, we're done */

				if (so->stack == NULL)
				{
					ReleaseBuffer(so->curbuf);
					so->curbuf = InvalidBuffer;

					MIRROREDLOCK_BUFMGR_UNLOCK;
					// -------- MirroredLock ----------

					return ntids;
				}

				so->curbuf = ReleaseAndReadBuffer(so->curbuf,
												  scan->indexRelation,
												  stk->block);
				/* XXX	go up */
				break;
			}

			if (GistPageIsLeaf(p))
			{
				/*
				 * We've found a matching index entry in a leaf page, so
				 * remember it.  Note that we keep "curbuf" pinned so that
				 * we can efficiently resume the index scan later.
				 */

				if (!(ignore_killed_tuples && ItemIdIsDead(PageGetItemId(p, n))))
				{
					it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
					so->pageData[ so->nPageData ].heapPtr = it->t_tid;
					so->pageData[ so->nPageData ].pageOffset = n;
					so->nPageData ++;
				}
			}
			else
			{
				/*
				 * We've found an entry in an internal node whose key is
				 * consistent with the search key, so push it to stack
				 */

				stk = (GISTSearchStack *) palloc(sizeof(GISTSearchStack));

				it = (IndexTuple) PageGetItem(p, PageGetItemId(p, n));
				stk->block = ItemPointerGetBlockNumber(&(it->t_tid));
				memset(&(stk->lsn), 0, sizeof(GistNSN));
				stk->parentlsn = so->stack->lsn;

				stk->next = so->stack->next;
				so->stack->next = stk;
			}

			if (ScanDirectionIsBackward(dir))
				n = OffsetNumberPrev(n);
			else
				n = OffsetNumberNext(n);
		}
	}

	/* not reached: the loop above only exits via return */
	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------

	return ntids;
}
/*
 * rtdoinsert() -- walk from the root down to a leaf and put "itup" there.
 *
 * On every internal page choose() selects the downlink to follow, and the
 * visited block together with the chosen child offset is pushed onto a
 * stack.  When the leaf has no room the insertion is delegated to
 * rtdosplit(); otherwise the tuple is appended to the leaf and rttighten()
 * is invoked with the stack to expand the page boundary in the parent so
 * that it includes the new child.
 */
static void
rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate)
{
    Buffer          curbuf = InvalidBuffer;
    BlockNumber     curblk = P_ROOT;
    RTSTACK        *path = NULL;
    Page            curpage;
    OffsetNumber    wanted;
    OffsetNumber    placed;
    Datum           key;

    for (;;)
    {
        RTreePageOpaque special;
        RTSTACK        *frame;
        ItemId          linp;
        IndexTuple      downlink;

        /* release the current buffer, read in the next one */
        curbuf = ReleaseAndReadBuffer(curbuf, r, curblk);
        curpage = (Page) BufferGetPage(curbuf);
        special = (RTreePageOpaque) PageGetSpecialPointer(curpage);

        /* stop descending once we are on a leaf */
        if (special->flags & F_LEAF)
            break;

        /* remember this internal page and the child we pick on it */
        frame = (RTSTACK *) palloc(sizeof(RTSTACK));
        frame->rts_parent = path;
        frame->rts_blk = curblk;
        frame->rts_child = choose(r, curpage, itup, rtstate);
        path = frame;

        /* the chosen downlink names the next block to visit */
        linp = PageGetItemId(curpage, frame->rts_child);
        downlink = (IndexTuple) PageGetItem(curpage, linp);
        curblk = ItemPointerGetBlockNumber(&(downlink->t_tid));
    }

    if (nospace(curpage, itup))
    {
        /* leaf is full: split it, then write out the buffer */
        rtdosplit(r, curbuf, path, itup, rtstate);
        freestack(path);
        WriteBuffer(curbuf);    /* don't forget to release buffer! */
        return;
    }

    /* append after the current last item, or at the first slot if empty */
    if (PageIsEmpty(curpage))
        wanted = FirstOffsetNumber;
    else
        wanted = OffsetNumberNext(PageGetMaxOffsetNumber(curpage));

    placed = PageAddItem(curpage, (Item) itup, IndexTupleSize(itup),
                         wanted, LP_USED);
    if (placed == InvalidOffsetNumber)
        elog(ERROR, "failed to add index item to \"%s\"",
             RelationGetRelationName(r));

    WriteBuffer(curbuf);

    key = IndexTupleGetDatum(itup);

    /* now expand the page boundary in the parent to include the new child */
    rttighten(r, path, key, IndexTupleAttSize(itup), rtstate);
    freestack(path);
}
/*
 * Replay a GiST page-update WAL record, i.e. any page change that is not a
 * page split.
 *
 * The record payload is a gistxlogPageUpdate header, followed by
 * "ntodelete" offset numbers to remove, followed by zero or more index
 * tuples to append, up to record->xl_len bytes.
 */
static void
gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record)
{
    char       *recstart = XLogRecGetData(record);
    gistxlogPageUpdate *upd = (gistxlogPageUpdate *) recstart;
    char       *cursor;
    Buffer      target;
    Page        pg;

    /*
     * The target page must be locked, and stay locked, while the left child
     * is being fixed up.  When the record carries a full-page image of the
     * target, restoring that image gives us the lock as a side-effect;
     * otherwise read the page explicitly.  The child page is replayed even
     * when the target page turns out not to exist any more.
     */
    if (record->xl_info & XLR_BKP_BLOCK(0))
        target = RestoreBackupBlock(lsn, record, 0, false, true);
    else
        target = XLogReadBuffer(upd->node, upd->blkno, false);

    /* Fix follow-right data on left child page */
    if (BlockNumberIsValid(upd->leftchild))
        gistRedoClearFollowRight(lsn, record, 1, upd->node, upd->leftchild);

    /* Target page is gone: nothing left to replay */
    if (!BufferIsValid(target))
        return;

    pg = (Page) BufferGetPage(target);

    /*
     * Stop here if the page was restored from a backup block (there is no
     * further information to apply in that case), or if the page already
     * reflects this record.
     */
    if ((record->xl_info & XLR_BKP_BLOCK(0)) || lsn <= PageGetLSN(pg))
    {
        UnlockReleaseBuffer(target);
        return;
    }

    cursor = recstart + sizeof(gistxlogPageUpdate);

    /* First the deletions: an array of offsets follows the header */
    if (upd->ntodelete > 0)
    {
        OffsetNumber *victims = (OffsetNumber *) cursor;
        int         k;

        cursor += sizeof(OffsetNumber) * upd->ntodelete;

        for (k = 0; k < upd->ntodelete; k++)
            PageIndexTupleDelete(pg, victims[k]);

        if (GistPageIsLeaf(pg))
            GistMarkTuplesDeleted(pg);
    }

    if (cursor - recstart >= record->xl_len)
    {
        /*
         * special case: leaf page with nothing to insert and nothing to
         * delete -- vacuum is marking the page
         */
        if (GistPageIsLeaf(pg) && upd->ntodelete == 0)
            GistClearTuplesDeleted(pg);
    }
    else
    {
        /* The rest of the record is index tuples to append to the page */
        OffsetNumber slot;

        if (PageIsEmpty(pg))
            slot = FirstOffsetNumber;
        else
            slot = OffsetNumberNext(PageGetMaxOffsetNumber(pg));

        for (; cursor - recstart < record->xl_len; slot++)
        {
            IndexTuple  newtup = (IndexTuple) cursor;
            Size        tupsize = IndexTupleSize(newtup);
            OffsetNumber placed;

            cursor += tupsize;

            placed = PageAddItem(pg, (Item) newtup, tupsize, slot, false, false);
            if (placed == InvalidOffsetNumber)
                elog(ERROR, "failed to add item to GiST index page, size %d bytes",
                     (int) tupsize);
        }
    }

    /*
     * If every link on a non-leaf root page was deleted (by vacuum full),
     * the root page becomes a leaf.
     */
    if (!GistPageIsLeaf(pg) &&
        PageGetMaxOffsetNumber(pg) == InvalidOffsetNumber &&
        upd->blkno == GIST_ROOT_BLKNO)
        GistPageSetLeaf(pg);

    GistPageGetOpaque(pg)->rightlink = InvalidBlockNumber;
    PageSetLSN(pg, lsn);
    MarkBufferDirty(target);
    UnlockReleaseBuffer(target);
}
/*
 * PageIndexTupleDeleteNoCompact
 *
 * Delete one tuple from an index page without renumbering the remaining
 * items: the tuple's line pointer is marked "unused" rather than squeezed
 * out of the line pointer array.  The only exception is the last line
 * pointer on the page, which is simply chopped off.
 *
 * Intended for index AMs that need the TIDs of surviving tuples to stay
 * put and can live with unused line pointers.
 *
 * Raises ERROR if the page header or the item pointer looks corrupted, or
 * if offnum is outside 1 .. PageGetMaxOffsetNumber(page).
 */
void
PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum)
{
    PageHeader  phdr = (PageHeader) page;
    ItemId      victim;
    Size        nbytes;
    unsigned    tupstart;
    int         nlinp;
    int         slot;
    char       *datastart;

    /*
     * As with PageRepairFragmentation, verify the header before trusting it.
     */
    if (phdr->pd_lower < SizeOfPageHeaderData ||
        phdr->pd_lower > phdr->pd_upper ||
        phdr->pd_upper > phdr->pd_special ||
        phdr->pd_special > BLCKSZ ||
        phdr->pd_special != MAXALIGN(phdr->pd_special))
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
                        phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));

    nlinp = PageGetMaxOffsetNumber(page);
    if ((int) offnum <= 0 || (int) offnum > nlinp)
        elog(ERROR, "invalid index offnum: %u", offnum);

    /* Capture the tuple's extent before its line pointer is touched */
    victim = PageGetItemId(page, offnum);
    Assert(ItemIdHasStorage(victim));
    nbytes = ItemIdGetLength(victim);
    tupstart = ItemIdGetOffset(victim);

    if (tupstart < phdr->pd_upper ||
        (tupstart + nbytes) > phdr->pd_special ||
        tupstart != MAXALIGN(tupstart))
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("corrupted item pointer: offset = %u, size = %u",
                        tupstart, (unsigned int) nbytes)));

    /* The space actually released is the aligned length */
    nbytes = MAXALIGN(nbytes);

    /*
     * The last line pointer can be dropped outright; any other one is only
     * flagged unused.  (Earlier trailing pointers that are already unused
     * are not compacted away here.)
     */
    if ((int) offnum >= nlinp)
    {
        phdr->pd_lower -= sizeof(ItemIdData);
        nlinp--;                /* one fewer line pointer than on entry */
    }
    else
        ItemIdSetUnused(victim);

    /*
     * Slide the tuple data lying between the start of tuple space and the
     * deleted tuple forward over the hole, so the free space stays in the
     * middle of the page.  Nothing to move if the deleted tuple was the
     * first thing in tuple space.
     */
    datastart = (char *) page + phdr->pd_upper;
    if (tupstart > phdr->pd_upper)
        memmove(datastart + nbytes, datastart, tupstart - phdr->pd_upper);

    /* tuple space now starts later */
    phdr->pd_upper += nbytes;

    /* No line pointers left means nothing to fix up */
    if (PageIsEmpty(page))
        return;

    /*
     * Every remaining item whose data sat at or before the deleted tuple
     * was just moved forward by nbytes; patch its stored offset.
     */
    for (slot = 1; slot <= nlinp; slot++)
    {
        ItemId      lp = PageGetItemId(phdr, slot);

        if (!ItemIdHasStorage(lp))
            continue;
        if (ItemIdGetOffset(lp) <= tupstart)
            lp->lp_off += nbytes;
    }
}
/*
 *  lazy_scan_heap() -- scan an open heap relation
 *
 *      This routine sets commit status bits, builds lists of dead tuples
 *      and pages with free space, and calculates statistics on the number
 *      of live tuples in the heap.  When done, or when we run low on space
 *      for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 *
 *      If there are no indexes then we just vacuum each dirty page as we
 *      process it, since there's no point in gathering many tuples.
 *
 *      onerel          heap relation to scan; every block from 0 up to
 *                      RelationGetNumberOfBlocks(onerel) is visited
 *      vacrelstats     working state and results; this function fills in
 *                      rel_pages, nonempty_pages, rel_tuples and
 *                      tuples_deleted, and bumps num_index_scans after each
 *                      index-vacuum pass
 *      Irel, nindexes  the indexes of onerel (Irel[0 .. nindexes-1])
 *      updated_stats   handed through unchanged to lazy_cleanup_index()
 *
 *      Each page is read and processed between MIRROREDLOCK_BUFMGR_LOCK and
 *      MIRROREDLOCK_BUFMGR_UNLOCK; every path that leaves the per-page work
 *      (including the "continue" paths) releases it.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
               Relation *Irel, int nindexes, List *updated_stats)
{
    MIRROREDLOCK_BUFMGR_DECLARE;

    BlockNumber nblocks,
                blkno;
    HeapTupleData tuple;
    char       *relname;
    BlockNumber empty_pages,
                vacuumed_pages;
    double      num_tuples,     /* tuples kept (not removable) */
                tups_vacuumed,  /* tuples pruned or recorded for removal */
                nkeep,          /* dead tuples that cannot be removed yet */
                nunused;        /* unused line pointers seen */
    IndexBulkDeleteResult **indstats;
    int         i;
    /* NOTE(review): incremented after each index pass but never read here */
    int         reindex_count = 1;
    PGRUsage    ru0;

    /* Fetch gp_persistent_relation_node information that will be added to XLOG record. */
    RelationFetchGpRelationNodeForXLog(onerel);

    pg_rusage_init(&ru0);

    relname = RelationGetRelationName(onerel);
    ereport(elevel,
            (errmsg("vacuuming \"%s.%s\"",
                    get_namespace_name(RelationGetNamespace(onerel)),
                    relname)));

    empty_pages = vacuumed_pages = 0;
    num_tuples = tups_vacuumed = nkeep = nunused = 0;

    /* one (initially NULL) bulk-delete result slot per index */
    indstats = (IndexBulkDeleteResult **)
        palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

    nblocks = RelationGetNumberOfBlocks(onerel);
    vacrelstats->rel_pages = nblocks;
    vacrelstats->nonempty_pages = 0;

    lazy_space_alloc(vacrelstats, nblocks);

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf;
        Page        page;
        OffsetNumber offnum,
                    maxoff;
        bool        tupgone,
                    hastup;
        int         prev_dead_count;
        OffsetNumber frozen[MaxOffsetNumber];   /* offsets frozen on this page */
        int         nfrozen;

        vacuum_delay_point();

        /*
         * If we are close to overrunning the available space for dead-tuple
         * TIDs, pause and do a cycle of vacuuming before we tackle this page.
         */
        if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
            vacrelstats->num_dead_tuples > 0)
        {
            /* Remove index entries */
            for (i = 0; i < nindexes; i++)
                lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats);

            reindex_count++;

            /* Remove tuples from heap */
            lazy_vacuum_heap(onerel, vacrelstats);
            /* Forget the now-vacuumed tuples, and press on */
            vacrelstats->num_dead_tuples = 0;
            vacrelstats->num_index_scans++;
        }

        /* -------- MirroredLock ---------- */
        MIRROREDLOCK_BUFMGR_LOCK;

        buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy);

        /* We need buffer cleanup lock so that we can prune HOT chains. */
        LockBufferForCleanup(buf);

        page = BufferGetPage(buf);

        if (PageIsNew(page))
        {
            /*
             * An all-zeroes page could be left over if a backend extends the
             * relation but crashes before initializing the page. Reclaim such
             * pages for use.
             *
             * We have to be careful here because we could be looking at a
             * page that someone has just added to the relation and not yet
             * been able to initialize (see RelationGetBufferForTuple). To
             * protect against that, release the buffer lock, grab the
             * relation extension lock momentarily, and re-lock the buffer. If
             * the page is still uninitialized by then, it must be left over
             * from a crashed backend, and we can initialize it.
             *
             * We don't really need the relation lock when this is a new or
             * temp relation, but it's probably not worth the code space to
             * check that, since this surely isn't a critical path.
             *
             * Note: the comparable code in vacuum.c need not worry because
             * it's got exclusive lock on the whole relation.
             */
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);

            /* the MirroredLock is dropped while waiting on the extension lock */
            MIRROREDLOCK_BUFMGR_UNLOCK;
            /* -------- MirroredLock ---------- */

            LockRelationForExtension(onerel, ExclusiveLock);
            UnlockRelationForExtension(onerel, ExclusiveLock);

            /* -------- MirroredLock ---------- */
            MIRROREDLOCK_BUFMGR_LOCK;

            LockBufferForCleanup(buf);
            if (PageIsNew(page))
            {
                ereport(WARNING,
                        (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
                                relname, blkno)));
                PageInit(page, BufferGetPageSize(buf), 0);

                /* must record in xlog so that changetracking will know about this change */
                log_heap_newpage(onerel, page, blkno);

                empty_pages++;
                lazy_record_free_space(vacrelstats, blkno,
                                       PageGetHeapFreeSpace(page));
            }
            /* the buffer is marked dirty whether or not we re-initialized it */
            MarkBufferDirty(buf);
            UnlockReleaseBuffer(buf);

            MIRROREDLOCK_BUFMGR_UNLOCK;
            /* -------- MirroredLock ---------- */

            continue;
        }

        if (PageIsEmpty(page))
        {
            /* initialized page with no line pointers: just record its space */
            empty_pages++;
            lazy_record_free_space(vacrelstats, blkno,
                                   PageGetHeapFreeSpace(page));
            UnlockReleaseBuffer(buf);

            MIRROREDLOCK_BUFMGR_UNLOCK;
            /* -------- MirroredLock ---------- */

            continue;
        }

        /*
         * Prune all HOT-update chains in this page.
         *
         * We count tuples removed by the pruning step as removed by VACUUM.
         */
        tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin,
                                         false, false);

        /*
         * Now scan the page to collect vacuumable items and check for tuples
         * requiring freezing.
         */
        nfrozen = 0;
        hastup = false;
        /* remembered so we can tell later whether this page added dead TIDs */
        prev_dead_count = vacrelstats->num_dead_tuples;
        maxoff = PageGetMaxOffsetNumber(page);
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid;

            itemid = PageGetItemId(page, offnum);

            /* Unused items require no processing, but we count 'em */
            if (!ItemIdIsUsed(itemid))
            {
                nunused += 1;
                continue;
            }

            /* Redirect items mustn't be touched */
            if (ItemIdIsRedirected(itemid))
            {
                hastup = true;  /* this page won't be truncatable */
                continue;
            }

            ItemPointerSet(&(tuple.t_self), blkno, offnum);

            /*
             * DEAD item pointers are to be vacuumed normally; but we don't
             * count them in tups_vacuumed, else we'd be double-counting (at
             * least in the common case where heap_page_prune() just freed up
             * a non-HOT tuple).
             */
            if (ItemIdIsDead(itemid))
            {
                lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
                continue;
            }

            Assert(ItemIdIsNormal(itemid));

            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple.t_len = ItemIdGetLength(itemid);

            tupgone = false;

            switch (HeapTupleSatisfiesVacuum(onerel, tuple.t_data, OldestXmin, buf))
            {
                case HEAPTUPLE_DEAD:

                    /*
                     * Ordinarily, DEAD tuples would have been removed by
                     * heap_page_prune(), but it's possible that the tuple
                     * state changed since heap_page_prune() looked.  In
                     * particular an INSERT_IN_PROGRESS tuple could have
                     * changed to DEAD if the inserter aborted.  So this
                     * cannot be considered an error condition.
                     *
                     * If the tuple is HOT-updated then it must only be
                     * removed by a prune operation; so we keep it just as if
                     * it were RECENTLY_DEAD.  Also, if it's a heap-only
                     * tuple, we choose to keep it, because it'll be a lot
                     * cheaper to get rid of it in the next pruning pass than
                     * to treat it like an indexed tuple.
                     */
                    if (HeapTupleIsHotUpdated(&tuple) ||
                        HeapTupleIsHeapOnly(&tuple))
                        nkeep += 1;
                    else
                        tupgone = true; /* we can delete the tuple */
                    break;
                case HEAPTUPLE_LIVE:
                    /* Tuple is good --- but let's do some validity checks */
                    if (onerel->rd_rel->relhasoids &&
                        !OidIsValid(HeapTupleGetOid(&tuple)))
                        elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
                             relname, blkno, offnum);
                    break;
                case HEAPTUPLE_RECENTLY_DEAD:

                    /*
                     * If tuple is recently deleted then we must not remove it
                     * from relation.
                     */
                    nkeep += 1;
                    break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:
                    /* This is an expected case during concurrent vacuum */
                    break;
                case HEAPTUPLE_DELETE_IN_PROGRESS:
                    /* This is an expected case during concurrent vacuum */
                    break;
                default:
                    elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                    break;
            }

            if (tupgone)
            {
                lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
                tups_vacuumed += 1;
            }
            else
            {
                num_tuples += 1;
                hastup = true;

                /*
                 * Each non-removable tuple must be checked to see if it needs
                 * freezing.  Note we already have exclusive buffer lock.
                 */
                if (heap_freeze_tuple(tuple.t_data, &FreezeLimit,
                                      InvalidBuffer, false))
                    frozen[nfrozen++] = offnum;
            }
        }                       /* scan along page */

        /*
         * If we froze any tuples, mark the buffer dirty, and write a WAL
         * record recording the changes.  We must log the changes to be
         * crash-safe against future truncation of CLOG.
         */
        if (nfrozen > 0)
        {
            MarkBufferDirty(buf);
            /* no XLOG for temp tables, though */
            if (!onerel->rd_istemp)
            {
                XLogRecPtr  recptr;

                recptr = log_heap_freeze(onerel, buf, FreezeLimit,
                                         frozen, nfrozen);
                PageSetLSN(page, recptr);
            }
        }

        /*
         * If there are no indexes then we can vacuum the page right now
         * instead of doing a second scan.
         */
        if (nindexes == 0 &&
            vacrelstats->num_dead_tuples > 0)
        {
            /* Remove tuples from heap */
            lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
            /* Forget the now-vacuumed tuples, and press on */
            vacrelstats->num_dead_tuples = 0;
            vacuumed_pages++;
        }

        /*
         * If we remembered any tuples for deletion, then the page will be
         * visited again by lazy_vacuum_heap, which will compute and record
         * its post-compaction free space.  If not, then we're done with this
         * page, so remember its free space as-is.  (This path will always be
         * taken if there are no indexes.)
         */
        if (vacrelstats->num_dead_tuples == prev_dead_count)
        {
            lazy_record_free_space(vacrelstats, blkno,
                                   PageGetHeapFreeSpace(page));
        }

        /* Remember the location of the last page with nonremovable tuples */
        if (hastup)
            vacrelstats->nonempty_pages = blkno + 1;

        UnlockReleaseBuffer(buf);

        MIRROREDLOCK_BUFMGR_UNLOCK;
        /* -------- MirroredLock ---------- */

    }

    /* save stats for use later */
    vacrelstats->rel_tuples = num_tuples;
    vacrelstats->tuples_deleted = tups_vacuumed;

    /* If any tuples need to be deleted, perform final vacuum cycle */
    /* XXX put a threshold on min number of tuples here? */
    if (vacrelstats->num_dead_tuples > 0)
    {
        /* Remove index entries */
        for (i = 0; i < nindexes; i++)
            lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats);

        reindex_count++;

        /* Remove tuples from heap */
        lazy_vacuum_heap(onerel, vacrelstats);
        vacrelstats->num_index_scans++;
    }

    /* Do post-vacuum cleanup and statistics update for each index */
    for (i = 0; i < nindexes; i++)
        lazy_cleanup_index(Irel[i], indstats[i], vacrelstats, updated_stats);

    /* If no indexes, make log report that lazy_vacuum_heap would've made */
    if (vacuumed_pages)
        ereport(elevel,
                (errmsg("\"%s\": removed %.0f row versions in %u pages",
                        RelationGetRelationName(onerel),
                        tups_vacuumed, vacuumed_pages)));

    ereport(elevel,
            (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
                    RelationGetRelationName(onerel),
                    tups_vacuumed, num_tuples, nblocks),
             errdetail("%.0f dead row versions cannot be removed yet.\n"
                       "There were %.0f unused item pointers.\n"
                       "%u pages contain useful free space.\n"
                       "%u pages are entirely empty.\n"
                       "%s.",
                       nkeep,
                       nunused,
                       vacrelstats->tot_free_pages,
                       empty_pages,
                       pg_rusage_show(&ru0))));
}
/*
 *  _hash_squeezebucket(rel, bucket)
 *
 *  Try to squeeze the tuples onto pages occurring earlier in the
 *  bucket chain in an attempt to free overflow pages. When we start
 *  the "squeezing", the page from which we start taking tuples (the
 *  "read" page) is the last page in the bucket chain and the page
 *  onto which we start squeezing tuples (the "write" page) is the
 *  first page in the bucket chain.  The read page works backward and
 *  the write page works forward; the procedure terminates when the
 *  read page and write page are the same page.
 *
 *  At completion of this procedure, it is guaranteed that all pages in
 *  the bucket are nonempty, unless the bucket is totally empty (in
 *  which case all overflow pages will be freed).  The original implementation
 *  required that to be true on entry as well, but it's a lot easier for
 *  callers to leave empty overflow pages and let this guy clean it up.
 *
 *  Caller must hold exclusive lock on the target bucket.  This allows
 *  us to safely lock multiple pages in the bucket.
 *
 *  Since this function is invoked in VACUUM, we provide an access strategy
 *  parameter that controls fetches of the bucket pages.
 *
 *  rel             the hash index
 *  bucket          bucket number; only used to cross-check hasho_bucket
 *                  of each overflow page in assertions
 *  bucket_blkno    block number of the bucket's primary page
 *  bstrategy       buffer access strategy passed to every page fetch and
 *                  to _hash_freeovflpage()
 */
void
_hash_squeezebucket(Relation rel,
                    Bucket bucket,
                    BlockNumber bucket_blkno,
                    BufferAccessStrategy bstrategy)
{
    BlockNumber wblkno;         /* current "write" page */
    BlockNumber rblkno;         /* current "read" page */
    Buffer      wbuf;
    Buffer      rbuf;
    Page        wpage;
    Page        rpage;
    HashPageOpaque wopaque;
    HashPageOpaque ropaque;
    bool        wbuf_dirty;     /* has a tuple been added to wbuf? */

    /*
     * start squeezing into the base bucket page.
     */
    wblkno = bucket_blkno;
    wbuf = _hash_getbuf_with_strategy(rel,
                                      wblkno,
                                      HASH_WRITE,
                                      LH_BUCKET_PAGE,
                                      bstrategy);
    wpage = BufferGetPage(wbuf);
    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

    /*
     * if there aren't any overflow pages, there's nothing to squeeze.
     */
    if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
    {
        _hash_relbuf(rel, wbuf);
        return;
    }

    /*
     * Find the last page in the bucket chain by starting at the base bucket
     * page and working forward.  Note: we assume that a hash bucket chain is
     * usually smaller than the buffer ring being used by VACUUM, else using
     * the access strategy here would be counterproductive.
     */
    rbuf = InvalidBuffer;
    ropaque = wopaque;
    do
    {
        rblkno = ropaque->hasho_nextblkno;
        /* drop the previous overflow page; wbuf itself is never released here */
        if (rbuf != InvalidBuffer)
            _hash_relbuf(rel, rbuf);
        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));

    /*
     * squeeze the tuples.
     */
    wbuf_dirty = false;
    for (;;)
    {
        OffsetNumber roffnum;
        OffsetNumber maxroffnum;
        /* offsets on the read page whose tuples were copied to a write page */
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable = 0;

        /* Scan each tuple in "read" page */
        maxroffnum = PageGetMaxOffsetNumber(rpage);
        for (roffnum = FirstOffsetNumber;
             roffnum <= maxroffnum;
             roffnum = OffsetNumberNext(roffnum))
        {
            IndexTuple  itup;
            Size        itemsz;

            itup = (IndexTuple) PageGetItem(rpage,
                                            PageGetItemId(rpage, roffnum));
            itemsz = IndexTupleDSize(*itup);
            itemsz = MAXALIGN(itemsz);

            /*
             * Walk up the bucket chain, looking for a page big enough for
             * this item.  Exit if we reach the read page.
             */
            while (PageGetFreeSpace(wpage) < itemsz)
            {
                Assert(!PageIsEmpty(wpage));

                wblkno = wopaque->hasho_nextblkno;
                Assert(BlockNumberIsValid(wblkno));

                /* done with this write page: write it if modified, else release */
                if (wbuf_dirty)
                    _hash_wrtbuf(rel, wbuf);
                else
                    _hash_relbuf(rel, wbuf);

                /* nothing more to do if we reached the read page */
                if (rblkno == wblkno)
                {
                    if (ndeletable > 0)
                    {
                        /* Delete tuples we already moved off read page */
                        PageIndexMultiDelete(rpage, deletable, ndeletable);
                        _hash_wrtbuf(rel, rbuf);
                    }
                    else
                        _hash_relbuf(rel, rbuf);
                    return;
                }

                wbuf = _hash_getbuf_with_strategy(rel,
                                                  wblkno,
                                                  HASH_WRITE,
                                                  LH_OVERFLOW_PAGE,
                                                  bstrategy);
                wpage = BufferGetPage(wbuf);
                wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
                Assert(wopaque->hasho_bucket == bucket);
                wbuf_dirty = false;
            }

            /*
             * we have found room so insert on the "write" page, being careful
             * to preserve hashkey ordering.  (If we insert many tuples into
             * the same "write" page it would be worth qsort'ing instead of
             * doing repeated _hash_pgaddtup.)
             */
            (void) _hash_pgaddtup(rel, wbuf, itemsz, itup);
            wbuf_dirty = true;

            /* remember tuple for deletion from "read" page */
            deletable[ndeletable++] = roffnum;
        }

        /*
         * If we reach here, there are no live tuples on the "read" page ---
         * it was empty when we got to it, or we moved them all.  So we can
         * just free the page without bothering with deleting tuples
         * individually.  Then advance to the previous "read" page.
         *
         * Tricky point here: if our read and write pages are adjacent in the
         * bucket chain, our write lock on wbuf will conflict with
         * _hash_freeovflpage's attempt to update the sibling links of the
         * removed page.  However, in that case we are done anyway, so we can
         * simply drop the write lock before calling _hash_freeovflpage.
         */
        rblkno = ropaque->hasho_prevblkno;
        Assert(BlockNumberIsValid(rblkno));

        /* are we freeing the page adjacent to wbuf? */
        if (rblkno == wblkno)
        {
            /* yes, so release wbuf lock first */
            if (wbuf_dirty)
                _hash_wrtbuf(rel, wbuf);
            else
                _hash_relbuf(rel, wbuf);
            /* free this overflow page (releases rbuf) */
            _hash_freeovflpage(rel, rbuf, bstrategy);
            /* done */
            return;
        }

        /* free this overflow page, then get the previous one */
        _hash_freeovflpage(rel, rbuf, bstrategy);

        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    }

    /* NOTREACHED */
}
/*
 * gistVacuumUpdate() -- recursively process the GiST subtree rooted at blkno.
 *
 * The page is read and locked exclusively.  For an internal page the
 * function works on a copy of the page: it recurses into every child, drops
 * the downlink of each child that reported a change, and collects the
 * replacement tuples returned by the children into "addon".  The collected
 * tuples are then put back on the copy, or the page is split through
 * vacuumSplitPage() when they no longer fit.
 *
 *  gv          vacuum state (index relation, buffer access strategy)
 *  blkno       block number of the page to process
 *  needunion   true if the caller wants a fresh union key for this page
 *
 * Result (ArrayTuple):
 *  - emptypage is set when a non-root page is (or has become) empty; the
 *    page itself is then left unchanged for the caller to deal with.
 *  - otherwise, when a union key is needed and the page is not empty,
 *    itup/ituplen hold a single key built by PageMakeUnionKey().
 *  - after a split, the result is whatever vacuumSplitPage() returned.
 */
static ArrayTuple
gistVacuumUpdate(GistVacuum *gv, BlockNumber blkno, bool needunion)
{
    ArrayTuple  res = {NULL, 0, false};
    Buffer      buffer;
    Page        page,
                tempPage = NULL;    /* working copy, internal pages only */
    OffsetNumber i,
                maxoff;
    ItemId      iid;
    int         lenaddon = 4,       /* allocated length of addon[] */
                curlenaddon = 0,    /* used length of addon[] */
                nOffToDelete = 0,
                nBlkToDelete = 0;
    IndexTuple  idxtuple,
               *addon = NULL;       /* replacement tuples from children */
    bool        needwrite = false;
    OffsetNumber offToDelete[MaxOffsetNumber];  /* downlinks removed from page */
    BlockNumber blkToDelete[MaxOffsetNumber];   /* children reported empty */
    ItemPointerData *completed = NULL;  /* TIDs of tuples from split children */
    int         ncompleted = 0,
                lencompleted = 16;

    vacuum_delay_point();

    buffer = ReadBufferWithStrategy(gv->index, blkno, gv->strategy);
    LockBuffer(buffer, GIST_EXCLUSIVE);
    gistcheckpage(gv->index, buffer);
    page = (Page) BufferGetPage(buffer);
    maxoff = PageGetMaxOffsetNumber(page);

    if (GistPageIsLeaf(page))
    {
        /* a leaf only needs attention if it is flagged as having deletions */
        if (GistTuplesDeleted(page))
            needunion = needwrite = true;
    }
    else
    {
        completed = (ItemPointerData *) palloc(sizeof(ItemPointerData) * lencompleted);
        addon = (IndexTuple *) palloc(sizeof(IndexTuple) * lenaddon);

        /* get a copy of the page to work on */
        tempPage = GistPageGetCopyPage(page);

        for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
        {
            ArrayTuple  chldtuple;
            bool        needchildunion;

            iid = PageGetItemId(tempPage, i);
            idxtuple = (IndexTuple) PageGetItem(tempPage, iid);
            /* a downlink marked invalid forces the child to rebuild its key */
            needchildunion = (GistTupleIsInvalid(idxtuple)) ? true : false;

            if (needchildunion)
                elog(DEBUG2, "gistVacuumUpdate: need union for block %u",
                     ItemPointerGetBlockNumber(&(idxtuple->t_tid)));

            chldtuple = gistVacuumUpdate(gv, ItemPointerGetBlockNumber(&(idxtuple->t_tid)),
                                         needchildunion);
            if (chldtuple.ituplen || chldtuple.emptypage)
            {
                /* the downlink must be replaced and/or removed */
                if (chldtuple.emptypage)
                    blkToDelete[nBlkToDelete++] = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

                offToDelete[nOffToDelete++] = i;
                PageIndexTupleDelete(tempPage, i);
                /* later items shifted down by one; revisit the same offset */
                i--;
                maxoff--;
                needwrite = needunion = true;

                if (chldtuple.ituplen)
                {

                    Assert(chldtuple.emptypage == false);
                    /* grow addon[] until the child's tuples fit */
                    while (curlenaddon + chldtuple.ituplen >= lenaddon)
                    {
                        lenaddon *= 2;
                        addon = (IndexTuple *) repalloc(addon, sizeof(IndexTuple) * lenaddon);
                    }

                    memcpy(addon + curlenaddon, chldtuple.itup, chldtuple.ituplen * sizeof(IndexTuple));

                    curlenaddon += chldtuple.ituplen;

                    if (chldtuple.ituplen > 1)
                    {
                        /*
                         * the child was split, so we need to mark the
                         * insert (split) as completed
                         */
                        int         j;

                        while (ncompleted + chldtuple.ituplen > lencompleted)
                        {
                            lencompleted *= 2;
                            completed = (ItemPointerData *) repalloc(completed, sizeof(ItemPointerData) * lencompleted);
                        }
                        for (j = 0; j < chldtuple.ituplen; j++)
                        {
                            ItemPointerCopy(&(chldtuple.itup[j]->t_tid), completed + ncompleted);
                            ncompleted++;
                        }
                    }
                    /* only the array is freed; the tuples now live in addon[] */
                    pfree(chldtuple.itup);
                }
            }
        }

        Assert(maxoff == PageGetMaxOffsetNumber(tempPage));

        if (curlenaddon)
        {
            /* insert updated tuples */
            if (gistnospace(tempPage, addon, curlenaddon, InvalidOffsetNumber, 0))
            {
                /* there is no space on page to insert tuples */
                res = vacuumSplitPage(gv, tempPage, buffer, addon, curlenaddon);
                tempPage = NULL;    /* vacuumSplitPage() frees tempPage */
                needwrite = needunion = false;  /* gistSplit already forms
                                                 * unions and writes pages */
            }
            else
                /* enough free space */
                gistfillbuffer(gv->index, tempPage, addon, curlenaddon, InvalidOffsetNumber);
        }
    }

    /*
     * If page is empty, we should remove pointer to it before deleting page
     * (except root)
     */

    if (blkno != GIST_ROOT_BLKNO && (PageIsEmpty(page) || (tempPage && PageIsEmpty(tempPage))))
    {
        /*
         * New version of page is empty, so leave it unchanged, upper call
         * will mark our page as deleted. In case of page split we never will
         * be here...
         *
         * If page was empty it can't become non-empty during processing
         */
        res.emptypage = true;
        UnlockReleaseBuffer(buffer);
    }
    else
    {
        /* write the page and remove its children if needed */

        START_CRIT_SECTION();

        if (tempPage && needwrite)
        {
            /* install the modified copy as the real page contents */
            PageRestoreTempPage(tempPage, page);
            tempPage = NULL;
        }

        /* Empty index */
        if (PageIsEmpty(page) && blkno == GIST_ROOT_BLKNO)
        {
            needwrite = true;
            GistPageSetLeaf(page);
        }


        if (needwrite)
        {
            MarkBufferDirty(buffer);
            GistClearTuplesDeleted(page);

            if (!gv->index->rd_istemp)
            {
                XLogRecData *rdata;
                XLogRecPtr  recptr;
                char       *xlinfo;

                rdata = formUpdateRdata(gv->index->rd_node, buffer,
                                        offToDelete, nOffToDelete,
                                        addon, curlenaddon, NULL);
                /* remembered so it can be freed after the record is inserted */
                xlinfo = rdata->next->data;

                recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_UPDATE, rdata);
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);

                pfree(xlinfo);
                pfree(rdata);
            }
            else
                PageSetLSN(page, XLogRecPtrForTemp);
        }

        END_CRIT_SECTION();

        if (needunion && !PageIsEmpty(page))
        {
            /* hand a fresh union key for this page back to the caller */
            res.itup = (IndexTuple *) palloc(sizeof(IndexTuple));
            res.ituplen = 1;
            res.itup[0] = PageMakeUnionKey(gv, buffer);
        }

        UnlockReleaseBuffer(buffer);

        /*
         * delete empty children; by now this page no longer has any links
         * pointing to those subtrees
         */
        for (i = 0; i < nBlkToDelete; i++)
            gistDeleteSubtree(gv, blkToDelete[i]);

        if (ncompleted && !gv->index->rd_istemp)
            gistxlogInsertCompletion(gv->index->rd_node, completed, ncompleted);
    }


    /* release working storage */
    for (i = 0; i < curlenaddon; i++)
        pfree(addon[i]);
    if (addon)
        pfree(addon);
    if (completed)
        pfree(completed);
    if (tempPage)
        pfree(tempPage);

    return res;
}
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. 
* * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. */ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. 
*/ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. 
* * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); if (!PageIsAllVisible(page)) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. 
*/ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. 
*/ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. 
*/ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* Update the all-visible flag on the page */ if (!PageIsAllVisible(page) && all_visible) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. * * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. 
*/ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); SetBufferCommitInfoNeedsSave(buf); /* * Normally, we would drop the lock on the heap page before * updating the visibility map, but since this case shouldn't * happen anyway, don't worry about that. */ visibilitymap_clear(onerel, blkno); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm && all_visible) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? 
*/ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Release the pin on the visibility map page */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
/*
 *	_hash_squeezebucket(rel, bucket, bucket_blkno)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages. When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last page in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this guy clean it up.
 *
 *	Parameters:
 *		rel			 - the index relation; passed through to every buffer
 *					   and page helper called here.
 *		bucket		 - the bucket being squeezed; used here only in
 *					   assertions that each overflow page visited carries
 *					   this bucket in hasho_bucket.
 *		bucket_blkno - block number of the bucket's primary page, which is
 *					   where the "write" side starts.
 *
 *	No value is returned.  On every return path both the read and the write
 *	buffer have been handed back via _hash_relbuf, _hash_wrtbuf or
 *	_hash_freeovflpage.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 */
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno)
{
	Buffer		wbuf;
	Buffer		rbuf = 0;		/* assigned on the first pass of the
								 * chain-walk loop below, before any use */
	BlockNumber wblkno;
	BlockNumber rblkno;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;
	OffsetNumber woffnum;
	OffsetNumber roffnum;
	IndexTuple	itup;
	Size		itemsz;

	/*
	 * start squeezing into the base bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
	_hash_checkpage(rel, wbuf, LH_BUCKET_PAGE);
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		_hash_relbuf(rel, wbuf);
		return;
	}

	/*
	 * find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.
	 *
	 * On the first pass ropaque still aliases wopaque, so no read buffer
	 * exists yet and nothing is released; on later passes the previously
	 * visited overflow page is released before the next one is fetched.  The
	 * write buffer stays held throughout.
	 */
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		if (ropaque != wopaque)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
		_hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 *
	 * roffnum is only ever set to FirstOffsetNumber: each pass moves the
	 * first remaining tuple of the read page, and the deletion below shifts
	 * the next tuple into that slot.  The loop has no exit condition of its
	 * own; it ends through one of the two "return" statements inside.
	 */
	roffnum = FirstOffsetNumber;
	for (;;)
	{
		/* this test is needed in case page is empty on entry */
		if (roffnum <= PageGetMaxOffsetNumber(rpage))
		{
			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			/* space needed on the write page: tuple size, max-aligned */
			itemsz = IndexTupleDSize(*itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item.  Exit if we reach the read page.
			 */
			while (PageGetFreeSpace(wpage) < itemsz)
			{
				Assert(!PageIsEmpty(wpage));

				/* fetch the successor's block number before giving up wbuf */
				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				_hash_wrtbuf(rel, wbuf);

				/*
				 * The next write candidate is the read page itself, so no
				 * earlier page can take this tuple: squeezing is finished.
				 */
				if (rblkno == wblkno)
				{
					/* wbuf is already released */
					_hash_wrtbuf(rel, rbuf);
					return;
				}

				wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
				_hash_checkpage(rel, wbuf, LH_OVERFLOW_PAGE);
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
			}

			/*
			 * we have found room so insert on the "write" page.  The tuple
			 * is appended after the write page's current last item.
			 */
			woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
			if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, LP_USED)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));

			/*
			 * delete the tuple from the "read" page. PageIndexTupleDelete
			 * repacks the ItemId array, so 'roffnum' will be "advanced" to
			 * the "next" ItemId.
			 */
			PageIndexTupleDelete(rpage, roffnum);
		}

		/*
		 * if the "read" page is now empty because of the deletion (or because
		 * it was empty when we got to it), free it.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  However, in that case we are done anyway, so we can
		 * simply drop the write lock before calling _hash_freeovflpage.
		 */
		if (PageIsEmpty(rpage))
		{
			/* remember the predecessor before the read page is freed */
			rblkno = ropaque->hasho_prevblkno;
			Assert(BlockNumberIsValid(rblkno));

			/* are we freeing the page adjacent to wbuf? */
			if (rblkno == wblkno)
			{
				/* yes, so release wbuf lock first */
				_hash_wrtbuf(rel, wbuf);
				/* free this overflow page (releases rbuf) */
				_hash_freeovflpage(rel, rbuf);
				/* done */
				return;
			}

			/* free this overflow page, then get the previous one */
			_hash_freeovflpage(rel, rbuf);

			/*
			 * The predecessor is checked as an overflow page: it is not the
			 * current write page (tested just above).
			 */
			rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
			_hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE);
			rpage = BufferGetPage(rbuf);
			ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
			Assert(ropaque->hasho_bucket == bucket);

			/* restart from the first tuple of the new read page */
			roffnum = FirstOffsetNumber;
		}
	}

	/* NOTREACHED */
}