/*
 * Deletes pending list pages up to (not including) newHead page.
 * If newHead == InvalidBlockNumber then function drops the whole list.
 *
 * metapage is pinned and exclusive-locked throughout this function.
 *
 * Returns true if another cleanup process is running concurrently
 * (if so, we can just abandon our own efforts)
 */
static bool
shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
          bool fill_fsm, IndexBulkDeleteResult *stats)
{
    Page        metapage;
    GinMetaPageData *metadata;
    BlockNumber blknoToDelete;

    metapage = BufferGetPage(metabuffer);
    metadata = GinPageGetMeta(metapage);
    blknoToDelete = metadata->head;

    do
    {
        Page        page;
        int         i;
        int64       nDeletedHeapTuples = 0;
        ginxlogDeleteListPages data;
        Buffer      buffers[GIN_NDELETE_AT_ONCE];
        BlockNumber freespace[GIN_NDELETE_AT_ONCE];

        data.ndeleted = 0;
        while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead)
        {
            freespace[data.ndeleted] = blknoToDelete;
            buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete);
            LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE);
            page = BufferGetPage(buffers[data.ndeleted]);

            data.ndeleted++;

            if (GinPageIsDeleted(page))
            {
                /* concurrent cleanup process is detected */
                for (i = 0; i < data.ndeleted; i++)
                    UnlockReleaseBuffer(buffers[i]);

                return true;
            }

            nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff;
            blknoToDelete = GinPageGetOpaque(page)->rightlink;
        }

        if (stats)
            stats->pages_deleted += data.ndeleted;

        /*
         * This operation touches an unusually large number of pages, so
         * prepare the XLogInsert machinery for that before entering the
         * critical section.
         */
        if (RelationNeedsWAL(index))
            XLogEnsureRecordSpace(data.ndeleted, 0);

        START_CRIT_SECTION();

        metadata->head = blknoToDelete;

        Assert(metadata->nPendingPages >= data.ndeleted);
        metadata->nPendingPages -= data.ndeleted;
        Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples);
        metadata->nPendingHeapTuples -= nDeletedHeapTuples;

        if (blknoToDelete == InvalidBlockNumber)
        {
            metadata->tail = InvalidBlockNumber;
            metadata->tailFreeSize = 0;
            metadata->nPendingPages = 0;
            metadata->nPendingHeapTuples = 0;
        }

        MarkBufferDirty(metabuffer);

        for (i = 0; i < data.ndeleted; i++)
        {
            page = BufferGetPage(buffers[i]);
            GinPageGetOpaque(page)->flags = GIN_DELETED;
            MarkBufferDirty(buffers[i]);
        }

        if (RelationNeedsWAL(index))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT);
            for (i = 0; i < data.ndeleted; i++)
                XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT);

            memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));

            XLogRegisterData((char *) &data, sizeof(ginxlogDeleteListPages));

            recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE);
            PageSetLSN(metapage, recptr);

            for (i = 0; i < data.ndeleted; i++)
            {
                page = BufferGetPage(buffers[i]);
                PageSetLSN(page, recptr);
            }
        }

        for (i = 0; i < data.ndeleted; i++)
            UnlockReleaseBuffer(buffers[i]);

        END_CRIT_SECTION();

        for (i = 0; fill_fsm && i < data.ndeleted; i++)
            RecordFreeIndexPage(index, freespace[i]);
    } while (blknoToDelete != newHead);

    return false;
}
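
/*
 * Illustrative sketch (not part of the original source): the batching
 * pattern used by shiftList above, reduced to a plain singly linked list.
 * Nodes are consumed from the head up to (but not including) 'stop', at
 * most BATCH_SIZE per iteration, mirroring how the pending list is
 * truncated GIN_NDELETE_AT_ONCE pages at a time inside one critical
 * section.  All names below (PendingNode, BATCH_SIZE, free_node,
 * truncate_list) are hypothetical and exist only for this example.
 */
#include <stdlib.h>

#define BATCH_SIZE 16

typedef struct PendingNode
{
    struct PendingNode *next;
} PendingNode;

static void
free_node(PendingNode *node)
{
    /* stand-in for marking a page deleted and releasing its buffer */
    free(node);
}

static PendingNode *
truncate_list(PendingNode *head, PendingNode *stop)
{
    while (head != stop)
    {
        PendingNode *batch[BATCH_SIZE];
        int         n = 0;
        int         i;

        /* collect up to BATCH_SIZE nodes, advancing the surviving head */
        while (n < BATCH_SIZE && head != stop)
        {
            batch[n++] = head;
            head = head->next;
        }

        /*
         * In shiftList this is where the metapage is updated and the whole
         * batch is WAL-logged inside a single critical section; here we
         * simply free the nodes.
         */
        for (i = 0; i < n; i++)
            free_node(batch[i]);
    }

    return head;                /* equals 'stop', the new list head */
}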
/*
 *  _hash_squeezebucket(rel, bucket)
 *
 *  Try to squeeze the tuples onto pages occurring earlier in the
 *  bucket chain in an attempt to free overflow pages.  When we start
 *  the "squeezing", the page from which we start taking tuples (the
 *  "read" page) is the last page in the bucket chain and the page
 *  onto which we start squeezing tuples (the "write" page) is the
 *  first page in the bucket chain.  The read page works backward and
 *  the write page works forward; the procedure terminates when the
 *  read page and write page are the same page.
 *
 *  At completion of this procedure, it is guaranteed that all pages in
 *  the bucket are nonempty, unless the bucket is totally empty (in
 *  which case all overflow pages will be freed).  The original
 *  implementation required that to be true on entry as well, but it's a
 *  lot easier for callers to leave empty overflow pages and let this
 *  routine clean them up.
 *
 *  Caller must acquire cleanup lock on the primary page of the target
 *  bucket to exclude any scans that are in progress, which could easily
 *  be confused into returning the same tuple more than once or some tuples
 *  not at all by the rearrangement we are performing here.  To prevent
 *  any concurrent scan from crossing the squeeze scan we use lock chaining
 *  similar to hashbucketcleanup.  Refer to the comments atop
 *  hashbucketcleanup.
 *
 *  We need to retain a pin on the primary bucket to ensure that no
 *  concurrent split can start.
 *
 *  Since this function is invoked in VACUUM, we provide an access strategy
 *  parameter that controls fetches of the bucket pages.
 */
void
_hash_squeezebucket(Relation rel,
                    Bucket bucket,
                    BlockNumber bucket_blkno,
                    Buffer bucket_buf,
                    BufferAccessStrategy bstrategy)
{
    BlockNumber wblkno;
    BlockNumber rblkno;
    Buffer      wbuf;
    Buffer      rbuf;
    Page        wpage;
    Page        rpage;
    HashPageOpaque wopaque;
    HashPageOpaque ropaque;

    /*
     * start squeezing into the primary bucket page.
     */
    wblkno = bucket_blkno;
    wbuf = bucket_buf;
    wpage = BufferGetPage(wbuf);
    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

    /*
     * if there aren't any overflow pages, there's nothing to squeeze. caller
     * is responsible for releasing the pin on primary bucket page.
     */
    if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
    {
        LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
        return;
    }

    /*
     * Find the last page in the bucket chain by starting at the base bucket
     * page and working forward.  Note: we assume that a hash bucket chain is
     * usually smaller than the buffer ring being used by VACUUM, else using
     * the access strategy here would be counterproductive.
     */
    rbuf = InvalidBuffer;
    ropaque = wopaque;
    do
    {
        rblkno = ropaque->hasho_nextblkno;
        if (rbuf != InvalidBuffer)
            _hash_relbuf(rel, rbuf);
        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));

    /*
     * squeeze the tuples.
     */
    for (;;)
    {
        OffsetNumber roffnum;
        OffsetNumber maxroffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        IndexTuple  itups[MaxIndexTuplesPerPage];
        Size        tups_size[MaxIndexTuplesPerPage];
        OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
        uint16      ndeletable = 0;
        uint16      nitups = 0;
        Size        all_tups_size = 0;
        int         i;
        bool        retain_pin = false;

readpage:
        /* Scan each tuple in "read" page */
        maxroffnum = PageGetMaxOffsetNumber(rpage);
        for (roffnum = FirstOffsetNumber;
             roffnum <= maxroffnum;
             roffnum = OffsetNumberNext(roffnum))
        {
            IndexTuple  itup;
            Size        itemsz;

            /* skip dead tuples */
            if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
                continue;

            itup = (IndexTuple) PageGetItem(rpage,
                                            PageGetItemId(rpage, roffnum));
            itemsz = IndexTupleDSize(*itup);
            itemsz = MAXALIGN(itemsz);

            /*
             * Walk up the bucket chain, looking for a page big enough for
             * this item and all other accumulated items.  Exit if we reach
             * the read page.
             */
            while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
            {
                Buffer      next_wbuf = InvalidBuffer;
                bool        tups_moved = false;

                Assert(!PageIsEmpty(wpage));

                if (wblkno == bucket_blkno)
                    retain_pin = true;

                wblkno = wopaque->hasho_nextblkno;
                Assert(BlockNumberIsValid(wblkno));

                /* don't need to move to next page if we reached the read page */
                if (wblkno != rblkno)
                    next_wbuf = _hash_getbuf_with_strategy(rel,
                                                           wblkno,
                                                           HASH_WRITE,
                                                           LH_OVERFLOW_PAGE,
                                                           bstrategy);

                if (nitups > 0)
                {
                    Assert(nitups == ndeletable);

                    /*
                     * This operation needs to log multiple tuples, prepare
                     * WAL for that.
                     */
                    if (RelationNeedsWAL(rel))
                        XLogEnsureRecordSpace(0, 3 + nitups);

                    START_CRIT_SECTION();

                    /*
                     * we have to insert tuples on the "write" page, being
                     * careful to preserve hashkey ordering.  (If we insert
                     * many tuples into the same "write" page it would be
                     * worth qsort'ing them.)
                     */
                    _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
                    MarkBufferDirty(wbuf);

                    /* Delete tuples we already moved off read page */
                    PageIndexMultiDelete(rpage, deletable, ndeletable);
                    MarkBufferDirty(rbuf);

                    /* XLOG stuff */
                    if (RelationNeedsWAL(rel))
                    {
                        XLogRecPtr  recptr;
                        xl_hash_move_page_contents xlrec;

                        xlrec.ntups = nitups;
                        xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false;

                        XLogBeginInsert();
                        XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);

                        /*
                         * bucket buffer needs to be registered to ensure that
                         * we can acquire a cleanup lock on it during replay.
                         */
                        if (!xlrec.is_prim_bucket_same_wrt)
                            XLogRegisterBuffer(0, bucket_buf,
                                               REGBUF_STANDARD | REGBUF_NO_IMAGE);

                        XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
                        XLogRegisterBufData(1, (char *) itup_offsets,
                                            nitups * sizeof(OffsetNumber));
                        for (i = 0; i < nitups; i++)
                            XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);

                        XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
                        XLogRegisterBufData(2, (char *) deletable,
                                            ndeletable * sizeof(OffsetNumber));

                        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);

                        PageSetLSN(BufferGetPage(wbuf), recptr);
                        PageSetLSN(BufferGetPage(rbuf), recptr);
                    }

                    END_CRIT_SECTION();

                    tups_moved = true;
                }

                /*
                 * release the lock on previous page after acquiring the lock
                 * on next page
                 */
                if (retain_pin)
                    LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
                else
                    _hash_relbuf(rel, wbuf);

                /* nothing more to do if we reached the read page */
                if (rblkno == wblkno)
                {
                    _hash_relbuf(rel, rbuf);
                    return;
                }

                wbuf = next_wbuf;
                wpage = BufferGetPage(wbuf);
                wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
                Assert(wopaque->hasho_bucket == bucket);
                retain_pin = false;

                /* be tidy */
                for (i = 0; i < nitups; i++)
                    pfree(itups[i]);
                nitups = 0;
                all_tups_size = 0;
                ndeletable = 0;

                /*
                 * after moving the tuples, rpage would have been compacted,
                 * so we need to rescan it.
                 */
                if (tups_moved)
                    goto readpage;
            }

            /* remember tuple for deletion from "read" page */
            deletable[ndeletable++] = roffnum;

            /*
             * we need a copy of the index tuples, as they can be freed as
             * part of freeing the overflow page; we still need them to write
             * a WAL record in _hash_freeovflpage.
             */
            itups[nitups] = CopyIndexTuple(itup);
            tups_size[nitups++] = itemsz;
            all_tups_size += itemsz;
        }

        /*
         * If we reach here, there are no live tuples on the "read" page ---
         * it was empty when we got to it, or we moved them all.  So we can
         * just free the page without bothering with deleting tuples
         * individually.  Then advance to the previous "read" page.
         *
         * Tricky point here: if our read and write pages are adjacent in the
         * bucket chain, our write lock on wbuf will conflict with
         * _hash_freeovflpage's attempt to update the sibling links of the
         * removed page.  In that case, we don't need to lock it again.
         */
        rblkno = ropaque->hasho_prevblkno;
        Assert(BlockNumberIsValid(rblkno));

        /* free this overflow page (releases rbuf) */
        _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
                           tups_size, nitups, bstrategy);

        /* be tidy */
        for (i = 0; i < nitups; i++)
            pfree(itups[i]);

        /* are we freeing the page adjacent to wbuf? */
        if (rblkno == wblkno)
        {
            /* retain the pin on primary bucket page till end of bucket scan */
            if (wblkno == bucket_blkno)
                LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
            else
                _hash_relbuf(rel, wbuf);
            return;
        }

        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    }

    /* NOTREACHED */
}
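
/*
 * Illustrative sketch (not part of the original source): the read/write
 * cursor scheme used by _hash_squeezebucket above, reduced to an in-memory
 * doubly linked chain of fixed-capacity "pages".  Items are copied from the
 * tail ("read") page into the earliest page with room (the "write" page),
 * and emptied tail pages are unlinked, until the two cursors meet.  All
 * names below (ToyPage, TOY_CAPACITY, toy_free_page, toy_squeeze_chain) are
 * hypothetical; the locking, pinning and WAL logging done by the real code
 * are omitted here.
 */
#include <stdlib.h>

#define TOY_CAPACITY 4

typedef struct ToyPage
{
    struct ToyPage *prev;
    struct ToyPage *next;
    int         nitems;
    int         items[TOY_CAPACITY];
} ToyPage;

static void
toy_free_page(ToyPage *page)
{
    /* unlink from the doubly linked chain, then release */
    if (page->prev)
        page->prev->next = page->next;
    if (page->next)
        page->next->prev = page->prev;
    free(page);
}

static void
toy_squeeze_chain(ToyPage *head)
{
    ToyPage    *wpage = head;       /* "write" cursor, moves forward */
    ToyPage    *rpage = head;       /* "read" cursor, moves backward */

    /* find the last page in the chain */
    while (rpage->next)
        rpage = rpage->next;

    while (wpage != rpage)
    {
        /* move items from the read page while the write page has room */
        while (rpage->nitems > 0 && wpage->nitems < TOY_CAPACITY)
            wpage->items[wpage->nitems++] = rpage->items[--rpage->nitems];

        if (rpage->nitems == 0)
        {
            /* read page is empty: free it and step the read cursor back */
            ToyPage    *prev = rpage->prev;

            toy_free_page(rpage);
            rpage = prev;
        }
        else
        {
            /* write page is full: step the write cursor forward */
            wpage = wpage->next;
        }
    }
}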
/*
 * Place tuples from 'itup' to 'buffer'.  If 'oldoffnum' is valid, the tuple
 * at that offset is atomically removed along with inserting the new tuples.
 * This is used to replace a tuple with a new one.
 *
 * If 'leftchildbuf' is valid, we're inserting the downlink for the page
 * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
 * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
 *
 * If 'markfollowright' is true and the page is split, the left child is
 * marked with F_FOLLOW_RIGHT flag.  That is the normal case.  During buffered
 * index build, however, there is no concurrent access and the page splitting
 * is done in a slightly simpler fashion, and false is passed.
 *
 * If there is not enough room on the page, it is split.  All the split
 * pages are kept pinned and locked and returned in *splitinfo; the caller
 * is responsible for inserting the downlinks for them.  However, if
 * 'buffer' is the root page and it needs to be split, gistplacetopage()
 * performs the split as one atomic operation, and *splitinfo is set to NIL.
 * In that case, we continue to hold the root page locked, and the child
 * pages are released; note that new tuple(s) are *not* on the root page
 * but in one of the new child pages.
 *
 * If 'newblkno' is not NULL, returns the block number of the page the first
 * new/updated tuple was inserted to.  Usually it's the given page, but could
 * be its right sibling if the page was split.
 *
 * Returns 'true' if the page was split, 'false' otherwise.
 */
bool
gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
                Buffer buffer,
                IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                BlockNumber *newblkno,
                Buffer leftchildbuf,
                List **splitinfo,
                bool markfollowright)
{
    BlockNumber blkno = BufferGetBlockNumber(buffer);
    Page        page = BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
    bool        is_leaf = (GistPageIsLeaf(page)) ? true : false;
    XLogRecPtr  recptr;
    int         i;
    bool        is_split;

    /*
     * Refuse to modify a page that's incompletely split.  This should not
     * happen because we finish any incomplete splits while we walk down the
     * tree.  However, it's remotely possible that another concurrent inserter
     * splits a parent page, and errors out before completing the split.  We
     * will just throw an error in that case, and leave any split we had in
     * progress unfinished too.  The next insert that comes along will clean
     * up the mess.
     */
    if (GistFollowRight(page))
        elog(ERROR, "concurrent GiST page split was incomplete");

    *splitinfo = NIL;

    /*
     * if isupdate, remove old key: This node's key has been modified, either
     * because a child split occurred or because we needed to adjust our key
     * for an insert in a child node.  Therefore, remove the old version of
     * this node's key.
     *
     * for WAL replay, in the non-split case we handle this by setting up a
     * one-element todelete array; in the split case, it's handled implicitly
     * because the tuple vector passed to gistSplit won't include this tuple.
     */
    is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);

    /*
     * If the leaf page is full, first try to delete dead tuples, then check
     * again.
     */
    if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
    {
        gistvacuumpage(rel, page, buffer);
        is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
    }

    if (is_split)
    {
        /* no space for insertion */
        IndexTuple *itvec;
        int         tlen;
        SplitedPageLayout *dist = NULL,
                   *ptr;
        BlockNumber oldrlink = InvalidBlockNumber;
        GistNSN     oldnsn = 0;
        SplitedPageLayout rootpg;
        bool        is_rootsplit;
        int         npage;

        is_rootsplit = (blkno == GIST_ROOT_BLKNO);

        /*
         * Form index tuples vector to split.  If we're replacing an old
         * tuple, remove the old version from the vector.
         */
        itvec = gistextractpage(page, &tlen);
        if (OffsetNumberIsValid(oldoffnum))
        {
            /* on inner page we should remove old tuple */
            int         pos = oldoffnum - FirstOffsetNumber;

            tlen--;
            if (pos != tlen)
                memmove(itvec + pos, itvec + pos + 1,
                        sizeof(IndexTuple) * (tlen - pos));
        }
        itvec = gistjoinvector(itvec, &tlen, itup, ntup);
        dist = gistSplit(rel, page, itvec, tlen, giststate);

        /*
         * Check that split didn't produce too many pages.
         */
        npage = 0;
        for (ptr = dist; ptr; ptr = ptr->next)
            npage++;
        /* in a root split, we'll add one more page to the list below */
        if (is_rootsplit)
            npage++;
        if (npage > GIST_MAX_SPLIT_PAGES)
            elog(ERROR, "GiST page split into too many halves (%d, maximum %d)",
                 npage, GIST_MAX_SPLIT_PAGES);

        /*
         * Set up pages to work with.  Allocate new buffers for all but the
         * leftmost page.  The original page becomes the new leftmost page,
         * and is just replaced with the new contents.
         *
         * For a root-split, allocate new buffers for all child pages; the
         * original page is overwritten with the new root page containing
         * downlinks to the new child pages.
         */
        ptr = dist;
        if (!is_rootsplit)
        {
            /* save old rightlink and NSN */
            oldrlink = GistPageGetOpaque(page)->rightlink;
            oldnsn = GistPageGetNSN(page);

            dist->buffer = buffer;
            dist->block.blkno = BufferGetBlockNumber(buffer);
            dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST));

            /* clean all flags except F_LEAF */
            GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;

            ptr = ptr->next;
        }
        for (; ptr; ptr = ptr->next)
        {
            /* Allocate new page */
            ptr->buffer = gistNewBuffer(rel);
            GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
            ptr->page = BufferGetPage(ptr->buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
            ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
        }

        /*
         * Now that we know which blocks the new pages go to, set up downlink
         * tuples to point to them.
         */
        for (ptr = dist; ptr; ptr = ptr->next)
        {
            ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
            GistTupleSetValid(ptr->itup);
        }

        /*
         * If this is a root split, we construct the new root page with the
         * downlinks here directly, instead of requiring the caller to insert
         * them.  Add the new root page to the list along with the child
         * pages.
         */
        if (is_rootsplit)
        {
            IndexTuple *downlinks;
            int         ndownlinks = 0;
            int         i;

            rootpg.buffer = buffer;
            rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST));
            GistPageGetOpaque(rootpg.page)->flags = 0;

            /* Prepare a vector of all the downlinks */
            for (ptr = dist; ptr; ptr = ptr->next)
                ndownlinks++;
            downlinks = palloc(sizeof(IndexTuple) * ndownlinks);
            for (i = 0, ptr = dist; ptr; ptr = ptr->next)
                downlinks[i++] = ptr->itup;

            rootpg.block.blkno = GIST_ROOT_BLKNO;
            rootpg.block.num = ndownlinks;
            rootpg.list = gistfillitupvec(downlinks, ndownlinks,
                                          &(rootpg.lenlist));
            rootpg.itup = NULL;

            rootpg.next = dist;
            dist = &rootpg;
        }
        else
        {
            /* Prepare split-info to be returned to caller */
            for (ptr = dist; ptr; ptr = ptr->next)
            {
                GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));

                si->buf = ptr->buffer;
                si->downlink = ptr->itup;
                *splitinfo = lappend(*splitinfo, si);
            }
        }

        /*
         * Fill all pages.  All the pages are new, i.e. freshly allocated
         * empty pages, or a temporary copy of the old page.
         */
        for (ptr = dist; ptr; ptr = ptr->next)
        {
            char       *data = (char *) (ptr->list);

            for (i = 0; i < ptr->block.num; i++)
            {
                IndexTuple  thistup = (IndexTuple) data;

                if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup),
                                i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
                    elog(ERROR, "failed to add item to index page in \"%s\"",
                         RelationGetRelationName(rel));

                /*
                 * If this is the first inserted/updated tuple, let the caller
                 * know which page it landed on.
                 */
                if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid))
                    *newblkno = ptr->block.blkno;

                data += IndexTupleSize(thistup);
            }

            /* Set up rightlinks */
            if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
                GistPageGetOpaque(ptr->page)->rightlink = ptr->next->block.blkno;
            else
                GistPageGetOpaque(ptr->page)->rightlink = oldrlink;

            /*
             * Mark all but the right-most page with the follow-right flag.
             * It will be cleared as soon as the downlink is inserted into the
             * parent, but this ensures that if we error out before that, the
             * index is still consistent.  (in buffering build mode, any error
             * will abort the index build anyway, so this is not needed.)
             */
            if (ptr->next && !is_rootsplit && markfollowright)
                GistMarkFollowRight(ptr->page);
            else
                GistClearFollowRight(ptr->page);

            /*
             * Copy the NSN of the original page to all pages.  The
             * F_FOLLOW_RIGHT flags ensure that scans will follow the
             * rightlinks until the downlinks are inserted.
             */
            GistPageSetNSN(ptr->page, oldnsn);
        }

        /*
         * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL
         * insertion for that.  NB: The number of pages and data segments
         * specified here must match the calculations in gistXLogSplit()!
         */
        if (RelationNeedsWAL(rel))
            XLogEnsureRecordSpace(npage, 1 + npage * 2);

        START_CRIT_SECTION();

        /*
         * Must mark buffers dirty before XLogInsert, even though we'll still
         * be changing their opaque fields below.
         */
        for (ptr = dist; ptr; ptr = ptr->next)
            MarkBufferDirty(ptr->buffer);
        if (BufferIsValid(leftchildbuf))
            MarkBufferDirty(leftchildbuf);

        /*
         * The first page in the chain was a temporary working copy meant to
         * replace the old page.  Copy it over the old page.
         */
        PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST));
        dist->page = BufferGetPage(dist->buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);

        /* Write the WAL record */
        if (RelationNeedsWAL(rel))
            recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf,
                                   dist, oldrlink, oldnsn, leftchildbuf,
                                   markfollowright);
        else
            recptr = gistGetFakeLSN(rel);

        for (ptr = dist; ptr; ptr = ptr->next)
        {
            PageSetLSN(ptr->page, recptr);
        }

        /*
         * Return the new child buffers to the caller.
         *
         * If this was a root split, we've already inserted the downlink
         * pointers, in the form of a new root page.  Therefore we can release
         * all the new buffers, and keep just the root page locked.
         */
        if (is_rootsplit)
        {
            for (ptr = dist->next; ptr; ptr = ptr->next)
                UnlockReleaseBuffer(ptr->buffer);
        }
    }
    else
    {
        /*
         * Enough space.  We also get here if ntuples==0.
         */
        START_CRIT_SECTION();

        /*
         * Since we delete only one tuple at a time here, it is OK that we
         * call PageIndexTupleDelete() here while gistRedoPageUpdateRecord()
         * uses PageIndexMultiDelete() during replay.
         */
        if (OffsetNumberIsValid(oldoffnum))
            PageIndexTupleDelete(page, oldoffnum);
        gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);

        MarkBufferDirty(buffer);

        if (BufferIsValid(leftchildbuf))
            MarkBufferDirty(leftchildbuf);

        if (RelationNeedsWAL(rel))
        {
            OffsetNumber ndeloffs = 0,
                        deloffs[1];

            if (OffsetNumberIsValid(oldoffnum))
            {
                deloffs[0] = oldoffnum;
                ndeloffs = 1;
            }

            recptr = gistXLogUpdate(rel->rd_node, buffer,
                                    deloffs, ndeloffs, itup, ntup,
                                    leftchildbuf);

            PageSetLSN(page, recptr);
        }
        else
        {
            recptr = gistGetFakeLSN(rel);
            PageSetLSN(page, recptr);
        }

        if (newblkno)
            *newblkno = blkno;
    }

    /*
     * If we inserted the downlink for a child page, set NSN and clear
     * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to
     * follow the rightlink if and only if they looked at the parent page
     * before we inserted the downlink.
     *
     * Note that we do this *after* writing the WAL record.  That means that
     * the possible full page image in the WAL record does not include these
     * changes, and they must be replayed even if the page is restored from
     * the full page image.  There's a chicken-and-egg problem: if we updated
     * the child pages first, we wouldn't know the recptr of the WAL record
     * we're about to write.
     */
    if (BufferIsValid(leftchildbuf))
    {
        Page        leftpg = BufferGetPage(leftchildbuf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);

        GistPageSetNSN(leftpg, recptr);
        GistClearFollowRight(leftpg);

        PageSetLSN(leftpg, recptr);
    }

    END_CRIT_SECTION();

    return is_split;
}
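
/*
 * Illustrative sketch (not part of the original source): the
 * split-and-hand-back-downlinks contract of gistplacetopage above, reduced
 * to a fixed-capacity in-memory node.  If the new key does not fit, the
 * node's contents are divided and the caller receives the new right
 * sibling, whose downlink it must install in the parent, much like the
 * *splitinfo list.  All names below (ToyNode, TOY_NODE_CAPACITY,
 * toy_place_to_node) are hypothetical; locking, WAL logging, penalty-based
 * split selection and multi-way splits are omitted.
 */
#include <stdbool.h>
#include <stdlib.h>

#define TOY_NODE_CAPACITY 8

typedef struct ToyNode
{
    int         nkeys;
    int         keys[TOY_NODE_CAPACITY];
} ToyNode;

/*
 * Insert 'key' into 'node'.  Returns true if the node had to be split, in
 * which case *split_out points to a newly allocated right sibling whose
 * downlink the caller must insert into the parent.
 */
static bool
toy_place_to_node(ToyNode *node, int key, ToyNode **split_out)
{
    ToyNode    *right;
    int         split;
    int         i;

    *split_out = NULL;

    if (node->nkeys < TOY_NODE_CAPACITY)
    {
        /* enough space: just add the key, no split */
        node->keys[node->nkeys++] = key;
        return false;
    }

    /* no room: move the upper half of the keys into a new sibling */
    right = calloc(1, sizeof(ToyNode));     /* allocation assumed to succeed */
    split = node->nkeys / 2;
    for (i = split; i < node->nkeys; i++)
        right->keys[right->nkeys++] = node->keys[i];
    node->nkeys = split;

    /* the left half now has room for the new key */
    node->keys[node->nkeys++] = key;

    *split_out = right;
    return true;
}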
/*
 *  _hash_freeovflpage() -
 *
 *  Remove this overflow page from its bucket's chain, and mark the page as
 *  free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *  Add the tuples (itups) to wbuf in this function.  We could do that in the
 *  caller as well, but the advantage of doing it here is that we can easily
 *  write the WAL for the XLOG_HASH_SQUEEZE_PAGE operation.  Addition of the
 *  tuples and removal of the overflow page have to be done as one atomic
 *  operation, otherwise during replay on a standby users might find
 *  duplicate records.
 *
 *  Since this function is invoked in VACUUM, we provide an access strategy
 *  parameter that controls fetches of the bucket pages.
 *
 *  Returns the block number of the page that followed the given page
 *  in the bucket, or InvalidBlockNumber if no following page.
 *
 *  NB: the caller must not hold a lock on the metapage, nor on the page
 *  that is next to ovflbuf in the bucket chain.  We don't acquire a lock on
 *  the page prior to ovflbuf in the chain if it is the same as wbuf, because
 *  the caller already holds a lock on it.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, Buffer wbuf,
                   IndexTuple *itups, OffsetNumber *itup_offsets,
                   Size *tups_size, uint16 nitups,
                   BufferAccessStrategy bstrategy)
{
    HashMetaPage metap;
    Buffer      metabuf;
    Buffer      mapbuf;
    BlockNumber ovflblkno;
    BlockNumber prevblkno;
    BlockNumber blkno;
    BlockNumber nextblkno;
    BlockNumber writeblkno;
    HashPageOpaque ovflopaque;
    Page        ovflpage;
    Page        mappage;
    uint32     *freep;
    uint32      ovflbitno;
    int32       bitmappage,
                bitmapbit;
    Bucket      bucket PG_USED_FOR_ASSERTS_ONLY;
    Buffer      prevbuf = InvalidBuffer;
    Buffer      nextbuf = InvalidBuffer;
    bool        update_metap = false;

    /* Get information from the doomed page */
    _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
    ovflblkno = BufferGetBlockNumber(ovflbuf);
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
    nextblkno = ovflopaque->hasho_nextblkno;
    prevblkno = ovflopaque->hasho_prevblkno;
    writeblkno = BufferGetBlockNumber(wbuf);
    bucket = ovflopaque->hasho_bucket;

    /*
     * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
     * up the bucket chain members behind and ahead of the overflow page
     * being deleted.  Concurrency issues are avoided by using lock chaining
     * as described atop hashbucketcleanup.
     */
    if (BlockNumberIsValid(prevblkno))
    {
        if (prevblkno == writeblkno)
            prevbuf = wbuf;
        else
            prevbuf = _hash_getbuf_with_strategy(rel,
                                                 prevblkno,
                                                 HASH_WRITE,
                                                 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                                 bstrategy);
    }
    if (BlockNumberIsValid(nextblkno))
        nextbuf = _hash_getbuf_with_strategy(rel,
                                             nextblkno,
                                             HASH_WRITE,
                                             LH_OVERFLOW_PAGE,
                                             bstrategy);

    /* Note: bstrategy is intentionally not used for metapage and bitmap */

    /* Read the metapage so we can determine which bitmap page to use */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* Identify which bit to clear */
    ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);

    bitmappage = ovflbitno >> BMPG_SHIFT(metap);
    bitmapbit = ovflbitno & BMPG_MASK(metap);

    if (bitmappage >= metap->hashm_nmaps)
        elog(ERROR, "invalid overflow bit number %u", ovflbitno);
    blkno = metap->hashm_mapp[bitmappage];

    /* Release metapage lock while we access the bitmap page */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /* read the bitmap page to clear the bitmap bit */
    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
    mappage = BufferGetPage(mapbuf);
    freep = HashPageGetBitmap(mappage);
    Assert(ISSET(freep, bitmapbit));

    /* Get write-lock on metapage to update firstfree */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* This operation needs to log multiple tuples, prepare WAL for that */
    if (RelationNeedsWAL(rel))
        XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);

    START_CRIT_SECTION();

    /*
     * we have to insert tuples on the "write" page, being careful to
     * preserve hashkey ordering.  (If we insert many tuples into the same
     * "write" page it would be worth qsort'ing them.)
     */
    if (nitups > 0)
    {
        _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
        MarkBufferDirty(wbuf);
    }

    /*
     * Reinitialize the freed overflow page.  Just zeroing the page won't
     * work, because WAL replay routines expect pages to be initialized.  See
     * explanation of RBM_NORMAL mode atop XLogReadBufferExtended.  We are
     * careful to make the special space valid here so that tools like
     * pageinspect won't get confused.
     */
    _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);

    ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = -1;
    ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    if (BufferIsValid(prevbuf))
    {
        Page        prevpage = BufferGetPage(prevbuf);
        HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

        Assert(prevopaque->hasho_bucket == bucket);
        prevopaque->hasho_nextblkno = nextblkno;
        MarkBufferDirty(prevbuf);
    }
    if (BufferIsValid(nextbuf))
    {
        Page        nextpage = BufferGetPage(nextbuf);
        HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

        Assert(nextopaque->hasho_bucket == bucket);
        nextopaque->hasho_prevblkno = prevblkno;
        MarkBufferDirty(nextbuf);
    }

    /* Clear the bitmap bit to indicate that this overflow page is free */
    CLRBIT(freep, bitmapbit);
    MarkBufferDirty(mapbuf);

    /* if this is now the first free page, update hashm_firstfree */
    if (ovflbitno < metap->hashm_firstfree)
    {
        metap->hashm_firstfree = ovflbitno;
        update_metap = true;
        MarkBufferDirty(metabuf);
    }

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_squeeze_page xlrec;
        XLogRecPtr  recptr;
        int         i;

        xlrec.prevblkno = prevblkno;
        xlrec.nextblkno = nextblkno;
        xlrec.ntups = nitups;
        xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
        xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);

        /*
         * bucket buffer needs to be registered to ensure that we can acquire
         * a cleanup lock on it during replay.
         */
        if (!xlrec.is_prim_bucket_same_wrt)
            XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

        XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
        if (xlrec.ntups > 0)
        {
            XLogRegisterBufData(1, (char *) itup_offsets,
                                nitups * sizeof(OffsetNumber));
            for (i = 0; i < nitups; i++)
                XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
        }

        XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);

        /*
         * If prevpage and the writepage (block in which we are moving tuples
         * from overflow) are the same, then there is no need to separately
         * register prevpage.  During replay, we can directly update the
         * nextblock in writepage.
         */
        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);

        if (BufferIsValid(nextbuf))
            XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);

        XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
        XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));

        if (update_metap)
        {
            XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
            XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
        }

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);

        PageSetLSN(BufferGetPage(wbuf), recptr);
        PageSetLSN(BufferGetPage(ovflbuf), recptr);

        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            PageSetLSN(BufferGetPage(prevbuf), recptr);
        if (BufferIsValid(nextbuf))
            PageSetLSN(BufferGetPage(nextbuf), recptr);

        PageSetLSN(BufferGetPage(mapbuf), recptr);

        if (update_metap)
            PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* release previous bucket if it is not same as write bucket */
    if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
        _hash_relbuf(rel, prevbuf);

    if (BufferIsValid(ovflbuf))
        _hash_relbuf(rel, ovflbuf);

    if (BufferIsValid(nextbuf))
        _hash_relbuf(rel, nextbuf);

    _hash_relbuf(rel, mapbuf);
    _hash_relbuf(rel, metabuf);

    return nextblkno;
}
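
/*
 * Illustrative sketch (not part of the original source): the bitmap
 * arithmetic used above to locate and clear the free-map bit of an
 * overflow page.  A global bit number is split into a bitmap-page index
 * (high bits) and a bit position within that page (low bits), which is the
 * shift/mask pattern of BMPG_SHIFT()/BMPG_MASK(); the bit is then cleared
 * to mark the page free, as CLRBIT does.  TOY_BITS_PER_MAPPAGE,
 * TOY_BMPG_SHIFT, TOY_BMPG_MASK and toy_clear_free_bit are hypothetical
 * names; the real values depend on the block size stored in the metapage.
 */
#include <stdint.h>

#define TOY_BITS_PER_MAPPAGE 1024           /* must be a power of two */
#define TOY_BMPG_SHIFT 10                   /* log2(TOY_BITS_PER_MAPPAGE) */
#define TOY_BMPG_MASK (TOY_BITS_PER_MAPPAGE - 1)

static void
toy_clear_free_bit(uint32_t **bitmaps, uint32_t bitno)
{
    uint32_t    mappage = bitno >> TOY_BMPG_SHIFT; /* which bitmap page */
    uint32_t    bit = bitno & TOY_BMPG_MASK;       /* bit within that page */
    uint32_t   *words = bitmaps[mappage];

    /* clear the bit: word index is bit/32, position within the word is bit%32 */
    words[bit / 32] &= ~((uint32_t) 1 << (bit % 32));
}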
/*
 * Deletes pending list pages up to (not including) newHead page.
 * If newHead == InvalidBlockNumber then function drops the whole list.
 *
 * metapage is pinned and exclusive-locked throughout this function.
 */
static void
shiftList(Relation index, Buffer metabuffer, BlockNumber newHead,
          bool fill_fsm, IndexBulkDeleteResult *stats)
{
    Page        metapage;
    GinMetaPageData *metadata;
    BlockNumber blknoToDelete;

    metapage = BufferGetPage(metabuffer);
    metadata = GinPageGetMeta(metapage);
    blknoToDelete = metadata->head;

    do
    {
        Page        page;
        int         i;
        int64       nDeletedHeapTuples = 0;
        ginxlogDeleteListPages data;
        Buffer      buffers[GIN_NDELETE_AT_ONCE];
        BlockNumber freespace[GIN_NDELETE_AT_ONCE];

        data.ndeleted = 0;
        while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead)
        {
            freespace[data.ndeleted] = blknoToDelete;
            buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete);
            LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE);
            page = BufferGetPage(buffers[data.ndeleted]);

            data.ndeleted++;

            Assert(!GinPageIsDeleted(page));

            nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff;
            blknoToDelete = GinPageGetOpaque(page)->rightlink;
        }

        if (stats)
            stats->pages_deleted += data.ndeleted;

        /*
         * This operation touches an unusually large number of pages, so
         * prepare the XLogInsert machinery for that before entering the
         * critical section.
         */
        if (RelationNeedsWAL(index))
            XLogEnsureRecordSpace(data.ndeleted, 0);

        START_CRIT_SECTION();

        metadata->head = blknoToDelete;

        Assert(metadata->nPendingPages >= data.ndeleted);
        metadata->nPendingPages -= data.ndeleted;
        Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples);
        metadata->nPendingHeapTuples -= nDeletedHeapTuples;

        if (blknoToDelete == InvalidBlockNumber)
        {
            metadata->tail = InvalidBlockNumber;
            metadata->tailFreeSize = 0;
            metadata->nPendingPages = 0;
            metadata->nPendingHeapTuples = 0;
        }

        /*
         * Set pd_lower just past the end of the metadata.  This is essential,
         * because without doing so, metadata will be lost if xlog.c
         * compresses the page.  (We must do this here because pre-v11
         * versions of PG did not set the metapage's pd_lower correctly, so a
         * pg_upgraded index might contain the wrong value.)
         */
        ((PageHeader) metapage)->pd_lower =
            ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage;

        MarkBufferDirty(metabuffer);

        for (i = 0; i < data.ndeleted; i++)
        {
            page = BufferGetPage(buffers[i]);
            GinPageGetOpaque(page)->flags = GIN_DELETED;
            MarkBufferDirty(buffers[i]);
        }

        if (RelationNeedsWAL(index))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, metabuffer,
                               REGBUF_WILL_INIT | REGBUF_STANDARD);
            for (i = 0; i < data.ndeleted; i++)
                XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT);

            memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));

            XLogRegisterData((char *) &data, sizeof(ginxlogDeleteListPages));

            recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE);
            PageSetLSN(metapage, recptr);

            for (i = 0; i < data.ndeleted; i++)
            {
                page = BufferGetPage(buffers[i]);
                PageSetLSN(page, recptr);
            }
        }

        for (i = 0; i < data.ndeleted; i++)
            UnlockReleaseBuffer(buffers[i]);

        END_CRIT_SECTION();

        for (i = 0; fill_fsm && i < data.ndeleted; i++)
            RecordFreeIndexPage(index, freespace[i]);
    } while (blknoToDelete != newHead);
}
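
/*
 * Illustrative sketch (not part of the original source): the pd_lower
 * computation used above.  The metadata sits immediately after the page
 * header, and pd_lower is set to point just past it, so that any "hole"
 * compression between pd_lower and pd_upper (as done for full-page images
 * in WAL) cannot discard the metadata.  ToyPageHeader and ToyMetaData are
 * hypothetical stand-ins for PageHeaderData and GinMetaPageData.
 */
#include <stddef.h>
#include <stdint.h>

typedef struct ToyPageHeader
{
    uint16_t    pd_lower;       /* offset to start of free space */
    uint16_t    pd_upper;       /* offset to end of free space */
} ToyPageHeader;

typedef struct ToyMetaData
{
    uint32_t    head;
    uint32_t    tail;
    int64_t     nPendingHeapTuples;
} ToyMetaData;

static void
toy_set_meta_pd_lower(char *page)
{
    ToyPageHeader *hdr = (ToyPageHeader *) page;
    char       *metadata = page + sizeof(ToyPageHeader);

    /* everything up to the end of the metadata counts as used space */
    hdr->pd_lower = (uint16_t) ((metadata + sizeof(ToyMetaData)) - page);
}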