/* * replay addition of overflow page for hash index */ static void hash_xlog_add_ovfl_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record); Buffer leftbuf; Buffer ovflbuf; Buffer metabuf; BlockNumber leftblk; BlockNumber rightblk; BlockNumber newmapblk = InvalidBlockNumber; Page ovflpage; HashPageOpaque ovflopaque; uint32 *num_bucket; char *data; Size datalen PG_USED_FOR_ASSERTS_ONLY; bool new_bmpage = false; XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); ovflbuf = XLogInitBufferForRedo(record, 0); Assert(BufferIsValid(ovflbuf)); data = XLogRecGetBlockData(record, 0, &datalen); num_bucket = (uint32 *) data; Assert(datalen == sizeof(uint32)); _hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE, true); /* update backlink */ ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = leftblk; PageSetLSN(ovflpage, lsn); MarkBufferDirty(ovflbuf); if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) { Page leftpage; HashPageOpaque leftopaque; leftpage = BufferGetPage(leftbuf); leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); leftopaque->hasho_nextblkno = rightblk; PageSetLSN(leftpage, lsn); MarkBufferDirty(leftbuf); } if (BufferIsValid(leftbuf)) UnlockReleaseBuffer(leftbuf); UnlockReleaseBuffer(ovflbuf); /* * Note: in normal operation, we'd update the bitmap and meta page while * still holding lock on the overflow pages. But during replay it's not * necessary to hold those locks, since no other index updates can be * happening concurrently. */ if (XLogRecHasBlockRef(record, 2)) { Buffer mapbuffer; if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) { Page mappage = (Page) BufferGetPage(mapbuffer); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; freep = HashPageGetBitmap(mappage); data = XLogRecGetBlockData(record, 2, &datalen); bitmap_page_bit = (uint32 *) data; SETBIT(freep, *bitmap_page_bit); PageSetLSN(mappage, lsn); MarkBufferDirty(mapbuffer); } if (BufferIsValid(mapbuffer)) UnlockReleaseBuffer(mapbuffer); } if (XLogRecHasBlockRef(record, 3)) { Buffer newmapbuf; newmapbuf = XLogInitBufferForRedo(record, 3); _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true); new_bmpage = true; newmapblk = BufferGetBlockNumber(newmapbuf); MarkBufferDirty(newmapbuf); PageSetLSN(BufferGetPage(newmapbuf), lsn); UnlockReleaseBuffer(newmapbuf); } if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) { HashMetaPage metap; Page page; uint32 *firstfree_ovflpage; data = XLogRecGetBlockData(record, 4, &datalen); firstfree_ovflpage = (uint32 *) data; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_firstfree = *firstfree_ovflpage; if (!xlrec->bmpage_found) { metap->hashm_spares[metap->hashm_ovflpoint]++; if (new_bmpage) { Assert(BlockNumberIsValid(newmapblk)); metap->hashm_mapp[metap->hashm_nmaps] = newmapblk; metap->hashm_nmaps++; metap->hashm_spares[metap->hashm_ovflpoint]++; } } PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
/* * _hash_freeovflpage() - * * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. * * Returns the block number of the page that followed the given page * in the bucket, or InvalidBlockNumber if no following page. * * NB: caller must not hold lock on metapage, nor on either page that's * adjacent in the bucket chain. The caller had better hold exclusive lock * on the bucket, too. */ BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy) { HashMetaPage metap; Buffer metabuf; Buffer mapbuf; BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; HashPageOpaque ovflopaque; Page ovflpage; Page mappage; uint32 *freep; uint32 ovflbitno; int32 bitmappage, bitmapbit; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); ovflblkno = BufferGetBlockNumber(ovflbuf); ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); nextblkno = ovflopaque->hasho_nextblkno; prevblkno = ovflopaque->hasho_prevblkno; bucket = ovflopaque->hasho_bucket; /* * Zero the page for debugging's sake; then write and release it. (Note: * if we failed to zero the page here, we'd have problems with the Assert * in _hash_pageinit() when the page is reused.) */ MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf)); _hash_wrtbuf(rel, ovflbuf); /* * Fix up the bucket chain. this is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being * deleted. No concurrency issues since we hold exclusive lock on the * entire bucket. 
*/ if (BlockNumberIsValid(prevblkno)) { Buffer prevbuf = _hash_getbuf_with_strategy(rel, prevblkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, bstrategy); Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); Assert(prevopaque->hasho_bucket == bucket); prevopaque->hasho_nextblkno = nextblkno; _hash_wrtbuf(rel, prevbuf); } if (BlockNumberIsValid(nextblkno)) { Buffer nextbuf = _hash_getbuf_with_strategy(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); Assert(nextopaque->hasho_bucket == bucket); nextopaque->hasho_prevblkno = prevblkno; _hash_wrtbuf(rel, nextbuf); } /* Note: bstrategy is intentionally not used for metapage and bitmap */ /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* Identify which bit to set */ ovflbitno = blkno_to_bitno(metap, ovflblkno); bitmappage = ovflbitno >> BMPG_SHIFT(metap); bitmapbit = ovflbitno & BMPG_MASK(metap); if (bitmappage >= metap->hashm_nmaps) elog(ERROR, "invalid overflow bit number %u", ovflbitno); blkno = metap->hashm_mapp[bitmappage]; /* Release metapage lock while we access the bitmap page */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); /* Clear the bitmap bit to indicate that this overflow page is free */ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); Assert(ISSET(freep, bitmapbit)); CLRBIT(freep, bitmapbit); _hash_wrtbuf(rel, mapbuf); /* Get write-lock on metapage to update firstfree */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); /* if this is now the first free page, update hashm_firstfree */ if (ovflbitno < metap->hashm_firstfree) { metap->hashm_firstfree = ovflbitno; _hash_wrtbuf(rel, metabuf); } else { /* no need to change metapage */ _hash_relbuf(rel, metabuf); } return nextblkno; }
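The bitmap bookkeeping in _hash_freeovflpage boils down to splitting a global overflow-page bit number into a bitmap-page index and a bit offset with a shift and a mask, then toggling one bit in a word array. Below is a minimal standalone sketch of that arithmetic; PAGE_SHIFT, PAGE_MASK, set_bit and clear_bit are illustrative stand-ins, not the actual BMPG_SHIFT/BMPG_MASK/SETBIT/CLRBIT macros.

#include <stdint.h>
#include <stdio.h>

/* Illustrative geometry: one bitmap page tracks 2^15 overflow pages; these
 * constants play the role of BMPG_SHIFT()/BMPG_MASK() in the code above. */
#define PAGE_SHIFT     15
#define BITS_PER_PAGE  (1U << PAGE_SHIFT)
#define PAGE_MASK      (BITS_PER_PAGE - 1)
#define WORD_BITS      32

static void
set_bit(uint32_t *map, uint32_t bit)
{
    map[bit / WORD_BITS] |= 1U << (bit % WORD_BITS);
}

static void
clear_bit(uint32_t *map, uint32_t bit)
{
    map[bit / WORD_BITS] &= ~(1U << (bit % WORD_BITS));
}

int
main(void)
{
    uint32_t ovflbitno = 70000;                    /* global overflow bit number */
    uint32_t bitmappage = ovflbitno >> PAGE_SHIFT; /* which bitmap page holds it */
    uint32_t bitmapbit = ovflbitno & PAGE_MASK;    /* bit offset within that page */
    uint32_t page[BITS_PER_PAGE / WORD_BITS] = {0};

    set_bit(page, bitmapbit);      /* overflow page allocated */
    clear_bit(page, bitmapbit);    /* overflow page freed again, as in _hash_freeovflpage */
    printf("bit %u -> bitmap page %u, bit %u\n",
           (unsigned) ovflbitno, (unsigned) bitmappage, (unsigned) bitmapbit);
    return 0;
}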
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * This function also deletes the tuples that are moved by split to other * bucket. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; tuples_removed = 0; num_index_tuples = 0; /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be * good enough. (If not, we'll detect that further down and refresh the * cache as necessary.) */ cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); Assert(cachedmetap != NULL); orig_maxbucket = cachedmetap->hashm_maxbucket; orig_ntuples = cachedmetap->hashm_ntuples; /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; Buffer bucket_buf; Buffer buf; HashPageOpaque bucket_opaque; Page page; bool split_cleanup = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); blkno = bucket_blkno; /* * We need to acquire a cleanup lock on the primary bucket page to out * wait concurrent scans before deleting the dead tuples. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _hash_checkpage(rel, buf, LH_BUCKET_PAGE); page = BufferGetPage(buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* * If the bucket contains tuples that are moved by split, then we need * to delete such tuples. We can't delete such tuples if the split * operation on bucket is not finished as those are needed by scans. */ if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) { split_cleanup = true; /* * This bucket might have been split since we last held a lock on * the metapage. If so, hashm_maxbucket, hashm_highmask and * hashm_lowmask might be old enough to cause us to fail to remove * tuples left behind by the most recent split. To prevent that, * now that the primary page of the target bucket has been locked * (and thus can't be further split), check whether we need to * update our cached metapage data. 
*/ Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber); if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) { cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); Assert(cachedmetap != NULL); } } bucket_buf = buf; hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, cachedmetap->hashm_maxbucket, cachedmetap->hashm_highmask, cachedmetap->hashm_lowmask, &tuples_removed, &num_index_tuples, split_cleanup, callback, callback_state); _hash_dropbuf(rel, bucket_buf); /* Advance to next bucket */ cur_bucket++; } if (BufferIsInvalid(metabuf)) metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); /* Write-lock metapage and check for split since we started */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); Assert(cachedmetap != NULL); cur_maxbucket = cachedmetap->hashm_maxbucket; goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ START_CRIT_SECTION(); if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } MarkBufferDirty(metabuf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_update_meta_page xlrec; XLogRecPtr recptr; xlrec.ntuples = metap->hashm_ntuples; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage); XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); _hash_relbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ return stats; }
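hashbulkdelete() leaves the which-tuples-die decision entirely to the callback supplied by VACUUM. The standalone sketch below only illustrates the shape of that contract: tid_t, bulkdelete_callback and demo_callback are simplified stand-ins for the real ItemPointer-based types in PostgreSQL's headers, not their actual definitions.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for ItemPointer and the bulk-delete callback. */
typedef struct { uint32_t block; uint16_t offset; } tid_t;
typedef bool (*bulkdelete_callback) (const tid_t *tid, void *state);

/* Hypothetical callback state: treat everything on blocks >= a cutoff as
 * dead, the way VACUUM's real callback consults its list of dead TIDs. */
typedef struct { uint32_t dead_above_block; } demo_state;

static bool
demo_callback(const tid_t *tid, void *state)
{
    demo_state *st = (demo_state *) state;

    return tid->block >= st->dead_above_block;
}

int
main(void)
{
    demo_state  st = {100};
    tid_t       tids[] = {{42, 3}, {100, 1}, {250, 7}};
    double      tuples_removed = 0;
    double      num_index_tuples = 0;

    /* A bucket scan asks the callback about each index entry and deletes
     * the entry when the heap tuple it points to is going away. */
    for (size_t i = 0; i < sizeof(tids) / sizeof(tids[0]); i++)
    {
        if (demo_callback(&tids[i], &st))
            tuples_removed++;
        else
            num_index_tuples++;
    }
    printf("removed=%.0f kept=%.0f\n", tuples_removed, num_index_tuples);
    return 0;
}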
static void btree_xlog_split(bool onleft, bool isroot, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); bool isleaf = (xlrec->level == 0); Buffer lbuf; Buffer rbuf; Page rpage; BTPageOpaque ropaque; char *datapos; Size datalen; Item left_hikey = NULL; Size left_hikeysz = 0; BlockNumber leftsib; BlockNumber rightsib; BlockNumber rnext; XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib); XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib); if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext)) rnext = P_NONE; /* * Clear the incomplete split flag on the left sibling of the child page * this is a downlink for. (Like in btree_xlog_insert, this can be done * before locking the other pages) */ if (!isleaf) _bt_clear_incomplete_split(record, 3); /* Reconstruct right (new) sibling page from scratch */ rbuf = XLogInitBufferForRedo(record, 1); datapos = XLogRecGetBlockData(record, 1, &datalen); rpage = (Page) BufferGetPage(rbuf); _bt_pageinit(rpage, BufferGetPageSize(rbuf)); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); ropaque->btpo_prev = leftsib; ropaque->btpo_next = rnext; ropaque->btpo.level = xlrec->level; ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; _bt_restore_page(rpage, datapos, datalen); /* * On leaf level, the high key of the left page is equal to the first key * on the right page. */ if (isleaf) { ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); left_hikey = PageGetItem(rpage, hiItemId); left_hikeysz = ItemIdGetLength(hiItemId); } PageSetLSN(rpage, lsn); MarkBufferDirty(rbuf); /* don't release the buffer yet; we touch right page's first item below */ /* Now reconstruct left (original) sibling page */ if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO) { /* * To retain the same physical order of the tuples that they had, we * initialize a temporary empty page for the left page and add all the * items to that in item number order. This mirrors how _bt_split() * works. It's not strictly required to retain the same physical * order, as long as the items are in the correct item number order, * but it helps debugging. See also _bt_restore_page(), which does * the same for the right page. 
*/ Page lpage = (Page) BufferGetPage(lbuf); BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; Item newitem = NULL; Size newitemsz = 0; Page newlpage; OffsetNumber leftoff; datapos = XLogRecGetBlockData(record, 0, &datalen); if (onleft) { newitem = (Item) datapos; newitemsz = MAXALIGN(IndexTupleSize(newitem)); datapos += newitemsz; datalen -= newitemsz; } /* Extract left hikey and its size (assuming 16-bit alignment) */ if (!isleaf) { left_hikey = (Item) datapos; left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); datapos += left_hikeysz; datalen -= left_hikeysz; } Assert(datalen == 0); newlpage = PageGetTempPageCopySpecial(lpage); /* Set high key */ leftoff = P_HIKEY; if (PageAddItem(newlpage, left_hikey, left_hikeysz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add high key to left page after split"); leftoff = OffsetNumberNext(leftoff); for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++) { ItemId itemid; Size itemsz; Item item; /* add the new item if it was inserted on left page */ if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } itemid = PageGetItemId(lpage, off); itemsz = ItemIdGetLength(itemid); item = PageGetItem(lpage, itemid); if (PageAddItem(newlpage, item, itemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add old item to left page after split"); leftoff = OffsetNumberNext(leftoff); } /* cope with possibility that newitem goes at the end */ if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } PageRestoreTempPage(newlpage, lpage); /* Fix opaque fields */ lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; if (isleaf) lopaque->btpo_flags |= BTP_LEAF; lopaque->btpo_next = rightsib; lopaque->btpo_cycleid = 0; PageSetLSN(lpage, lsn); MarkBufferDirty(lbuf); } /* We no longer need the buffers */ if (BufferIsValid(lbuf)) UnlockReleaseBuffer(lbuf); UnlockReleaseBuffer(rbuf); /* * Fix left-link of the page to the right of the new right sibling. * * Note: in normal operation, we do this while still holding lock on the * two split pages. However, that's not necessary for correctness in WAL * replay, because no other index update can be in progress, and readers * will cope properly when following an obsolete left-link. */ if (rnext != P_NONE) { Buffer buffer; if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { Page page = (Page) BufferGetPage(buffer); BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = rightsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } }
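The left-page rebuild above is, at its core, "copy the surviving items in order into a fresh page, splicing the new item in at newitemoff". The standalone sketch below shows just that splice on a plain int array with 1-based offsets, ignoring the right-page boundary and all page machinery, so it is an illustration of the pattern rather than the replay logic itself.

#include <stdio.h>

/* Copy items[0..n) into out[] while splicing newitem in at 1-based position
 * newitemoff, the way the replay loop above rebuilds the left page item by
 * item.  Returns the number of items written. */
static int
rebuild_with_new_item(const int *items, int n, int newitem, int newitemoff,
                      int *out)
{
    int     outn = 0;

    for (int off = 1; off <= n; off++)
    {
        if (off == newitemoff)
            out[outn++] = newitem;        /* splice the new item in here */
        out[outn++] = items[off - 1];     /* then the pre-existing item */
    }
    if (newitemoff == n + 1)              /* new item lands at the very end */
        out[outn++] = newitem;
    return outn;
}

int
main(void)
{
    int     items[] = {10, 20, 30};
    int     out[8];
    int     outn = rebuild_with_new_item(items, 3, 25, 3, out);

    for (int i = 0; i < outn; i++)
        printf("%d ", out[i]);            /* prints: 10 20 25 30 */
    printf("\n");
    return 0;
}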
static void btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque pageop; IndexTupleData trunctuple; /* * In normal operation, we would lock all the pages this WAL record * touches before changing any of them. In WAL replay, it should be okay * to lock just one page at a time, since no concurrent index updates can * be happening, and readers should not care whether they arrive at the * target page or not (since it's surely empty). */ /* parent page */ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { OffsetNumber poffset; ItemId itemid; IndexTuple itup; OffsetNumber nextoffset; BlockNumber rightsib; page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); poffset = xlrec->poffset; nextoffset = OffsetNumberNext(poffset); itemid = PageGetItemId(page, nextoffset); itup = (IndexTuple) PageGetItem(page, itemid); rightsib = ItemPointerGetBlockNumber(&itup->t_tid); itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY); nextoffset = OffsetNumberNext(poffset); PageIndexTupleDelete(page, nextoffset); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* Rewrite the leaf page as a halfdead page */ buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = xlrec->leftblk; pageop->btpo_next = xlrec->rightblk; pageop->btpo.level = 0; pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_cycleid = 0; /* * Construct a dummy hikey item that points to the next parent to be * deleted (if any). */ MemSet(&trunctuple, 0, sizeof(IndexTupleData)); trunctuple.t_info = sizeof(IndexTupleData); if (xlrec->topparent != InvalidBlockNumber) ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY); else ItemPointerSetInvalid(&trunctuple.t_tid); if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); }
Datum bt_page_items(PG_FUNCTION_ARGS) { text *relname = PG_GETARG_TEXT_P(0); uint32 blkno = PG_GETARG_UINT32(1); Datum result; char *values[6]; HeapTuple tuple; FuncCallContext *fctx; MemoryContext mctx; struct user_args *uargs; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use pageinspect functions")))); if (SRF_IS_FIRSTCALL()) { RangeVar *relrv; Relation rel; Buffer buffer; BTPageOpaque opaque; TupleDesc tupleDesc; fctx = SRF_FIRSTCALL_INIT(); relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); rel = relation_openrv(relrv, AccessShareLock); if (!IS_INDEX(rel) || !IS_BTREE(rel)) elog(ERROR, "relation \"%s\" is not a btree index", RelationGetRelationName(rel)); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the * owning session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); if (blkno == 0) elog(ERROR, "block 0 is a meta page"); CHECK_RELATION_BLOCK_RANGE(rel, blkno); buffer = ReadBuffer(rel, blkno); LockBuffer(buffer, BUFFER_LOCK_SHARE); /* * We copy the page into local storage to avoid holding pin on the * buffer longer than we must, and possibly failing to release it at * all if the calling query doesn't fetch all rows. */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); uargs = palloc(sizeof(struct user_args)); uargs->page = palloc(BLCKSZ); memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ); UnlockReleaseBuffer(buffer); relation_close(rel, AccessShareLock); uargs->offset = FirstOffsetNumber; opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page); if (P_ISDELETED(opaque)) elog(NOTICE, "page is deleted"); fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc); fctx->user_fctx = uargs; MemoryContextSwitchTo(mctx); } fctx = SRF_PERCALL_SETUP(); uargs = fctx->user_fctx; if (fctx->call_cntr < fctx->max_calls) { ItemId id; IndexTuple itup; int j; int off; int dlen; char *dump; char *ptr; id = PageGetItemId(uargs->page, uargs->offset); if (!ItemIdIsValid(id)) elog(ERROR, "invalid ItemId"); itup = (IndexTuple) PageGetItem(uargs->page, id); j = 0; values[j++] = psprintf("%d", uargs->offset); values[j++] = psprintf("(%u,%u)", BlockIdGetBlockNumber(&(itup->t_tid.ip_blkid)), itup->t_tid.ip_posid); values[j++] = psprintf("%d", (int) IndexTupleSize(itup)); values[j++] = psprintf("%c", IndexTupleHasNulls(itup) ? 't' : 'f'); values[j++] = psprintf("%c", IndexTupleHasVarwidths(itup) ? 't' : 'f'); ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info); dlen = IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info); dump = palloc0(dlen * 3 + 1); values[j] = dump; for (off = 0; off < dlen; off++) { if (off > 0) *dump++ = ' '; sprintf(dump, "%02x", *(ptr + off) & 0xff); dump += 2; } tuple = BuildTupleFromCStrings(fctx->attinmeta, values); result = HeapTupleGetDatum(tuple); uargs->offset = uargs->offset + 1; SRF_RETURN_NEXT(fctx, result); } else { pfree(uargs->page); pfree(uargs); SRF_RETURN_DONE(fctx); } }
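The data column above is produced by the small hex-dump loop near the end of the function. A self-contained version of that loop, with the same "len * 3 + 1" buffer sizing and space-separated %02x formatting, is sketched below; hex_dump is a hypothetical helper name, not part of pageinspect.

#include <stdio.h>
#include <stdlib.h>

/* Render len bytes as space-separated hex pairs, with the same
 * "len * 3 + 1" buffer sizing used for the data column above. */
static char *
hex_dump(const unsigned char *ptr, int len)
{
    char   *buf = calloc(len * 3 + 1, 1);
    char   *dump = buf;

    for (int off = 0; off < len; off++)
    {
        if (off > 0)
            *dump++ = ' ';
        sprintf(dump, "%02x", ptr[off]);
        dump += 2;
    }
    return buf;
}

int
main(void)
{
    unsigned char bytes[] = {0x01, 0x00, 0xab, 0xff};
    char   *s = hex_dump(bytes, (int) sizeof(bytes));

    printf("%s\n", s);                    /* prints: 01 00 ab ff */
    free(s);
    return 0;
}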
Datum bt_page_items_bytea(PG_FUNCTION_ARGS) { bytea *raw_page = PG_GETARG_BYTEA_P(0); Datum result; FuncCallContext *fctx; struct user_args *uargs; int raw_page_size; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use pageinspect functions")))); if (SRF_IS_FIRSTCALL()) { BTPageOpaque opaque; MemoryContext mctx; TupleDesc tupleDesc; raw_page_size = VARSIZE(raw_page) - VARHDRSZ; if (raw_page_size < SizeOfPageHeaderData) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("input page too small (%d bytes)", raw_page_size))); fctx = SRF_FIRSTCALL_INIT(); mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); uargs = palloc(sizeof(struct user_args)); uargs->page = VARDATA(raw_page); uargs->offset = FirstOffsetNumber; opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page); if (P_ISMETA(opaque)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("block is a meta page"))); if (P_ISDELETED(opaque)) elog(NOTICE, "page is deleted"); fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc); fctx->user_fctx = uargs; MemoryContextSwitchTo(mctx); } fctx = SRF_PERCALL_SETUP(); uargs = fctx->user_fctx; if (fctx->call_cntr < fctx->max_calls) { result = bt_page_print_tuples(fctx, uargs->page, uargs->offset); uargs->offset++; SRF_RETURN_NEXT(fctx, result); } else { pfree(uargs); SRF_RETURN_DONE(fctx); } }
/* * _hash_metapinit() -- Initialize the metadata page of a hash index, * the two buckets that we begin with and the initial * bitmap page. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ void _hash_metapinit(Relation rel) { HashMetaPage metap; HashPageOpaque pageopaque; Buffer metabuf; Buffer buf; Page pg; int32 data_width; int32 item_width; int32 ffactor; uint16 i; /* safety check */ if (RelationGetNumberOfBlocks(rel) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * Determine the target fill factor (tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about 3/4ths * full. We can compute it exactly if the index datatype is fixed-width, * but for var-width there's some guessing involved. */ data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid, RelationGetDescr(rel)->attrs[0]->atttypmod); item_width = MAXALIGN(sizeof(HashItemData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = (BLCKSZ * 3 / 4) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE); pg = BufferGetPage(metabuf); _hash_pageinit(pg, BufferGetPageSize(metabuf)); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_filler = HASHO_FILL; metap = (HashMetaPage) pg; metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; metap->hashm_bsize = BufferGetPageSize(metabuf); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { if ((1 << i) <= (metap->hashm_bsize - (MAXALIGN(sizeof(PageHeaderData)) + MAXALIGN(sizeof(HashPageOpaqueData))))) break; } Assert(i > 0); metap->hashm_bmsize = 1 << i; metap->hashm_bmshift = i + BYTE_TO_BIT; Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); /* * We initialize the index with two buckets, 0 and 1, occupying physical * blocks 1 and 2. The first freespace bitmap page is in block 3. */ metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */ metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */ MemSet((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */ metap->hashm_ovflpoint = 1; metap->hashm_firstfree = 0; /* * Initialize the first two buckets */ for (i = 0; i <= 1; i++) { buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE); pg = BufferGetPage(buf); _hash_pageinit(pg, BufferGetPageSize(buf)); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = i; pageopaque->hasho_flag = LH_BUCKET_PAGE; pageopaque->hasho_filler = HASHO_FILL; _hash_wrtbuf(rel, buf); } /* * Initialize first bitmap page. Can't do this until we * create the first two buckets, else smgr will complain. 
*/ _hash_initbitmap(rel, metap, 3); /* all done */ _hash_wrtbuf(rel, metabuf); }
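The bitmap sizing loop in _hash_metapinit simply searches downward for the largest power of two that still fits on a page after fixed overhead. Re-run standalone for an 8 kB page it looks like the sketch below; PAGE_OVERHEAD is an illustrative stand-in for the real PageHeaderData plus HashPageOpaqueData sizes, so the printed numbers are indicative only.

#include <stdio.h>

/* Hypothetical page geometry; the real numbers come from BLCKSZ,
 * PageHeaderData and HashPageOpaqueData. */
#define PAGE_SIZE      8192
#define PAGE_OVERHEAD  (24 + 16)
#define BYTE_TO_BIT    3

static int
log2_floor(unsigned int n)
{
    int     i = 0;

    while ((1U << (i + 1)) <= n)
        i++;
    return i;
}

int
main(void)
{
    int     i;
    int     bmsize;
    int     bmshift;

    /* Walk downward until a power-of-two bitmap fits after the overhead,
     * mirroring the loop in _hash_metapinit above. */
    for (i = log2_floor(PAGE_SIZE); i > 0; --i)
    {
        if ((1 << i) <= (PAGE_SIZE - PAGE_OVERHEAD))
            break;
    }
    bmsize = 1 << i;              /* bytes of bitmap space per bitmap page */
    bmshift = i + BYTE_TO_BIT;    /* so 1 << bmshift == bits per bitmap page */

    printf("bmsize=%d bytes, bmshift=%d, bits per bitmap page=%d\n",
           bmsize, bmshift, 1 << bmshift);
    return 0;
}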
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket, and compress out any free space in the old * bucket. * * The caller must hold exclusive locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, BlockNumber start_oblkno, BlockNumber start_nblkno, uint32 maxbucket, uint32 highmask, uint32 lowmask) { Bucket bucket; Buffer obuf; Buffer nbuf; BlockNumber oblkno; BlockNumber nblkno; bool null; Datum datum; HashItem hitem; HashPageOpaque oopaque; HashPageOpaque nopaque; IndexTuple itup; Size itemsz; OffsetNumber ooffnum; OffsetNumber noffnum; OffsetNumber omaxoffnum; Page opage; Page npage; TupleDesc itupdesc = RelationGetDescr(rel); /* * It should be okay to simultaneously write-lock pages from each * bucket, since no one else can be trying to acquire buffer lock * on pages of either bucket. */ oblkno = start_oblkno; nblkno = start_nblkno; obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE); opage = BufferGetPage(obuf); npage = BufferGetPage(nbuf); _hash_checkpage(rel, opage, LH_BUCKET_PAGE); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* initialize the new bucket's primary page */ _hash_pageinit(npage, BufferGetPageSize(nbuf)); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = InvalidBlockNumber; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = nbucket; nopaque->hasho_flag = LH_BUCKET_PAGE; nopaque->hasho_filler = HASHO_FILL; /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain * and adding overflow pages to the new bucket as needed. */ ooffnum = FirstOffsetNumber; omaxoffnum = PageGetMaxOffsetNumber(opage); for (;;) { /* * at each iteration through this loop, each of these variables * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum */ /* check if we're at the end of the page */ if (ooffnum > omaxoffnum) { /* at end of page, but check for an(other) overflow page */ oblkno = oopaque->hasho_nextblkno; if (!BlockNumberIsValid(oblkno)) break; /* * we ran out of tuples on this particular page, but we * have more overflow pages; advance to next page. */ _hash_wrtbuf(rel, obuf); obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); opage = BufferGetPage(obuf); _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); ooffnum = FirstOffsetNumber; omaxoffnum = PageGetMaxOffsetNumber(opage); continue; } /* * Re-hash the tuple to determine which bucket it now belongs in. * * It is annoying to call the hash function while holding locks, * but releasing and relocking the page for each tuple is unappealing * too. */ hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum)); itup = &(hitem->hash_itup); datum = index_getattr(itup, 1, itupdesc, &null); Assert(!null); bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum), maxbucket, highmask, lowmask); if (bucket == nbucket) { /* * insert the tuple into the new bucket. 
if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleDSize(hitem->hash_itup) + (sizeof(HashItemData) - sizeof(IndexTupleData)); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpace(npage) < itemsz) { /* write out nbuf and drop lock, but keep pin */ _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf); npage = BufferGetPage(nbuf); _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE); /* we don't need nopaque within the loop */ } noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); /* * now delete the tuple from the old bucket. after this * section of code, 'ooffnum' will actually point to the * ItemId to which we would point if we had advanced it before * the deletion (PageIndexTupleDelete repacks the ItemId * array). this also means that 'omaxoffnum' is exactly one * less than it used to be, so we really can just decrement it * instead of calling PageGetMaxOffsetNumber. */ PageIndexTupleDelete(opage, ooffnum); omaxoffnum = OffsetNumberPrev(omaxoffnum); } else { /* * the tuple stays on this page. we didn't move anything, so * we didn't delete anything and therefore we don't have to * change 'omaxoffnum'. */ Assert(bucket == obucket); ooffnum = OffsetNumberNext(ooffnum); } } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Before quitting, call _hash_squeezebucket to ensure the * tuples remaining in the old bucket (including the overflow pages) are * packed as tightly as possible. The new bucket is already tight. */ _hash_wrtbuf(rel, obuf); _hash_wrtbuf(rel, nbuf); _hash_squeezebucket(rel, obucket, start_oblkno); }
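The partitioning loop above decides each tuple's destination with the usual linear-hashing bucket map, which is roughly what _hash_hashkey2bucket() does with the metapage's maxbucket/highmask/lowmask: mask with highmask, and if that names a bucket that does not exist yet, fall back to lowmask. A standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* Linear-hashing bucket map: mask with highmask; if that names a bucket
 * that does not exist yet (> maxbucket), fall back to lowmask. */
static uint32_t
hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
               uint32_t highmask, uint32_t lowmask)
{
    uint32_t bucket = hashkey & highmask;

    if (bucket > maxbucket)
        bucket = bucket & lowmask;
    return bucket;
}

int
main(void)
{
    /* Example: after one split of the two initial buckets, buckets 0..2
     * exist, so maxbucket = 2 while lowmask = 1 and highmask = 3 (compare
     * the initial values set up in _hash_metapinit above). */
    uint32_t maxbucket = 2;
    uint32_t lowmask = 1;
    uint32_t highmask = 3;

    for (uint32_t key = 0; key < 8; key++)
        printf("hashkey %u -> bucket %u\n", (unsigned) key,
               (unsigned) hashkey2bucket(key, maxbucket, highmask, lowmask));
    return 0;
}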
static void rtdoinsert(Relation r, IndexTuple itup, RTSTATE *rtstate) { Page page; Buffer buffer; BlockNumber blk; IndexTuple which; OffsetNumber l; RTSTACK *stack; RTreePageOpaque opaque; Datum datum; blk = P_ROOT; buffer = InvalidBuffer; stack = NULL; do { /* release the current buffer, read in the next one */ buffer = ReleaseAndReadBuffer(buffer, r, blk); page = (Page) BufferGetPage(buffer); opaque = (RTreePageOpaque) PageGetSpecialPointer(page); if (!(opaque->flags & F_LEAF)) { RTSTACK *n; ItemId iid; n = (RTSTACK *) palloc(sizeof(RTSTACK)); n->rts_parent = stack; n->rts_blk = blk; n->rts_child = choose(r, page, itup, rtstate); stack = n; iid = PageGetItemId(page, n->rts_child); which = (IndexTuple) PageGetItem(page, iid); blk = ItemPointerGetBlockNumber(&(which->t_tid)); } } while (!(opaque->flags & F_LEAF)); if (nospace(page, itup)) { /* need to do a split */ rtdosplit(r, buffer, stack, itup, rtstate); freestack(stack); WriteBuffer(buffer); /* don't forget to release buffer! */ return; } /* add the item and write the buffer */ if (PageIsEmpty(page)) { l = PageAddItem(page, (Item) itup, IndexTupleSize(itup), FirstOffsetNumber, LP_USED); } else { l = PageAddItem(page, (Item) itup, IndexTupleSize(itup), OffsetNumberNext(PageGetMaxOffsetNumber(page)), LP_USED); } if (l == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(r)); WriteBuffer(buffer); datum = IndexTupleGetDatum(itup); /* now expand the page boundary in the parent to include the new child */ rttighten(r, stack, datum, IndexTupleAttSize(itup), rtstate); freestack(stack); }
/* * rtdosplit -- split a page in the tree. * * rtpicksplit does the interesting work of choosing the split. * This routine just does the bit-pushing. */ static void rtdosplit(Relation r, Buffer buffer, RTSTACK *stack, IndexTuple itup, RTSTATE *rtstate) { Page p; Buffer leftbuf, rightbuf; Page left, right; ItemId itemid; IndexTuple item; IndexTuple ltup, rtup; OffsetNumber maxoff; OffsetNumber i; OffsetNumber leftoff, rightoff; BlockNumber lbknum, rbknum; BlockNumber bufblock; RTreePageOpaque opaque; bool *isnull; SPLITVEC v; OffsetNumber *spl_left, *spl_right; TupleDesc tupDesc; int n; OffsetNumber newitemoff; p = (Page) BufferGetPage(buffer); opaque = (RTreePageOpaque) PageGetSpecialPointer(p); rtpicksplit(r, p, &v, itup, rtstate); /* * The root of the tree is the first block in the relation. If we're * about to split the root, we need to do some hocus-pocus to enforce this * guarantee. */ if (BufferGetBlockNumber(buffer) == P_ROOT) { leftbuf = ReadBuffer(r, P_NEW); RTInitBuffer(leftbuf, opaque->flags); lbknum = BufferGetBlockNumber(leftbuf); left = (Page) BufferGetPage(leftbuf); } else { leftbuf = buffer; IncrBufferRefCount(buffer); lbknum = BufferGetBlockNumber(buffer); left = (Page) PageGetTempPage(p, sizeof(RTreePageOpaqueData)); } rightbuf = ReadBuffer(r, P_NEW); RTInitBuffer(rightbuf, opaque->flags); rbknum = BufferGetBlockNumber(rightbuf); right = (Page) BufferGetPage(rightbuf); spl_left = v.spl_left; spl_right = v.spl_right; leftoff = rightoff = FirstOffsetNumber; maxoff = PageGetMaxOffsetNumber(p); newitemoff = OffsetNumberNext(maxoff); /* * spl_left contains a list of the offset numbers of the tuples that will * go to the left page. For each offset number, get the tuple item, then * add the item to the left page. Similarly for the right side. */ /* fill left node */ for (n = 0; n < v.spl_nleft; n++) { i = *spl_left; if (i == newitemoff) item = itup; else { itemid = PageGetItemId(p, i); item = (IndexTuple) PageGetItem(p, itemid); } if (PageAddItem(left, (Item) item, IndexTupleSize(item), leftoff, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(r)); leftoff = OffsetNumberNext(leftoff); spl_left++; /* advance in left split vector */ } /* fill right node */ for (n = 0; n < v.spl_nright; n++) { i = *spl_right; if (i == newitemoff) item = itup; else { itemid = PageGetItemId(p, i); item = (IndexTuple) PageGetItem(p, itemid); } if (PageAddItem(right, (Item) item, IndexTupleSize(item), rightoff, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(r)); rightoff = OffsetNumberNext(rightoff); spl_right++; /* advance in right split vector */ } /* Make sure we consumed all of the split vectors, and release 'em */ Assert(*spl_left == InvalidOffsetNumber); Assert(*spl_right == InvalidOffsetNumber); pfree(v.spl_left); pfree(v.spl_right); if ((bufblock = BufferGetBlockNumber(buffer)) != P_ROOT) PageRestoreTempPage(left, p); WriteBuffer(leftbuf); WriteBuffer(rightbuf); /* * Okay, the page is split. We have three things left to do: * * 1) Adjust any active scans on this index to cope with changes we * introduced in its structure by splitting this page. * * 2) "Tighten" the bounding box of the pointer to the left page in the * parent node in the tree, if any. Since we moved a bunch of stuff off * the left page, we expect it to get smaller. This happens in the * internal insertion routine. * * 3) Insert a pointer to the right page in the parent. This may cause * the parent to split. 
If it does, we need to repeat steps one and two * for each split node in the tree. */ /* adjust active scans */ rtadjscans(r, RTOP_SPLIT, bufblock, FirstOffsetNumber); tupDesc = r->rd_att; isnull = (bool *) palloc(r->rd_rel->relnatts * sizeof(bool)); memset(isnull, false, r->rd_rel->relnatts * sizeof(bool)); ltup = index_form_tuple(tupDesc, &(v.spl_ldatum), isnull); rtup = index_form_tuple(tupDesc, &(v.spl_rdatum), isnull); pfree(isnull); pfree(DatumGetPointer(v.spl_ldatum)); pfree(DatumGetPointer(v.spl_rdatum)); /* set pointers to new child pages in the internal index tuples */ ItemPointerSet(&(ltup->t_tid), lbknum, 1); ItemPointerSet(&(rtup->t_tid), rbknum, 1); rtintinsert(r, stack, ltup, rtup, rtstate); pfree(ltup); pfree(rtup); }
/* * replay delete operation of hash index */ static void hash_xlog_delete(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record); Buffer bucketbuf = InvalidBuffer; Buffer deletebuf; Page page; XLogRedoAction action; /* * Ensure we have a cleanup lock on primary bucket page before we start * with the actual replay operation. This is to ensure that neither a * scan can start nor a scan can be already-in-progress during the replay * of this operation. If we allow scans during this operation, then they * can miss some records or show the same record multiple times. */ if (xldata->is_primary_bucket_page) action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf); else { /* * we don't care for return value as the purpose of reading bucketbuf * is to ensure a cleanup lock on primary bucket page. */ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); action = XLogReadBufferForRedo(record, 1, &deletebuf); } /* replay the record for deleting entries in bucket page */ if (action == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 1, &len); page = (Page) BufferGetPage(deletebuf); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items only if * clear_dead_marking flag is set to true. See comments in * hashbucketcleanup() for details. */ if (xldata->clear_dead_marking) { HashPageOpaque pageopaque; pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; } PageSetLSN(page, lsn); MarkBufferDirty(deletebuf); } if (BufferIsValid(deletebuf)) UnlockReleaseBuffer(deletebuf); if (BufferIsValid(bucketbuf)) UnlockReleaseBuffer(bucketbuf); }
/* * replay squeeze page operation of hash index */ static void hash_xlog_squeeze_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); Buffer bucketbuf = InvalidBuffer; Buffer writebuf; Buffer ovflbuf; Buffer prevbuf = InvalidBuffer; Buffer mapbuf; XLogRedoAction action; /* * Ensure we have a cleanup lock on primary bucket page before we start * with the actual replay operation. This is to ensure that neither a * scan can start nor a scan can be already-in-progress during the replay * of this operation. If we allow scans during this operation, then they * can miss some records or show the same record multiple times. */ if (xldata->is_prim_bucket_same_wrt) action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); else { /* * we don't care for return value as the purpose of reading bucketbuf * is to ensure a cleanup lock on primary bucket page. */ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); action = XLogReadBufferForRedo(record, 1, &writebuf); } /* replay the record for adding entries in overflow buffer */ if (action == BLK_NEEDS_REDO) { Page writepage; char *begin; char *data; Size datalen; uint16 ninserted = 0; data = begin = XLogRecGetBlockData(record, 1, &datalen); writepage = (Page) BufferGetPage(writebuf); if (xldata->ntups > 0) { OffsetNumber *towrite = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntups; while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size itemsz; OffsetNumber l; itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); data += itemsz; l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); if (l == InvalidOffsetNumber) elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", (int) itemsz); ninserted++; } } /* * The number of tuples inserted must be the same as requested by the REDO record. */ Assert(ninserted == xldata->ntups); /* * If the page to which we are adding tuples is the page preceding the * freed overflow page, then update its nextblkno. 
*/ if (xldata->is_prev_bucket_same_wrt) { HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage); writeopaque->hasho_nextblkno = xldata->nextblkno; } PageSetLSN(writepage, lsn); MarkBufferDirty(writebuf); } /* replay the record for initializing overflow buffer */ if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) { Page ovflpage; ovflpage = BufferGetPage(ovflbuf); _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); PageSetLSN(ovflpage, lsn); MarkBufferDirty(ovflbuf); } if (BufferIsValid(ovflbuf)) UnlockReleaseBuffer(ovflbuf); /* replay the record for page previous to the freed overflow page */ if (!xldata->is_prev_bucket_same_wrt && XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) { Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); prevopaque->hasho_nextblkno = xldata->nextblkno; PageSetLSN(prevpage, lsn); MarkBufferDirty(prevbuf); } if (BufferIsValid(prevbuf)) UnlockReleaseBuffer(prevbuf); /* replay the record for page next to the freed overflow page */ if (XLogRecHasBlockRef(record, 4)) { Buffer nextbuf; if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) { Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); nextopaque->hasho_prevblkno = xldata->prevblkno; PageSetLSN(nextpage, lsn); MarkBufferDirty(nextbuf); } if (BufferIsValid(nextbuf)) UnlockReleaseBuffer(nextbuf); } if (BufferIsValid(writebuf)) UnlockReleaseBuffer(writebuf); if (BufferIsValid(bucketbuf)) UnlockReleaseBuffer(bucketbuf); /* * Note: in normal operation, we'd update the bitmap and meta page while * still holding lock on the primary bucket page and overflow pages. But * during replay it's not necessary to hold those locks, since no other * index updates can be happening concurrently. */ /* replay the record for bitmap page */ if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) { Page mappage = (Page) BufferGetPage(mapbuf); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; Size datalen; freep = HashPageGetBitmap(mappage); data = XLogRecGetBlockData(record, 5, &datalen); bitmap_page_bit = (uint32 *) data; CLRBIT(freep, *bitmap_page_bit); PageSetLSN(mappage, lsn); MarkBufferDirty(mapbuf); } if (BufferIsValid(mapbuf)) UnlockReleaseBuffer(mapbuf); /* replay the record for meta page */ if (XLogRecHasBlockRef(record, 6)) { Buffer metabuf; if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) { HashMetaPage metap; Page page; char *data; uint32 *firstfree_ovflpage; Size datalen; data = XLogRecGetBlockData(record, 6, &datalen); firstfree_ovflpage = (uint32 *) data; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_firstfree = *firstfree_ovflpage; PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); } }
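The block data decoded at the top of this replay routine is laid out as an array of target offsets followed by the MAXALIGNed tuples themselves, and the loop just advances a cursor until it has consumed datalen bytes. The standalone sketch below walks a buffer with the same shape; the length-prefixed payloads and the local MAXALIGN stand-in are illustrative only, since real index tuples carry their size in their own header.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALIGNOF_MAX   8                  /* stand-in for MAXIMUM_ALIGNOF */
#define MAXALIGN(len) (((len) + ALIGNOF_MAX - 1) & ~(size_t) (ALIGNOF_MAX - 1))

int
main(void)
{
    /* Encode: two 16-bit target offsets, then two length-prefixed payloads,
     * each padded out to ALIGNOF_MAX, mimicking the layout the redo loop
     * above walks (offsets first, then MAXALIGNed tuples). */
    unsigned char buf[128];
    uint16_t offsets[2] = {1, 2};
    const char *payloads[2] = {"abc", "longer tuple"};
    size_t  pos = sizeof(offsets);

    memcpy(buf, offsets, sizeof(offsets));
    for (int i = 0; i < 2; i++)
    {
        uint32_t len = (uint32_t) strlen(payloads[i]) + 1;

        memcpy(buf + pos, &len, sizeof(len));
        memcpy(buf + pos + sizeof(len), payloads[i], len);
        pos += MAXALIGN(sizeof(len) + len);
    }

    /* Decode with the same "cursor < datalen" walk used in the redo code. */
    size_t  datalen = pos;
    size_t  cur = sizeof(offsets);
    int     ninserted = 0;

    while (cur < datalen)
    {
        uint32_t len;
        uint16_t target;

        memcpy(&target, buf + ninserted * sizeof(uint16_t), sizeof(target));
        memcpy(&len, buf + cur, sizeof(len));
        printf("offset %u gets %u bytes: %s\n", (unsigned) target,
               (unsigned) len, (const char *) (buf + cur + sizeof(len)));
        cur += MAXALIGN(sizeof(len) + len);
        ninserted++;
    }
    return 0;
}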
/* * replay allocation of page for split operation */ static void hash_xlog_split_allocate_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record); Buffer oldbuf; Buffer newbuf; Buffer metabuf; Size datalen PG_USED_FOR_ASSERTS_ONLY; char *data; XLogRedoAction action; /* * To be consistent with normal operation, here we take cleanup locks on * both the old and new buckets even though there can't be any concurrent * inserts. */ /* replay the record for old bucket */ action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf); /* * Note that we still update the page even if it was restored from a full * page image, because the special space is not included in the image. */ if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) { Page oldpage; HashPageOpaque oldopaque; oldpage = BufferGetPage(oldbuf); oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); oldopaque->hasho_flag = xlrec->old_bucket_flag; oldopaque->hasho_prevblkno = xlrec->new_bucket; PageSetLSN(oldpage, lsn); MarkBufferDirty(oldbuf); } /* replay the record for new bucket */ newbuf = XLogInitBufferForRedo(record, 1); _hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket, xlrec->new_bucket_flag, true); if (!IsBufferCleanupOK(newbuf)) elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock"); MarkBufferDirty(newbuf); PageSetLSN(BufferGetPage(newbuf), lsn); /* * We could release the lock on the old bucket earlier, but we do it here * to be consistent with normal operation. */ if (BufferIsValid(oldbuf)) UnlockReleaseBuffer(oldbuf); if (BufferIsValid(newbuf)) UnlockReleaseBuffer(newbuf); /* * Note: in normal operation, we'd update the meta page while still * holding lock on the old and new bucket pages. But during replay it's * not necessary to hold those locks, since no other bucket splits can be * happening concurrently. */ /* replay the record for metapage changes */ if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) { Page page; HashMetaPage metap; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_maxbucket = xlrec->new_bucket; data = XLogRecGetBlockData(record, 2, &datalen); if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) { uint32 lowmask; uint32 *highmask; /* extract low and high masks. */ memcpy(&lowmask, data, sizeof(uint32)); highmask = (uint32 *) ((char *) data + sizeof(uint32)); /* update metapage */ metap->hashm_lowmask = lowmask; metap->hashm_highmask = *highmask; data += sizeof(uint32) * 2; } if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) { uint32 ovflpoint; uint32 *ovflpages; /* extract information of overflow pages. */ memcpy(&ovflpoint, data, sizeof(uint32)); ovflpages = (uint32 *) ((char *) data + sizeof(uint32)); /* update metapage */ metap->hashm_spares[ovflpoint] = *ovflpages; metap->hashm_ovflpoint = ovflpoint; } MarkBufferDirty(metabuf); PageSetLSN(BufferGetPage(metabuf), lsn); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
/* * _hash_step() -- step to the next valid item in a scan in the bucket. * * If no valid record exists in the requested direction, return * false. Else, return true and set the hashso_curpos for the * scan to the right thing. * * 'bufP' points to the current buffer, which is pinned and read-locked. * On success exit, we have pin and read-lock on whichever page * contains the right item; on failure, we have released all buffers. */ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; ItemPointer current; Buffer buf; Page page; HashPageOpaque opaque; OffsetNumber maxoff; OffsetNumber offnum; BlockNumber blkno; IndexTuple itup; current = &(so->hashso_curpos); buf = *bufP; _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* * If _hash_step is called from _hash_first, current will not be valid, so * we can't dereference it. However, in that case, we presumably want to * start at the beginning/end of the page... */ maxoff = PageGetMaxOffsetNumber(page); if (ItemPointerIsValid(current)) offnum = ItemPointerGetOffsetNumber(current); else offnum = InvalidOffsetNumber; /* * 'offnum' now points to the last tuple we examined (if any). * * continue to step through tuples until: 1) we get to the end of the * bucket chain or 2) we find a valid tuple. */ do { switch (dir) { case ForwardScanDirection: if (offnum != InvalidOffsetNumber) offnum = OffsetNumberNext(offnum); /* move forward */ else { /* new page, locate starting position by binary search */ offnum = _hash_binsearch(page, so->hashso_sk_hash); } for (;;) { /* * check if we're still in the range of items with the * target hash key */ if (offnum <= maxoff) { Assert(offnum >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } /* * ran off the end of this page, try the next */ _hash_readnext(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) { maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch(page, so->hashso_sk_hash); } else { /* end of bucket */ itup = NULL; break; /* exit for-loop */ } } break; case BackwardScanDirection: if (offnum != InvalidOffsetNumber) offnum = OffsetNumberPrev(offnum); /* move back */ else { /* new page, locate starting position by binary search */ offnum = _hash_binsearch_last(page, so->hashso_sk_hash); } for (;;) { /* * check if we're still in the range of items with the * target hash key */ if (offnum >= FirstOffsetNumber) { Assert(offnum <= maxoff); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } /* * ran off the end of this page, try the next */ _hash_readprev(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) { maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch_last(page, so->hashso_sk_hash); } else { /* end of bucket */ itup = NULL; break; /* exit for-loop */ } } break; default: /* NoMovementScanDirection */ /* this should not be reached */ itup = NULL; break; } if (itup == NULL) { /* we ran off the end of the bucket without finding a match */ *bufP = so->hashso_curbuf = InvalidBuffer; ItemPointerSetInvalid(current); return false; } /* check the tuple quals, loop around if not met */ } while (!_hash_checkqual(scan, itup)); /* if we made it to 
here, we've found a valid tuple */ blkno = BufferGetBlockNumber(buf); *bufP = so->hashso_curbuf = buf; ItemPointerSet(current, blkno, offnum); return true; }
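Each time the scan above lands on a new page it calls _hash_binsearch() or _hash_binsearch_last() to locate where tuples with the target hash key begin or end. Conceptually that is a lower-bound binary search; a standalone version over a plain sorted array of hash keys is sketched below (binsearch_first_ge is a hypothetical name, not the actual helper).

#include <stdint.h>
#include <stdio.h>

/* Lower-bound binary search over hash keys sorted ascending: return the
 * first index whose key is >= target, or n if there is none. */
static int
binsearch_first_ge(const uint32_t *keys, int n, uint32_t target)
{
    int     lower = 0;
    int     upper = n;          /* invariant: answer lies in [lower, upper] */

    while (upper > lower)
    {
        int     mid = (upper + lower) / 2;

        if (keys[mid] < target)
            lower = mid + 1;
        else
            upper = mid;
    }
    return lower;
}

int
main(void)
{
    uint32_t keys[] = {10, 10, 17, 25, 25, 90};
    int     n = (int) (sizeof(keys) / sizeof(keys[0]));
    int     pos = binsearch_first_ge(keys, n, 25);

    printf("first key >= 25 is at index %d\n", pos);    /* prints 3 */
    return 0;
}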
/* * btvacuumpage --- VACUUM one page * * This processes a single page for btvacuumscan(). In some cases we * must go back and re-examine previously-scanned pages; this routine * recurses when necessary to handle that case. * * blkno is the page to process. orig_blkno is the highest block number * reached by the outer btvacuumscan loop (the same as blkno, unless we * are recursing to re-examine a previous page). */ static void btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno) { IndexVacuumInfo *info = vstate->info; IndexBulkDeleteResult *stats = vstate->stats; IndexBulkDeleteCallback callback = vstate->callback; void *callback_state = vstate->callback_state; Relation rel = info->index; bool delete_now; BlockNumber recurse_to; Buffer buf; Page page; BTPageOpaque opaque; restart: delete_now = false; recurse_to = P_NONE; /* call vacuum_delay_point while not holding any buffer lock */ vacuum_delay_point(); /* * We can't use _bt_getbuf() here because it always applies * _bt_checkpage(), which will barf on an all-zero page. We want to * recycle all-zero pages, not fail. Also, we want to use a nondefault * buffer access strategy. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBuffer(buf, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!PageIsNew(page)) _bt_checkpage(rel, buf); /* * If we are recursing, the only case we want to do anything with is a * live leaf page having the current vacuum cycle ID. Any other state * implies we already saw the page (eg, deleted it as being empty). */ if (blkno != orig_blkno) { if (_bt_page_recyclable(page) || P_IGNORE(opaque) || !P_ISLEAF(opaque) || opaque->btpo_cycleid != vstate->cycleid) { _bt_relbuf(rel, buf); return; } } /* If the page is in use, update lastUsedPage */ if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno) vstate->lastUsedPage = blkno; /* Page is valid, see what to do with it */ if (_bt_page_recyclable(page)) { /* Okay to recycle this page */ RecordFreeIndexPage(rel, blkno); vstate->totFreePages++; stats->pages_deleted++; } else if (P_ISDELETED(opaque)) { /* Already deleted, but can't recycle yet */ stats->pages_deleted++; } else if (P_ISHALFDEAD(opaque)) { /* Half-dead, try to delete */ delete_now = true; } else if (P_ISLEAF(opaque)) { OffsetNumber deletable[MaxOffsetNumber]; int ndeletable; OffsetNumber offnum, minoff, maxoff; /* * Trade in the initial read lock for a super-exclusive write lock on * this page. We must get such a lock on every leaf page over the * course of the vacuum scan, whether or not it actually contains any * deletable tuples --- see nbtree/README. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockBufferForCleanup(buf); /* * Check whether we need to recurse back to earlier pages. What we * are concerned about is a page split that happened since we started * the vacuum scan. If the split moved some tuples to a lower page * then we might have missed 'em. If so, set up for tail recursion. * (Must do this before possibly clearing btpo_cycleid below!) */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid && !(opaque->btpo_flags & BTP_SPLIT_END) && !P_RIGHTMOST(opaque) && opaque->btpo_next < orig_blkno) recurse_to = opaque->btpo_next; /* * Scan over all items to see which ones need deleted according to the * callback function. 
*/ ndeletable = 0; minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); if (callback) { for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); htup = &(itup->t_tid); /* * During Hot Standby we currently assume that * XLOG_BTREE_VACUUM records do not produce conflicts. That is * only true as long as the callback function depends only * upon whether the index tuple refers to heap tuples removed * in the initial heap scan. When vacuum starts it derives a * value of OldestXmin. Backends taking later snapshots could * have a RecentGlobalXmin with a later xid than the vacuum's * OldestXmin, so it is possible that row versions deleted * after OldestXmin could be marked as killed by other * backends. The callback function *could* look at the index * tuple state in isolation and decide to delete the index * tuple, though currently it does not. If it ever did, we * would need to reconsider whether XLOG_BTREE_VACUUM records * should cause conflicts. If they did cause conflicts they * would be fairly harsh conflicts, since we haven't yet * worked out a way to pass a useful value for * latestRemovedXid on the XLOG_BTREE_VACUUM records. This * applies to *any* type of index that marks index tuples as * killed. */ if (callback(htup, callback_state)) deletable[ndeletable++] = offnum; } } /* * Apply any needed deletes. We issue just one _bt_delitems_vacuum() * call per page, so as to minimize WAL traffic. */ if (ndeletable > 0) { BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf); _bt_delitems_vacuum(rel, buf, deletable, ndeletable, vstate->lastBlockVacuumed); /* * Keep track of the block number of the lastBlockVacuumed, so we * can scan those blocks as well during WAL replay. This then * provides concurrency protection and allows btrees to be used * while in recovery. */ if (lastBlockVacuumed > vstate->lastBlockVacuumed) vstate->lastBlockVacuumed = lastBlockVacuumed; stats->tuples_removed += ndeletable; /* must recompute maxoff */ maxoff = PageGetMaxOffsetNumber(page); } else { /* * If the page has been split during this vacuum cycle, it seems * worth expending a write to clear btpo_cycleid even if we don't * have any deletions to do. (If we do, _bt_delitems_vacuum takes * care of this.) This ensures we won't process the page again. * * We treat this like a hint-bit update because there's no need to * WAL-log it. */ if (vstate->cycleid != 0 && opaque->btpo_cycleid == vstate->cycleid) { opaque->btpo_cycleid = 0; SetBufferCommitInfoNeedsSave(buf); } } /* * If it's now empty, try to delete; else count the live tuples. We * don't delete when recursing, though, to avoid putting entries into * freePages out-of-order (doesn't seem worth any extra code to handle * the case). 
*/ if (minoff > maxoff) delete_now = (blkno == orig_blkno); else stats->num_index_tuples += maxoff - minoff + 1; } if (delete_now) { MemoryContext oldcontext; int ndel; /* Run pagedel in a temp context to avoid memory leakage */ MemoryContextReset(vstate->pagedelcontext); oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext); ndel = _bt_pagedel(rel, buf, NULL); /* count only this page, else may double-count parent */ if (ndel) stats->pages_deleted++; MemoryContextSwitchTo(oldcontext); /* pagedel released buffer, so we shouldn't */ } else _bt_relbuf(rel, buf); /* * This is really tail recursion, but if the compiler is too stupid to * optimize it as such, we'd eat an uncomfortably large amount of stack * space per recursion level (due to the deletable[] array). A failure is * improbable since the number of levels isn't likely to be large ... but * just in case, let's hand-optimize into a loop. */ if (recurse_to != P_NONE) { blkno = recurse_to; goto restart; } }
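/*
 * Illustrative sketch (not PostgreSQL code): the "goto restart" at the end of
 * btvacuumpage turns what is logically tail recursion into a loop, so the
 * large on-stack deletable[] array is not duplicated for every level of
 * recursion.  The same shape in miniature, with a hypothetical
 * process_one_page() callback supplied by the caller:
 */
#define NO_PAGE (-1L)

static void
vacuum_page_chain(long pageno,
                  long (*process_one_page)(long pageno, long orig_pageno))
{
    long orig_pageno = pageno;

    for (;;)
    {
        long recurse_to = process_one_page(pageno, orig_pageno);

        if (recurse_to == NO_PAGE)
            break;              /* nothing left to revisit */
        pageno = recurse_to;    /* the "tail call", rewritten as iteration */
    }
}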
/* * Redo recompression of posting list. Doing all the changes in-place is not * always possible, because it might require more space than we've on the page. * Instead, once modification is required we copy unprocessed tail of the page * into separately allocated chunk of memory for further reading original * versions of segments. Thanks to that we don't bother about moving page data * in-place. */ static void ginRedoRecompress(Page page, ginxlogRecompressDataLeaf *data) { int actionno; int segno; GinPostingList *oldseg; Pointer segmentend; char *walbuf; int totalsize; Pointer tailCopy = NULL; Pointer writePtr; Pointer segptr; /* * If the page is in pre-9.4 format, convert to new format first. */ if (!GinPageIsCompressed(page)) { ItemPointer uncompressed = (ItemPointer) GinDataPageGetData(page); int nuncompressed = GinPageGetOpaque(page)->maxoff; int npacked; /* * Empty leaf pages are deleted as part of vacuum, but leftmost and * rightmost pages are never deleted. So, pg_upgrade'd from pre-9.4 * instances might contain empty leaf pages, and we need to handle * them correctly. */ if (nuncompressed > 0) { GinPostingList *plist; plist = ginCompressPostingList(uncompressed, nuncompressed, BLCKSZ, &npacked); totalsize = SizeOfGinPostingList(plist); Assert(npacked == nuncompressed); memcpy(GinDataLeafPageGetPostingList(page), plist, totalsize); } else { totalsize = 0; } GinDataPageSetDataSize(page, totalsize); GinPageSetCompressed(page); GinPageGetOpaque(page)->maxoff = InvalidOffsetNumber; } oldseg = GinDataLeafPageGetPostingList(page); writePtr = (Pointer) oldseg; segmentend = (Pointer) oldseg + GinDataLeafPageGetPostingListSize(page); segno = 0; walbuf = ((char *) data) + sizeof(ginxlogRecompressDataLeaf); for (actionno = 0; actionno < data->nactions; actionno++) { uint8 a_segno = *((uint8 *) (walbuf++)); uint8 a_action = *((uint8 *) (walbuf++)); GinPostingList *newseg = NULL; int newsegsize = 0; ItemPointerData *items = NULL; uint16 nitems = 0; ItemPointerData *olditems; int nolditems; ItemPointerData *newitems; int nnewitems; int segsize; /* Extract all the information we need from the WAL record */ if (a_action == GIN_SEGMENT_INSERT || a_action == GIN_SEGMENT_REPLACE) { newseg = (GinPostingList *) walbuf; newsegsize = SizeOfGinPostingList(newseg); walbuf += SHORTALIGN(newsegsize); } if (a_action == GIN_SEGMENT_ADDITEMS) { memcpy(&nitems, walbuf, sizeof(uint16)); walbuf += sizeof(uint16); items = (ItemPointerData *) walbuf; walbuf += nitems * sizeof(ItemPointerData); } /* Skip to the segment that this action concerns */ Assert(segno <= a_segno); while (segno < a_segno) { /* * Once modification is started and page tail is copied, we've * to copy unmodified segments. */ segsize = SizeOfGinPostingList(oldseg); if (tailCopy) { Assert(writePtr + segsize < PageGetSpecialPointer(page)); memcpy(writePtr, (Pointer) oldseg, segsize); } writePtr += segsize; oldseg = GinNextPostingListSegment(oldseg); segno++; } /* * ADDITEMS action is handled like REPLACE, but the new segment to * replace the old one is reconstructed using the old segment from * disk and the new items from the WAL record. 
*/ if (a_action == GIN_SEGMENT_ADDITEMS) { int npacked; olditems = ginPostingListDecode(oldseg, &nolditems); newitems = ginMergeItemPointers(items, nitems, olditems, nolditems, &nnewitems); Assert(nnewitems == nolditems + nitems); newseg = ginCompressPostingList(newitems, nnewitems, BLCKSZ, &npacked); Assert(npacked == nnewitems); newsegsize = SizeOfGinPostingList(newseg); a_action = GIN_SEGMENT_REPLACE; } segptr = (Pointer) oldseg; if (segptr != segmentend) segsize = SizeOfGinPostingList(oldseg); else { /* * Positioned after the last existing segment. Only INSERTs * expected here. */ Assert(a_action == GIN_SEGMENT_INSERT); segsize = 0; } /* * We're about to start modification of the page. So, copy tail of the * page if it's not done already. */ if (!tailCopy && segptr != segmentend) { int tailSize = segmentend - segptr; tailCopy = (Pointer) palloc(tailSize); memcpy(tailCopy, segptr, tailSize); segptr = tailCopy; oldseg = (GinPostingList *) segptr; segmentend = segptr + tailSize; } switch (a_action) { case GIN_SEGMENT_DELETE: segptr += segsize; segno++; break; case GIN_SEGMENT_INSERT: /* copy the new segment in place */ Assert(writePtr + newsegsize <= PageGetSpecialPointer(page)); memcpy(writePtr, newseg, newsegsize); writePtr += newsegsize; break; case GIN_SEGMENT_REPLACE: /* copy the new version of segment in place */ Assert(writePtr + newsegsize <= PageGetSpecialPointer(page)); memcpy(writePtr, newseg, newsegsize); writePtr += newsegsize; segptr += segsize; segno++; break; default: elog(ERROR, "unexpected GIN leaf action: %u", a_action); } oldseg = (GinPostingList *) segptr; } /* Copy the rest of unmodified segments if any. */ segptr = (Pointer) oldseg; if (segptr != segmentend && tailCopy) { int restSize = segmentend - segptr; Assert(writePtr + restSize <= PageGetSpecialPointer(page)); memcpy(writePtr, segptr, restSize); writePtr += restSize; } totalsize = writePtr - (Pointer) GinDataLeafPageGetPostingList(page); GinDataPageSetDataSize(page, totalsize); }
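/*
 * Illustrative sketch (not PostgreSQL code): ginRedoRecompress avoids
 * overlapping moves by copying the not-yet-processed tail of the page into a
 * scratch buffer the first time it has to modify anything, then reading old
 * segments from that copy while writing new data strictly left to right.
 * The replace_region() signature below is an assumption for the example, and
 * allocation-failure handling is omitted.
 */
#include <stdlib.h>
#include <string.h>

/*
 * Replace buf[off .. off+oldlen) with newdata[0 .. newlen) inside a buffer of
 * which "used" bytes are in use; returns the new used length.  The caller
 * must guarantee that the result fits in the buffer's allocation.
 */
static size_t
replace_region(char *buf, size_t used, size_t off, size_t oldlen,
               const char *newdata, size_t newlen)
{
    size_t  tail_len = used - (off + oldlen);
    char   *tail = NULL;

    if (tail_len > 0)
    {
        tail = malloc(tail_len);                 /* scratch copy of the tail */
        memcpy(tail, buf + off + oldlen, tail_len);
    }
    memmove(buf + off, newdata, newlen);         /* write the replacement */
    if (tail_len > 0)
    {
        memcpy(buf + off + newlen, tail, tail_len);   /* re-append the tail */
        free(tail);
    }
    return off + newlen + tail_len;
}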
Datum bt_page_items(PG_FUNCTION_ARGS) { text *relname = PG_GETARG_TEXT_P(0); uint32 blkno = PG_GETARG_UINT32(1); RangeVar *relrv; Datum result; char *values[BTPAGEITEMS_NCOLUMNS]; BTPageOpaque opaque; HeapTuple tuple; ItemId id; FuncCallContext *fctx; MemoryContext mctx; struct user_args *uargs = NULL; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use pgstattuple functions")))); if (blkno == 0) elog(ERROR, "Block 0 is a meta page."); if (SRF_IS_FIRSTCALL()) { fctx = SRF_FIRSTCALL_INIT(); mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); uargs = palloc(sizeof(struct user_args)); uargs->tupd = RelationNameGetTupleDesc(BTPAGEITEMS_TYPE); uargs->offset = FirstOffsetNumber; relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); uargs->rel = relation_openrv(relrv, AccessShareLock); CHECK_RELATION_BLOCK_RANGE(uargs->rel, blkno); uargs->buffer = ReadBuffer(uargs->rel, blkno); if (!IS_INDEX(uargs->rel) || !IS_BTREE(uargs->rel)) elog(ERROR, "bt_page_items() can be used only on b-tree index."); uargs->page = BufferGetPage(uargs->buffer); opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page); if (P_ISDELETED(opaque)) elog(NOTICE, "bt_page_items(): this page is deleted."); fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); fctx->user_fctx = uargs; MemoryContextSwitchTo(mctx); } fctx = SRF_PERCALL_SETUP(); uargs = fctx->user_fctx; if (fctx->call_cntr < fctx->max_calls) { IndexTuple itup; id = PageGetItemId(uargs->page, uargs->offset); if (!ItemIdIsValid(id)) elog(ERROR, "Invalid ItemId."); itup = (IndexTuple) PageGetItem(uargs->page, id); { int j = 0; BlockNumber blkno = BlockIdGetBlockNumber(&(itup->t_tid.ip_blkid)); values[j] = palloc(32); snprintf(values[j++], 32, "%d", uargs->offset); values[j] = palloc(32); snprintf(values[j++], 32, "(%u,%u)", blkno, itup->t_tid.ip_posid); values[j] = palloc(32); snprintf(values[j++], 32, "%d", (int) IndexTupleSize(itup)); values[j] = palloc(32); snprintf(values[j++], 32, "%c", IndexTupleHasNulls(itup) ? 't' : 'f'); values[j] = palloc(32); snprintf(values[j++], 32, "%c", IndexTupleHasVarwidths(itup) ? 't' : 'f'); { int off; char *dump; char *ptr = (char *) itup + IndexInfoFindDataOffset(itup->t_info); dump = palloc(IndexTupleSize(itup) * 3); memset(dump, 0, IndexTupleSize(itup) * 3); for (off = 0; off < IndexTupleSize(itup) - IndexInfoFindDataOffset(itup->t_info); off++) { if (dump[0] == '\0') sprintf(dump, "%02x", *(ptr + off) & 0xff); else { char buf[4]; sprintf(buf, " %02x", *(ptr + off) & 0xff); strcat(dump, buf); } } values[j] = dump; } tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(uargs->tupd), values); result = TupleGetDatum(TupleDescGetSlot(uargs->tupd), tuple); } uargs->offset = uargs->offset + 1; SRF_RETURN_NEXT(fctx, result); } else { ReleaseBuffer(uargs->buffer); relation_close(uargs->rel, AccessShareLock); SRF_RETURN_DONE(fctx); } }
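/*
 * Illustrative sketch (not PostgreSQL code): the hex dump above grows the
 * output string with repeated strcat(), which rescans the string on every
 * byte and is therefore quadratic.  A linear-time variant simply keeps its
 * own write position.  Buffer management is simplified for the example; the
 * caller is assumed to supply at least 3*len + 1 bytes in out[].
 */
#include <stdio.h>
#include <stddef.h>

static void
format_hex_dump(const unsigned char *data, size_t len, char *out)
{
    char *p = out;

    for (size_t i = 0; i < len; i++)
    {
        if (i > 0)
            *p++ = ' ';
        p += sprintf(p, "%02x", data[i]);   /* two hex digits per byte */
    }
    *p = '\0';
}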
/*------------------------------------------------------- * bt_page_items() * * Get IndexTupleData set in a btree page * * Usage: SELECT * FROM bt_page_items('t1_pkey', 1); *------------------------------------------------------- */ Datum bt_page_items(PG_FUNCTION_ARGS) { text *relname = PG_GETARG_TEXT_PP(0); uint32 blkno = PG_GETARG_UINT32(1); Datum result; FuncCallContext *fctx; MemoryContext mctx; struct user_args *uargs; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use pageinspect functions")))); if (SRF_IS_FIRSTCALL()) { RangeVar *relrv; Relation rel; Buffer buffer; BTPageOpaque opaque; TupleDesc tupleDesc; fctx = SRF_FIRSTCALL_INIT(); relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); rel = relation_openrv(relrv, AccessShareLock); if (!IS_INDEX(rel) || !IS_BTREE(rel)) elog(ERROR, "relation \"%s\" is not a btree index", RelationGetRelationName(rel)); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the * owning session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); if (blkno == 0) elog(ERROR, "block 0 is a meta page"); CHECK_RELATION_BLOCK_RANGE(rel, blkno); buffer = ReadBuffer(rel, blkno); LockBuffer(buffer, BUFFER_LOCK_SHARE); /* * We copy the page into local storage to avoid holding pin on the * buffer longer than we must, and possibly failing to release it at * all if the calling query doesn't fetch all rows. */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); uargs = palloc(sizeof(struct user_args)); uargs->page = palloc(BLCKSZ); memcpy(uargs->page, BufferGetPage(buffer), BLCKSZ); UnlockReleaseBuffer(buffer); relation_close(rel, AccessShareLock); uargs->offset = FirstOffsetNumber; opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page); if (P_ISDELETED(opaque)) elog(NOTICE, "page is deleted"); fctx->max_calls = PageGetMaxOffsetNumber(uargs->page); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); fctx->attinmeta = TupleDescGetAttInMetadata(tupleDesc); fctx->user_fctx = uargs; MemoryContextSwitchTo(mctx); } fctx = SRF_PERCALL_SETUP(); uargs = fctx->user_fctx; if (fctx->call_cntr < fctx->max_calls) { result = bt_page_print_tuples(fctx, uargs->page, uargs->offset); uargs->offset++; SRF_RETURN_NEXT(fctx, result); } else { pfree(uargs->page); pfree(uargs); SRF_RETURN_DONE(fctx); } }
/* * Verify that the given bytea contains a HASH page, or die in the attempt. * A pointer to a palloc'd, properly aligned copy of the page is returned. */ static Page verify_hash_page(bytea *raw_page, int flags) { Page page = get_page_from_raw(raw_page); int pagetype = LH_UNUSED_PAGE; /* Treat new pages as unused. */ if (!PageIsNew(page)) { HashPageOpaque pageopaque; if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index table contains corrupted page"))); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); if (pageopaque->hasho_page_id != HASHO_PAGE_ID) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("page is not a hash page"), errdetail("Expected %08x, got %08x.", HASHO_PAGE_ID, pageopaque->hasho_page_id))); pagetype = pageopaque->hasho_flag & LH_PAGE_TYPE; } /* Check that page type is sane. */ if (pagetype != LH_OVERFLOW_PAGE && pagetype != LH_BUCKET_PAGE && pagetype != LH_BITMAP_PAGE && pagetype != LH_META_PAGE && pagetype != LH_UNUSED_PAGE) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hash page type %08x", pagetype))); /* If requested, verify page type. */ if (flags != 0 && (pagetype & flags) == 0) { switch (flags) { case LH_META_PAGE: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("page is not a hash meta page"))); case LH_BUCKET_PAGE | LH_OVERFLOW_PAGE: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("page is not a hash bucket or overflow page"))); case LH_OVERFLOW_PAGE: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("page is not a hash overflow page"))); default: elog(ERROR, "hash page of type %08x not in mask %08x", pagetype, flags); } } /* * If it is the metapage, also verify magic number and version. */ if (pagetype == LH_META_PAGE) { HashMetaPage metap = HashPageGetMeta(page); if (metap->hashm_magic != HASH_MAGIC) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("invalid magic number for metadata"), errdetail("Expected 0x%08x, got 0x%08x.", HASH_MAGIC, metap->hashm_magic))); if (metap->hashm_version != HASH_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("invalid version for metadata"), errdetail("Expected %d, got %d", HASH_VERSION, metap->hashm_version))); } return page; }
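/*
 * Illustrative sketch (not PostgreSQL code): verify_hash_page treats its
 * "flags" argument as a bitmask of acceptable page types, so a single call
 * can accept several kinds of page (e.g. bucket or overflow), and flags == 0
 * means "any valid type".  The enum values below are invented for the
 * example; only the masking logic mirrors the real check.
 */
#include <stdbool.h>

enum demo_page_type
{
    DEMO_OVERFLOW_PAGE = 1 << 0,
    DEMO_BUCKET_PAGE   = 1 << 1,
    DEMO_BITMAP_PAGE   = 1 << 2,
    DEMO_META_PAGE     = 1 << 3
};

/* Returns true when pagetype is one of the types allowed by the mask. */
static bool
page_type_allowed(int pagetype, int allowed_mask)
{
    /* e.g. page_type_allowed(DEMO_BUCKET_PAGE,
     *                        DEMO_BUCKET_PAGE | DEMO_OVERFLOW_PAGE) == true */
    return allowed_mask == 0 || (pagetype & allowed_mask) != 0;
}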
/* ------------------------------------------------- * GetBTPageStatistics() * * Collect statistics of single b-tree page * ------------------------------------------------- */ static void GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat) { Page page = BufferGetPage(buffer); PageHeader phdr = (PageHeader) page; OffsetNumber maxoff = PageGetMaxOffsetNumber(page); BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); int item_size = 0; int off; stat->blkno = blkno; stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData); stat->dead_items = stat->live_items = 0; stat->page_size = PageGetPageSize(page); /* page type (flags) */ if (P_ISDELETED(opaque)) { stat->type = 'd'; stat->btpo.xact = opaque->btpo.xact; return; } else if (P_IGNORE(opaque)) stat->type = 'e'; else if (P_ISLEAF(opaque)) stat->type = 'l'; else if (P_ISROOT(opaque)) stat->type = 'r'; else stat->type = 'i'; /* btpage opaque data */ stat->btpo_prev = opaque->btpo_prev; stat->btpo_next = opaque->btpo_next; stat->btpo.level = opaque->btpo.level; stat->btpo_flags = opaque->btpo_flags; stat->btpo_cycleid = opaque->btpo_cycleid; /* count live and dead tuples, and free space */ for (off = FirstOffsetNumber; off <= maxoff; off++) { IndexTuple itup; ItemId id = PageGetItemId(page, off); itup = (IndexTuple) PageGetItem(page, id); item_size += IndexTupleSize(itup); if (!ItemIdIsDead(id)) stat->live_items++; else stat->dead_items++; } stat->free_size = PageGetFreeSpace(page); if ((stat->live_items + stat->dead_items) > 0) stat->avg_item_size = item_size / (stat->live_items + stat->dead_items); else stat->avg_item_size = 0; }
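/*
 * Illustrative sketch (not PostgreSQL code): the per-page statistics above
 * reduce to a single pass that sums item sizes, classifies each item as live
 * or dead, and derives an integer-average item size.  The demo_item struct
 * is an assumption made for the example only.
 */
#include <stdbool.h>
#include <stddef.h>

struct demo_item       { size_t size; bool dead; };
struct demo_page_stats { size_t live; size_t dead; size_t avg_item_size; };

static struct demo_page_stats
collect_page_stats(const struct demo_item *items, size_t nitems)
{
    struct demo_page_stats s = {0, 0, 0};
    size_t total = 0;

    for (size_t i = 0; i < nitems; i++)
    {
        total += items[i].size;
        if (items[i].dead)
            s.dead++;
        else
            s.live++;
    }
    if (nitems > 0)
        s.avg_item_size = total / nitems;   /* integer division, as above */
    return s;
}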
/** * @brief Read the left-most leaf page by walking down the index tree structure * from the root node. * * Process flow * -# Open index file and read meta page * -# Get block number of root page * -# Read "fast root" page * -# Read left child page until reaching left-most leaf page * * After calling this function, the members of BTReader are the following: * - smgr : Smgr relation of the existing index file. * - blkno : block number of left-most leaf page. If there is no leaf page, * InvalidBlockNumber is set. * - offnum : InvalidOffsetNumber is set. * - page : Left-most leaf page, or undefined if no leaf page. * * @param reader [in/out] B-Tree index reader * @return true iff there are some tuples */ static bool BTReaderInit(BTReader *reader, Relation rel) { BTPageOpaque metaopaque; BTMetaPageData *metad; BTPageOpaque opaque; BlockNumber blkno; /* * HACK: We cannot use smgropen because smgrs returned from it * will be closed automatically when we assign a new file node. * * XXX: It might be better to open the previous relfilenode with * smgropen *after* RelationSetNewRelfilenode. */ memset(&reader->smgr, 0, sizeof(reader->smgr)); #if PG_VERSION_NUM >= 90100 reader->smgr.smgr_rnode.node = rel->rd_node; reader->smgr.smgr_rnode.backend = rel->rd_backend == MyBackendId ? MyBackendId : InvalidBackendId; #else reader->smgr.smgr_rnode = rel->rd_node; #endif reader->smgr.smgr_which = 0; /* md.c */ reader->blkno = InvalidBlockNumber; reader->offnum = InvalidOffsetNumber; reader->page = palloc(BLCKSZ); /* * Read the meta page and check its sanity. * * XXX: It might be better to do REINDEX against corrupted indexes * instead of raising errors, because we've already spent a long time on * data loading... */ BTReaderReadPage(reader, BTREE_METAPAGE); metaopaque = (BTPageOpaque) PageGetSpecialPointer(reader->page); metad = BTPageGetMeta(reader->page); if (!(metaopaque->btpo_flags & BTP_META) || metad->btm_magic != BTREE_MAGIC) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" is not a btree", RelationGetRelationName(rel)))); if (metad->btm_version != BTREE_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("version mismatch in index \"%s\": file version %d," " code version %d", RelationGetRelationName(rel), metad->btm_version, BTREE_VERSION))); if (metad->btm_root == P_NONE) { /* No root page; we ignore the index in the subsequent build. */ reader->blkno = InvalidBlockNumber; return false; } /* Go to the fast root page. */ blkno = metad->btm_fastroot; BTReaderReadPage(reader, blkno); opaque = (BTPageOpaque) PageGetSpecialPointer(reader->page); /* Walk down to the left-most leaf page */ while (!P_ISLEAF(opaque)) { ItemId firstid; IndexTuple itup; /* Get the block number of the left child */ firstid = PageGetItemId(reader->page, P_FIRSTDATAKEY(opaque)); itup = (IndexTuple) PageGetItem(reader->page, firstid); blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); /* Go down to children */ for (;;) { BTReaderReadPage(reader, blkno); opaque = (BTPageOpaque) PageGetSpecialPointer(reader->page); if (!P_IGNORE(opaque)) break; if (P_RIGHTMOST(opaque)) { /* We reached the end of the index without finding any valid leaves. */ reader->blkno = InvalidBlockNumber; return false; } blkno = opaque->btpo_next; } } return true; }
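/*
 * Illustrative sketch (not PostgreSQL code): descending to the left-most leaf
 * is essentially "follow the first downlink until a leaf is reached".  The
 * toy node structure below is an assumption for the example and deliberately
 * ignores the ignored/half-dead page handling that BTReaderInit must also
 * perform.
 */
#include <stdbool.h>
#include <stddef.h>

struct toy_btree_node
{
    bool                   is_leaf;
    struct toy_btree_node *first_child;   /* left-most downlink, NULL on leaves */
};

static struct toy_btree_node *
leftmost_leaf(struct toy_btree_node *root)
{
    struct toy_btree_node *node = root;

    if (node == NULL)
        return NULL;
    while (!node->is_leaf)
        node = node->first_child;   /* keep taking the first downlink */
    return node;
}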
static void btree_xlog_vacuum(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque opaque; /* * If queries might be active then we need to ensure every leaf page is * unpinned between the lastBlockVacuumed and the current block, if there * are any. This prevents replay of the VACUUM from reaching the stage of * removing heap tuples while there could still be indexscans "in flight" * to those particular tuples (see nbtree/README). * * It might be worth checking if there are actually any backends running; * if not, we could just skip this. * * Since VACUUM can visit leaf pages out-of-order, it might issue records * with lastBlockVacuumed >= block; that's not an error, it just means * nothing to do now. * * Note: since we touch all pages in the range, we will lock non-leaf * pages, and also any empty (all-zero) pages that may be in the index. It * doesn't seem worth the complexity to avoid that. But it's important * that HotStandbyActiveInReplay() will not return true if the database * isn't yet consistent; so we need not fear reading still-corrupt blocks * here during crash recovery. */ if (HotStandbyActiveInReplay()) { RelFileNode thisrnode; BlockNumber thisblkno; BlockNumber blkno; XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno); for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) { /* * We use RBM_NORMAL_NO_LOG mode because it's not an error * condition to see all-zero pages. The original btvacuumpage * scan would have skipped over all-zero pages, noting them in FSM * but not bothering to initialize them just yet; so we mustn't * throw an error here. (We could skip acquiring the cleanup lock * if PageIsNew, but it's probably not worth the cycles to test.) * * XXX we don't actually need to read the block, we just need to * confirm it is unpinned. If we had a special call into the * buffer manager we could optimise this so that if the block is * not in shared_buffers we confirm it as unpinned. */ buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, RBM_NORMAL_NO_LOG); if (BufferIsValid(buffer)) { LockBufferForCleanup(buffer); UnlockReleaseBuffer(buffer); } } } /* * Like in btvacuumpage(), we need to take a cleanup lock on every leaf * page. See nbtree/README for details. */ if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 0, &len); page = (Page) BufferGetPage(buffer); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items --- see comments * in _bt_delitems_vacuum(). */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum hashbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf; HashMetaPage metap; HashMetaPageData local_metapage; tuples_removed = 0; num_index_tuples = 0; /* * Read the metapage to fetch original bucket and tuple counts. Also, we * keep a copy of the last-seen metapage so that we can use its * hashm_spares[] values to compute bucket page addresses. This is a bit * hokey but perfectly safe, since the interesting entries in the spares * array cannot change under us; and it beats rereading the metapage for * each bucket. */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); orig_maxbucket = metap->hashm_maxbucket; orig_ntuples = metap->hashm_ntuples; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; bool bucket_dirty = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket); /* Exclusive-lock the bucket so we can shrink it */ _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Shouldn't have any active scans locally, either */ if (_hash_has_active_scan(rel, cur_bucket)) elog(ERROR, "hash index has active scan during VACUUM"); /* Scan each page in bucket */ blkno = bucket_blkno; while (BlockNumberIsValid(blkno)) { Buffer buf; Page page; HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; vacuum_delay_point(); buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == cur_bucket); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); if (callback(htup, callback_state)) { /* mark the item for deletion */ deletable[ndeletable++] = offno; tuples_removed += 1; } else num_index_tuples += 1; } /* * Apply deletions and write page if needed, advance to next page. 
*/ blkno = opaque->hasho_nextblkno; if (ndeletable > 0) { PageIndexMultiDelete(page, deletable, ndeletable); _hash_wrtbuf(rel, buf); bucket_dirty = true; } else _hash_relbuf(rel, buf); } /* If we deleted anything, try to compact free space */ if (bucket_dirty) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy); /* Release bucket lock */ _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Advance to next bucket */ cur_bucket++; } /* Write-lock metapage and check for split since we started */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ cur_maxbucket = metap->hashm_maxbucket; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } _hash_wrtbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ PG_RETURN_POINTER(stats); }
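/*
 * Illustrative sketch (not PostgreSQL code): the "loop_top" pattern above
 * scans buckets up to a snapshot of the bucket count and then re-reads the
 * count under lock; if a concurrent split added buckets in the meantime, the
 * scan continues from where it stopped and covers just the new ones.
 * read_current_max() and clean_bucket() are hypothetical callbacks supplied
 * for the example.
 */
static void
scan_all_buckets(unsigned (*read_current_max)(void),
                 void (*clean_bucket)(unsigned bucket))
{
    unsigned cur_bucket = 0;
    unsigned cur_max = read_current_max();

    for (;;)
    {
        while (cur_bucket <= cur_max)
            clean_bucket(cur_bucket++);

        unsigned new_max = read_current_max();

        if (new_max == cur_max)
            break;              /* no split since we started: done */
        cur_max = new_max;      /* process the buckets added by the split */
    }
}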
static void btree_xlog_unlink_page(uint8 info, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); BlockNumber leftsib; BlockNumber rightsib; Buffer buffer; Page page; BTPageOpaque pageop; leftsib = xlrec->leftsib; rightsib = xlrec->rightsib; /* * In normal operation, we would lock all the pages this WAL record * touches before changing any of them. In WAL replay, it should be okay * to lock just one page at a time, since no concurrent index updates can * be happening, and readers should not care whether they arrive at the * target page or not (since it's surely empty). */ /* Fix left-link of right sibling */ if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = leftsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* Fix right-link of left sibling, if any */ if (leftsib != P_NONE) { if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_next = rightsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } /* Rewrite target page as empty deleted page */ buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = leftsib; pageop->btpo_next = rightsib; pageop->btpo.xact = xlrec->btpo_xact; pageop->btpo_flags = BTP_DELETED; pageop->btpo_cycleid = 0; PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); /* * If we deleted a parent of the targeted leaf page, instead of the leaf * itself, update the leaf to point to the next remaining child in the * branch. */ if (XLogRecHasBlockRef(record, 3)) { /* * There is no real data on the page, so we just re-create it from * scratch using the information from the WAL record. */ IndexTupleData trunctuple; buffer = XLogInitBufferForRedo(record, 3); page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_prev = xlrec->leafleftsib; pageop->btpo_next = xlrec->leafrightsib; pageop->btpo.level = 0; pageop->btpo_cycleid = 0; /* Add a dummy hikey item */ MemSet(&trunctuple, 0, sizeof(IndexTupleData)); trunctuple.t_info = sizeof(IndexTupleData); if (xlrec->topparent != InvalidBlockNumber) ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY); else ItemPointerSetInvalid(&trunctuple.t_tid); if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } /* Update metapage if needed */ if (info == XLOG_BTREE_UNLINK_PAGE_META) _bt_restore_meta(record, 4); }
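/*
 * Illustrative sketch (not PostgreSQL code): at its core the replay above is
 * a doubly linked list unlink -- point the right sibling's back-link at the
 * left sibling and, if there is a left sibling, its forward link at the right
 * sibling.  The toy list node below is an assumption for the example.
 */
#include <stddef.h>

struct toy_page
{
    struct toy_page *prev;
    struct toy_page *next;
};

static void
unlink_page(struct toy_page *target)
{
    if (target->next != NULL)
        target->next->prev = target->prev;  /* fix left-link of right sibling */
    if (target->prev != NULL)
        target->prev->next = target->next;  /* fix right-link of left sibling */
    target->prev = target->next = NULL;     /* target now stands alone */
}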
/* * _hash_checkpage -- sanity checks on the format of all hash pages * * If flags is not zero, it is a bitwise OR of the acceptable values of * hasho_flag. */ void _hash_checkpage(Relation rel, Buffer buf, int flags) { Page page = BufferGetPage(buf); /* * ReadBuffer verifies that every newly-read page passes * PageHeaderIsValid, which means it either contains a reasonably sane * page header or is all-zero. We have to defend against the all-zero * case, however. */ if (PageIsNew(page)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains unexpected zero page at block %u", RelationGetRelationName(rel), BufferGetBlockNumber(buf)), errhint("Please REINDEX it."))); /* * Additionally check that the special area looks sane. */ if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains corrupted page at block %u", RelationGetRelationName(rel), BufferGetBlockNumber(buf)), errhint("Please REINDEX it."))); if (flags) { HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page); if ((opaque->hasho_flag & flags) == 0) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains corrupted page at block %u", RelationGetRelationName(rel), BufferGetBlockNumber(buf)), errhint("Please REINDEX it."))); } /* * When checking the metapage, also verify magic number and version. */ if (flags == LH_META_PAGE) { HashMetaPage metap = HashPageGetMeta(page); if (metap->hashm_magic != HASH_MAGIC) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" is not a hash index", RelationGetRelationName(rel)))); if (metap->hashm_version != HASH_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" has wrong hash version", RelationGetRelationName(rel)), errhint("Please REINDEX it."))); } }
/* * _hash_squeezebucket(rel, bucket) * * Try to squeeze the tuples onto pages occurring earlier in the * bucket chain in an attempt to free overflow pages. When we start * the "squeezing", the page from which we start taking tuples (the * "read" page) is the last bucket in the bucket chain and the page * onto which we start squeezing tuples (the "write" page) is the * first page in the bucket chain. The read page works backward and * the write page works forward; the procedure terminates when the * read page and write page are the same page. * * At completion of this procedure, it is guaranteed that all pages in * the bucket are nonempty, unless the bucket is totally empty (in * which case all overflow pages will be freed). The original implementation * required that to be true on entry as well, but it's a lot easier for * callers to leave empty overflow pages and let this guy clean it up. * * Caller must hold exclusive lock on the target bucket. This allows * us to safely lock multiple pages in the bucket. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. */ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy) { BlockNumber wblkno; BlockNumber rblkno; Buffer wbuf; Buffer rbuf; Page wpage; Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; bool wbuf_dirty; /* * start squeezing into the base bucket page. */ wblkno = bucket_blkno; wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE, bstrategy); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); /* * if there aren't any overflow pages, there's nothing to squeeze. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) { _hash_relbuf(rel, wbuf); return; } /* * Find the last page in the bucket chain by starting at the base bucket * page and working forward. Note: we assume that a hash bucket chain is * usually smaller than the buffer ring being used by VACUUM, else using * the access strategy here would be counterproductive. */ rbuf = InvalidBuffer; ropaque = wopaque; do { rblkno = ropaque->hasho_nextblkno; if (rbuf != InvalidBuffer) _hash_relbuf(rel, rbuf); rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); /* * squeeze the tuples. */ wbuf_dirty = false; for (;;) { OffsetNumber roffnum; OffsetNumber maxroffnum; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; /* Scan each tuple in "read" page */ maxroffnum = PageGetMaxOffsetNumber(rpage); for (roffnum = FirstOffsetNumber; roffnum <= maxroffnum; roffnum = OffsetNumberNext(roffnum)) { IndexTuple itup; Size itemsz; itup = (IndexTuple) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* * Walk up the bucket chain, looking for a page big enough for * this item. Exit if we reach the read page. 
*/ while (PageGetFreeSpace(wpage) < itemsz) { Assert(!PageIsEmpty(wpage)); wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); if (wbuf_dirty) _hash_wrtbuf(rel, wbuf); else _hash_relbuf(rel, wbuf); /* nothing more to do if we reached the read page */ if (rblkno == wblkno) { if (ndeletable > 0) { /* Delete tuples we already moved off read page */ PageIndexMultiDelete(rpage, deletable, ndeletable); _hash_wrtbuf(rel, rbuf); } else _hash_relbuf(rel, rbuf); return; } wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); wbuf_dirty = false; } /* * we have found room so insert on the "write" page, being careful * to preserve hashkey ordering. (If we insert many tuples into * the same "write" page it would be worth qsort'ing instead of * doing repeated _hash_pgaddtup.) */ (void) _hash_pgaddtup(rel, wbuf, itemsz, itup); wbuf_dirty = true; /* remember tuple for deletion from "read" page */ deletable[ndeletable++] = roffnum; } /* * If we reach here, there are no live tuples on the "read" page --- * it was empty when we got to it, or we moved them all. So we can * just free the page without bothering with deleting tuples * individually. Then advance to the previous "read" page. * * Tricky point here: if our read and write pages are adjacent in the * bucket chain, our write lock on wbuf will conflict with * _hash_freeovflpage's attempt to update the sibling links of the * removed page. However, in that case we are done anyway, so we can * simply drop the write lock before calling _hash_freeovflpage. */ rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); /* are we freeing the page adjacent to wbuf? */ if (rblkno == wblkno) { /* yes, so release wbuf lock first */ if (wbuf_dirty) _hash_wrtbuf(rel, wbuf); else _hash_relbuf(rel, wbuf); /* free this overflow page (releases rbuf) */ _hash_freeovflpage(rel, rbuf, bstrategy); /* done */ return; } /* free this overflow page, then get the previous one */ _hash_freeovflpage(rel, rbuf, bstrategy); rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } /* NOTREACHED */ }
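/*
 * Illustrative sketch (not PostgreSQL code): squeezing a bucket chain moves
 * items from the last page toward the first page that still has room, with a
 * write cursor walking forward and a read cursor walking backward until they
 * meet.  Here a "page" is just a fill count with a fixed capacity; the real
 * code moves whole tuples and frees each overflow page it empties.
 */
static void
squeeze_chain(unsigned *fill, unsigned npages, unsigned capacity)
{
    unsigned w;                 /* write page: front of the chain */
    unsigned r;                 /* read page: end of the chain */

    if (npages == 0)
        return;
    w = 0;
    r = npages - 1;

    while (w < r)
    {
        if (fill[r] == 0)
            r--;                /* read page emptied: step back */
        else if (fill[w] < capacity)
        {
            fill[w]++;          /* move one item forward */
            fill[r]--;
        }
        else
            w++;                /* write page full: advance */
    }
}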
/* * _hash_first() -- Find the first item in a scan. * * Find the first item in the index that * satisfies the qualification associated with the scan descriptor. On * success, the page containing the current index tuple is read locked * and pinned, and the scan's opaque data entry is updated to * include the buffer. */ bool _hash_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; ScanKey cur; uint32 hashkey; Bucket bucket; BlockNumber blkno; Buffer buf; Buffer metabuf; Page page; HashPageOpaque opaque; HashMetaPage metap; IndexTuple itup; ItemPointer current; OffsetNumber offnum; pgstat_count_index_scan(rel); current = &(so->hashso_curpos); ItemPointerSetInvalid(current); /* * We do not support hash scans with no index qualification, because we * would have to read the whole index rather than just one bucket. That * creates a whole raft of problems, since we haven't got a practical way * to lock all the buckets against splits or compactions. */ if (scan->numberOfKeys < 1) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("hash indexes do not support whole-index scans"))); /* There may be more than one index qual, but we hash only the first */ cur = &scan->keyData[0]; /* We support only single-column hash indexes */ Assert(cur->sk_attno == 1); /* And there's only one operator strategy, too */ Assert(cur->sk_strategy == HTEqualStrategyNumber); /* * If the constant in the index qual is NULL, assume it cannot match any * items in the index. */ if (cur->sk_flags & SK_ISNULL) return false; /* * Okay to compute the hash key. We want to do this before acquiring any * locks, in case a user-defined hash function happens to be slow. * * If scankey operator is not a cross-type comparison, we can use the * cached hash function; otherwise gotta look it up in the catalogs. * * We support the convention that sk_subtype == InvalidOid means the * opclass input type; this is a hack to simplify life for ScanKeyInit(). */ if (cur->sk_subtype == rel->rd_opcintype[0] || cur->sk_subtype == InvalidOid) hashkey = _hash_datum2hashkey(rel, cur->sk_argument); else hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument, cur->sk_subtype); so->hashso_sk_hash = hashkey; /* * Acquire shared split lock so we can compute the target bucket safely * (see README). */ _hash_getlock(rel, 0, HASH_SHARE); /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Compute the target bucket number, and convert to block number. */ bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); blkno = BUCKET_TO_BLKNO(metap, bucket); /* done with the metapage */ _hash_relbuf(rel, metabuf); /* * Acquire share lock on target bucket; then we can release split lock. 
*/ _hash_getlock(rel, blkno, HASH_SHARE); _hash_droplock(rel, 0, HASH_SHARE); /* Update scan opaque state to show we have lock on the bucket */ so->hashso_bucket = bucket; so->hashso_bucket_valid = true; so->hashso_bucket_blkno = blkno; /* Fetch the primary bucket page for the bucket */ buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == bucket); /* If a backwards scan is requested, move to the end of the chain */ if (ScanDirectionIsBackward(dir)) { while (BlockNumberIsValid(opaque->hasho_nextblkno)) _hash_readnext(rel, &buf, &page, &opaque); } /* Now find the first tuple satisfying the qualification */ if (!_hash_step(scan, &buf, dir)) return false; /* if we're here, _hash_step found a valid tuple */ offnum = ItemPointerGetOffsetNumber(current); _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); so->hashso_heappos = itup->t_tid; return true; }
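/*
 * Illustrative sketch: the bucket computation used above is the usual
 * linear-hashing trick -- mask the hash value with the current "high" mask
 * and, if that lands beyond the highest bucket created so far, fall back to
 * the previous "low" mask.  Parameter names below are chosen for the example.
 */
#include <stdint.h>

static uint32_t
hashkey_to_bucket(uint32_t hashkey, uint32_t maxbucket,
                  uint32_t highmask, uint32_t lowmask)
{
    uint32_t bucket = hashkey & highmask;

    if (bucket > maxbucket)
        bucket = bucket & lowmask;  /* bucket not yet split: use smaller table */
    return bucket;
}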
/* * Helper function to perform deletion of index entries from a bucket. * * This function expects that the caller has acquired a cleanup lock on the * primary bucket page, and will return with a write lock again held on the * primary bucket page. The lock won't necessarily be held continuously, * though, because we'll release it when visiting overflow pages. * * It would be very bad if this function cleaned a page while some other * backend was in the midst of scanning it, because hashgettuple assumes * that the next valid TID will be greater than or equal to the current * valid TID. There can't be any concurrent scans in progress when we first * enter this function because of the cleanup lock we hold on the primary * bucket page, but as soon as we release that lock, there might be. We * handle that by conspiring to prevent those scans from passing our cleanup * scan. To do that, we lock the next page in the bucket chain before * releasing the lock on the previous page. (This type of lock chaining is * not ideal, so we might want to look for a better solution at some point.) * * We need to retain a pin on the primary bucket to ensure that no concurrent * split can start. */ void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state) { BlockNumber blkno; Buffer buf; Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; bool bucket_dirty = false; blkno = bucket_blkno; buf = bucket_buf; if (split_cleanup) new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, lowmask, maxbucket); /* Scan each page in bucket */ for (;;) { HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; Buffer next_buf; Page page; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; bool retain_pin = false; bool clear_dead_marking = false; vacuum_delay_point(); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { ItemPointer htup; IndexTuple itup; Bucket bucket; bool kill_tuple = false; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); /* * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. */ if (callback && callback(htup, callback_state)) { kill_tuple = true; if (tuples_removed) *tuples_removed += 1; } else if (split_cleanup) { /* delete the tuples that are moved by split. */ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); /* mark the item for deletion */ if (bucket != cur_bucket) { /* * We expect tuples to either belong to current bucket or * new_bucket. This is ensured because we don't allow * further splits from bucket that contains garbage. See * comments in _hash_expandtable. 
*/ Assert(bucket == new_bucket); kill_tuple = true; } } if (kill_tuple) { /* mark the item for deletion */ deletable[ndeletable++] = offno; } else { /* we're keeping it, so count it */ if (num_index_tuples) *num_index_tuples += 1; } } /* retain the pin on primary bucket page till end of bucket scan */ if (blkno == bucket_blkno) retain_pin = true; else retain_pin = false; blkno = opaque->hasho_nextblkno; /* * Apply deletions, advance to next page and write page if needed. */ if (ndeletable > 0) { /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; /* * Let us mark the page as clean if vacuum removes the DEAD tuples * from an index page. We do this by clearing * LH_PAGE_HAS_DEAD_TUPLES flag. */ if (tuples_removed && *tuples_removed > 0 && H_HAS_DEAD_TUPLES(opaque)) { opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; clear_dead_marking = true; } MarkBufferDirty(buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_delete xlrec; XLogRecPtr recptr; xlrec.clear_dead_marking = clear_dead_marking; xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashDelete); /* * bucket buffer needs to be registered to ensure that we can * acquire a cleanup lock on it during replay. */ if (!xlrec.is_primary_bucket_page) XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, buf, REGBUF_STANDARD); XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber)); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); PageSetLSN(BufferGetPage(buf), recptr); } END_CRIT_SECTION(); } /* bail out if there are no more pages to scan. */ if (!BlockNumberIsValid(blkno)) break; next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); /* * release the lock on previous page after acquiring the lock on next * page */ if (retain_pin) LockBuffer(buf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, buf); buf = next_buf; } /* * lock the bucket page to clear the garbage flag and squeeze the bucket. * if the current buffer is same as bucket buffer, then we already have * lock on bucket page. */ if (buf != bucket_buf) { _hash_relbuf(rel, buf); LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); } /* * Clear the garbage flag from bucket after deleting the tuples that are * moved by split. We purposefully clear the flag before squeeze bucket, * so that after restart, vacuum shouldn't again try to delete the moved * by split tuples. */ if (split_cleanup) { HashPageOpaque bucket_opaque; Page page; page = BufferGetPage(bucket_buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; MarkBufferDirty(bucket_buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); PageSetLSN(page, recptr); } END_CRIT_SECTION(); } /* * If we have deleted anything, try to compact free space. For squeezing * the bucket, we must have a cleanup lock, else it can impact the * ordering of tuples for a scan that has started before it. */ if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy); else LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); }
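/*
 * Illustrative sketch (not PostgreSQL code): the rule above of locking the
 * next page in the chain before releasing the previous one is classic
 * hand-over-hand ("lock chaining") traversal.  The pthread-based list below
 * shows the same discipline; visit() is a hypothetical per-node callback.
 */
#include <pthread.h>
#include <stddef.h>

struct chained_node
{
    pthread_mutex_t      lock;
    struct chained_node *next;
};

static void
walk_chained(struct chained_node *head, void (*visit)(struct chained_node *))
{
    if (head == NULL)
        return;

    pthread_mutex_lock(&head->lock);
    while (head != NULL)
    {
        struct chained_node *next = head->next;

        visit(head);
        if (next != NULL)
            pthread_mutex_lock(&next->lock);    /* acquire the next lock first... */
        pthread_mutex_unlock(&head->lock);      /* ...then release the current one */
        head = next;
    }
}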
/* * replay delete operation in hash index to remove * tuples marked as DEAD during index tuple insertion. */ static void hash_xlog_vacuum_one_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_vacuum_one_page *xldata; Buffer buffer; Buffer metabuf; Page page; XLogRedoAction action; HashPageOpaque pageopaque; xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record); /* * If we have any conflict processing to do, it must happen before we * update the page. * * Hash index records that are marked as LP_DEAD and being removed during * hash index tuple insertion can conflict with standby queries. You might * think that vacuum records would conflict as well, but we've handled * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid * cleaned by the vacuum of the heap and so we can resolve any conflicts * just once when that arrives. After that we know that no conflicts * exist from individual hash index vacuum records on that index. */ if (InHotStandby) { TransactionId latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record); RelFileNode rnode; XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); } action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer); if (action == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 0, &len); page = (Page) BufferGetPage(buffer); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items. See comments * in _hash_vacuum_one_page() for details. */ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) { Page metapage; HashMetaPage metap; metapage = BufferGetPage(metabuf); metap = HashPageGetMeta(metapage); metap->hashm_ntuples -= xldata->ntuples; PageSetLSN(metapage, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
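/*
 * Illustrative sketch (not PostgreSQL code): applying the WAL record's array
 * of dead offsets is conceptually a one-pass "delete many indexes from an
 * array" operation like the one below.  The items are plain integers here;
 * PageIndexMultiDelete additionally compacts the page's item storage.
 */
#include <stddef.h>

/*
 * Remove the elements whose (ascending) indexes are listed in kill[] from
 * items[], returning the new element count.
 */
static size_t
delete_marked(int *items, size_t nitems, const size_t *kill, size_t nkill)
{
    size_t src, dst = 0, k = 0;

    for (src = 0; src < nitems; src++)
    {
        if (k < nkill && kill[k] == src)
        {
            k++;                /* marked for deletion: skip it */
            continue;
        }
        items[dst++] = items[src];
    }
    return dst;
}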