/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation rel = info->index;
    double tuples_removed;
    double num_index_tuples;
    double orig_ntuples;
    Bucket orig_maxbucket;
    Bucket cur_maxbucket;
    Bucket cur_bucket;
    Buffer metabuf;
    HashMetaPage metap;
    HashMetaPageData local_metapage;

    tuples_removed = 0;
    num_index_tuples = 0;

    /*
     * Read the metapage to fetch original bucket and tuple counts. Also, we
     * keep a copy of the last-seen metapage so that we can use its
     * hashm_spares[] values to compute bucket page addresses. This is a bit
     * hokey but perfectly safe, since the interesting entries in the spares
     * array cannot change under us; and it beats rereading the metapage for
     * each bucket.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));
    orig_maxbucket = metap->hashm_maxbucket;
    orig_ntuples = metap->hashm_ntuples;
    memcpy(&local_metapage, metap, sizeof(local_metapage));
    _hash_relbuf(rel, metabuf);

    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

loop_top:
    while (cur_bucket <= cur_maxbucket)
    {
        BlockNumber bucket_blkno;
        BlockNumber blkno;
        bool bucket_dirty = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

        /* Exclusive-lock the bucket so we can shrink it */
        _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Shouldn't have any active scans locally, either */
        if (_hash_has_active_scan(rel, cur_bucket))
            elog(ERROR, "hash index has active scan during VACUUM");

        /* Scan each page in bucket */
        blkno = bucket_blkno;
        while (BlockNumberIsValid(blkno))
        {
            Buffer buf;
            Page page;
            HashPageOpaque opaque;
            OffsetNumber offno;
            OffsetNumber maxoffno;
            OffsetNumber deletable[MaxOffsetNumber];
            int ndeletable = 0;

            vacuum_delay_point();

            buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                             LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                             info->strategy);
            page = BufferGetPage(buf);
            opaque = (HashPageOpaque) PageGetSpecialPointer(page);
            Assert(opaque->hasho_bucket == cur_bucket);

            /* Scan each tuple in page */
            maxoffno = PageGetMaxOffsetNumber(page);
            for (offno = FirstOffsetNumber;
                 offno <= maxoffno;
                 offno = OffsetNumberNext(offno))
            {
                IndexTuple itup;
                ItemPointer htup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offno));
                htup = &(itup->t_tid);
                if (callback(htup, callback_state))
                {
                    /* mark the item for deletion */
                    deletable[ndeletable++] = offno;
                    tuples_removed += 1;
                }
                else
                    num_index_tuples += 1;
            }

            /*
             * Apply deletions and write page if needed, advance to next page.
             */
            blkno = opaque->hasho_nextblkno;

            if (ndeletable > 0)
            {
                PageIndexMultiDelete(page, deletable, ndeletable);
                _hash_wrtbuf(rel, buf);
                bucket_dirty = true;
            }
            else
                _hash_relbuf(rel, buf);
        }

        /* If we deleted anything, try to compact free space */
        if (bucket_dirty)
            _hash_squeezebucket(rel, cur_bucket, bucket_blkno,
                                info->strategy);

        /* Release bucket lock */
        _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Advance to next bucket */
        cur_bucket++;
    }

    /* Write-lock metapage and check for split since we started */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        cur_maxbucket = metap->hashm_maxbucket;
        memcpy(&local_metapage, metap, sizeof(local_metapage));
        _hash_relbuf(rel, metabuf);
        goto loop_top;
    }

    /* Okay, we're really done.  Update tuple count in metapage. */

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
         * (Note: we still return estimated_count = false, because using this
         * count is better than not updating reltuples at all.)
         */
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    _hash_wrtbuf(rel, metabuf);

    /* return statistics */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;
    /* hashvacuumcleanup will fill in num_pages */

    return stats;
}
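/*
 * Illustrative sketch (not part of the original source): hashbulkdelete only
 * consults the IndexBulkDeleteCallback to decide whether an index entry's
 * heap TID is dead.  The callback signature comes from PostgreSQL's genam.h;
 * the DeadTidSet struct and example_tid_reaped function below are
 * hypothetical names used purely for illustration.  VACUUM's real callback
 * searches a sorted TID array; this sketch uses a simple linear scan just to
 * make the contract visible: return true iff the heap tuple identified by
 * itemptr is being deleted.
 */
typedef struct DeadTidSet
{
    ItemPointerData *tids;      /* TIDs of heap tuples known to be dead */
    int ntids;
} DeadTidSet;

static bool
example_tid_reaped(ItemPointer itemptr, void *state)
{
    DeadTidSet *set = (DeadTidSet *) state;
    int i;

    for (i = 0; i < set->ntids; i++)
    {
        if (ItemPointerEquals(itemptr, &set->tids[i]))
            return true;        /* index entry points to a dead heap tuple */
    }
    return false;               /* keep this index entry */
}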
/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * It would be very bad if this function cleaned a page while some other
 * backend was in the midst of scanning it, because hashgettuple assumes
 * that the next valid TID will be greater than or equal to the current
 * valid TID.  There can't be any concurrent scans in progress when we first
 * enter this function because of the cleanup lock we hold on the primary
 * bucket page, but as soon as we release that lock, there might be.  We
 * handle that by conspiring to prevent those scans from passing our cleanup
 * scan.  To do that, we lock the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
{
    BlockNumber blkno;
    Buffer buf;
    Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool bucket_dirty = false;

    blkno = bucket_blkno;
    buf = bucket_buf;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                                                        lowmask, maxbucket);

    /* Scan each page in bucket */
    for (;;)
    {
        HashPageOpaque opaque;
        OffsetNumber offno;
        OffsetNumber maxoffno;
        Buffer next_buf;
        Page page;
        OffsetNumber deletable[MaxOffsetNumber];
        int ndeletable = 0;
        bool retain_pin = false;
        bool clear_dead_marking = false;

        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
             offno <= maxoffno;
             offno = OffsetNumberNext(offno))
        {
            ItemPointer htup;
            IndexTuple itup;
            Bucket bucket;
            bool kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To remove the dead tuples, we strictly want to rely on results
             * of callback function.  refer btvacuumpage for detailed reason.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that are moved by split. */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask,
                                              lowmask);
                /* mark the item for deletion */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to either belong to current bucket or
                     * new_bucket.  This is ensured because we don't allow
                     * further splits from bucket that contains garbage. See
                     * comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;
        else
            retain_pin = false;

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            /* No ereport(ERROR) until changes are logged */
            START_CRIT_SECTION();

            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;

            /*
             * Let us mark the page as clean if vacuum removes the DEAD tuples
             * from an index page.  We do this by clearing
             * LH_PAGE_HAS_DEAD_TUPLES flag.
             */
            if (tuples_removed && *tuples_removed > 0 &&
                H_HAS_DEAD_TUPLES(opaque))
            {
                opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
                clear_dead_marking = true;
            }

            MarkBufferDirty(buf);

            /* XLOG stuff */
            if (RelationNeedsWAL(rel))
            {
                xl_hash_delete xlrec;
                XLogRecPtr recptr;

                xlrec.clear_dead_marking = clear_dead_marking;
                xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;

                XLogBeginInsert();
                XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

                /*
                 * bucket buffer needs to be registered to ensure that we can
                 * acquire a cleanup lock on it during replay.
                 */
                if (!xlrec.is_primary_bucket_page)
                    XLogRegisterBuffer(0, bucket_buf,
                                       REGBUF_STANDARD | REGBUF_NO_IMAGE);

                XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
                XLogRegisterBufData(1, (char *) deletable,
                                    ndeletable * sizeof(OffsetNumber));

                recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
                PageSetLSN(BufferGetPage(buf), recptr);
            }

            END_CRIT_SECTION();
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);

        /*
         * release the lock on previous page after acquiring the lock on next
         * page
         */
        if (retain_pin)
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;
    }

    /*
     * lock the bucket page to clear the garbage flag and squeeze the bucket.
     * if the current buffer is same as bucket buffer, then we already have
     * lock on bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * Clear the garbage flag from bucket after deleting the tuples that are
     * moved by split.  We purposefully clear the flag before squeeze bucket,
     * so that after restart, vacuum shouldn't again try to delete the moved
     * by split tuples.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;
        Page page;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
        MarkBufferDirty(bucket_buf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            XLogRecPtr recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
            PageSetLSN(page, recptr);
        }

        END_CRIT_SECTION();
    }

    /*
     * If we have deleted anything, try to compact free space.  For squeezing
     * the bucket, we must have a cleanup lock, else it can impact the
     * ordering of tuples for a scan that has started before it.
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    Bucket bucket;
    Buffer obuf;
    Buffer nbuf;
    BlockNumber oblkno;
    BlockNumber nblkno;
    bool null;
    Datum datum;
    HashItem hitem;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;
    IndexTuple itup;
    Size itemsz;
    OffsetNumber ooffnum;
    OffsetNumber noffnum;
    OffsetNumber omaxoffnum;
    Page opage;
    Page npage;
    TupleDesc itupdesc = RelationGetDescr(rel);

    /*
     * It should be okay to simultaneously write-lock pages from each
     * bucket, since no one else can be trying to acquire buffer lock
     * on pages of either bucket.
     */
    oblkno = start_oblkno;
    nblkno = start_nblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
    nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
    opage = BufferGetPage(obuf);
    npage = BufferGetPage(nbuf);

    _hash_checkpage(rel, opage, LH_BUCKET_PAGE);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    /* initialize the new bucket's primary page */
    _hash_pageinit(npage, BufferGetPageSize(nbuf));
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_filler = HASHO_FILL;

    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain
     * and adding overflow pages to the new bucket as needed.
     */
    ooffnum = FirstOffsetNumber;
    omaxoffnum = PageGetMaxOffsetNumber(opage);
    for (;;)
    {
        /*
         * at each iteration through this loop, each of these variables
         * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
         */

        /* check if we're at the end of the page */
        if (ooffnum > omaxoffnum)
        {
            /* at end of page, but check for an(other) overflow page */
            oblkno = oopaque->hasho_nextblkno;
            if (!BlockNumberIsValid(oblkno))
                break;

            /*
             * we ran out of tuples on this particular page, but we
             * have more overflow pages; advance to next page.
             */
            _hash_wrtbuf(rel, obuf);

            obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
            opage = BufferGetPage(obuf);
            _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
            oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
            ooffnum = FirstOffsetNumber;
            omaxoffnum = PageGetMaxOffsetNumber(opage);
            continue;
        }

        /*
         * Re-hash the tuple to determine which bucket it now belongs in.
         *
         * It is annoying to call the hash function while holding locks,
         * but releasing and relocking the page for each tuple is unappealing
         * too.
         */
        hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
        itup = &(hitem->hash_itup);
        datum = index_getattr(itup, 1, itupdesc, &null);
        Assert(!null);

        bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
                                      maxbucket, highmask, lowmask);

        if (bucket == nbucket)
        {
            /*
             * insert the tuple into the new bucket.  if it doesn't fit on
             * the current page in the new bucket, we must allocate a new
             * overflow page and place the tuple on that page instead.
             */
            itemsz = IndexTupleDSize(hitem->hash_itup)
                + (sizeof(HashItemData) - sizeof(IndexTupleData));

            itemsz = MAXALIGN(itemsz);

            if (PageGetFreeSpace(npage) < itemsz)
            {
                /* write out nbuf and drop lock, but keep pin */
                _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                /* chain to a new overflow page */
                nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                npage = BufferGetPage(nbuf);
                _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE);
                /* we don't need nopaque within the loop */
            }

            noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
            if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                     RelationGetRelationName(rel));

            /*
             * now delete the tuple from the old bucket.  after this
             * section of code, 'ooffnum' will actually point to the
             * ItemId to which we would point if we had advanced it before
             * the deletion (PageIndexTupleDelete repacks the ItemId
             * array).  this also means that 'omaxoffnum' is exactly one
             * less than it used to be, so we really can just decrement it
             * instead of calling PageGetMaxOffsetNumber.
             */
            PageIndexTupleDelete(opage, ooffnum);
            omaxoffnum = OffsetNumberPrev(omaxoffnum);
        }
        else
        {
            /*
             * the tuple stays on this page.  we didn't move anything, so
             * we didn't delete anything and therefore we don't have to
             * change 'omaxoffnum'.
             */
            Assert(bucket == obucket);
            ooffnum = OffsetNumberNext(ooffnum);
        }
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, obuf);
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno);
}
/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * It would be very bad if this function cleaned a page while some other
 * backend was in the midst of scanning it, because hashgettuple assumes
 * that the next valid TID will be greater than or equal to the current
 * valid TID.  There can't be any concurrent scans in progress when we first
 * enter this function because of the cleanup lock we hold on the primary
 * bucket page, but as soon as we release that lock, there might be.  We
 * handle that by conspiring to prevent those scans from passing our cleanup
 * scan.  To do that, we lock the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
{
    BlockNumber blkno;
    Buffer buf;
    Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool bucket_dirty = false;

    blkno = bucket_blkno;
    buf = bucket_buf;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                                                        lowmask, maxbucket);

    /* Scan each page in bucket */
    for (;;)
    {
        HashPageOpaque opaque;
        OffsetNumber offno;
        OffsetNumber maxoffno;
        Buffer next_buf;
        Page page;
        OffsetNumber deletable[MaxOffsetNumber];
        int ndeletable = 0;
        bool retain_pin = false;
        bool curr_page_dirty = false;

        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
             offno <= maxoffno;
             offno = OffsetNumberNext(offno))
        {
            ItemPointer htup;
            IndexTuple itup;
            Bucket bucket;
            bool kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To remove the dead tuples, we strictly want to rely on results
             * of callback function.  refer btvacuumpage for detailed reason.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that are moved by split. */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask,
                                              lowmask);
                /* mark the item for deletion */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to either belong to current bucket or
                     * new_bucket.  This is ensured because we don't allow
                     * further splits from bucket that contains garbage. See
                     * comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;
        else
            retain_pin = false;

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;
            curr_page_dirty = true;
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);

        /*
         * release the lock on previous page after acquiring the lock on next
         * page
         */
        if (curr_page_dirty)
        {
            if (retain_pin)
                _hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK);
            else
                _hash_wrtbuf(rel, buf);
            curr_page_dirty = false;
        }
        else if (retain_pin)
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;
    }

    /*
     * lock the bucket page to clear the garbage flag and squeeze the bucket.
     * if the current buffer is same as bucket buffer, then we already have
     * lock on bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        _hash_chgbufaccess(rel, bucket_buf, HASH_NOLOCK, HASH_WRITE);
    }

    /*
     * Clear the garbage flag from bucket after deleting the tuples that are
     * moved by split.  We purposefully clear the flag before squeeze bucket,
     * so that after restart, vacuum shouldn't again try to delete the moved
     * by split tuples.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;
        Page page;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    }

    /*
     * If we have deleted anything, try to compact free space.  For squeezing
     * the bucket, we must have a cleanup lock, else it can impact the
     * ordering of tuples for a scan that has started before it.
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        _hash_chgbufaccess(rel, bucket_buf, HASH_WRITE, HASH_NOLOCK);
}
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    BlockNumber oblkno;
    BlockNumber nblkno;
    Buffer obuf;
    Buffer nbuf;
    Page opage;
    Page npage;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;

    /*
     * It should be okay to simultaneously write-lock pages from each bucket,
     * since no one else can be trying to acquire buffer lock on pages of
     * either bucket.
     */
    oblkno = start_oblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE);
    opage = BufferGetPage(obuf);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    nblkno = start_nblkno;
    nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM);
    npage = BufferGetPage(nbuf);

    /* initialize the new bucket's primary page */
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_page_id = HASHO_PAGE_ID;

    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain and
     * adding overflow pages to the new bucket as needed.  Outer loop iterates
     * once per page in old bucket.
     */
    for (;;)
    {
        OffsetNumber ooffnum;
        OffsetNumber omaxoffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        int ndeletable = 0;

        /* Scan each tuple in old page */
        omaxoffnum = PageGetMaxOffsetNumber(opage);
        for (ooffnum = FirstOffsetNumber;
             ooffnum <= omaxoffnum;
             ooffnum = OffsetNumberNext(ooffnum))
        {
            IndexTuple itup;
            Size itemsz;
            Bucket bucket;

            /*
             * Fetch the item's hash key (conveniently stored in the item) and
             * determine which bucket it now belongs in.
             */
            itup = (IndexTuple) PageGetItem(opage,
                                            PageGetItemId(opage, ooffnum));
            bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                          maxbucket, highmask, lowmask);

            if (bucket == nbucket)
            {
                /*
                 * insert the tuple into the new bucket.  if it doesn't fit on
                 * the current page in the new bucket, we must allocate a new
                 * overflow page and place the tuple on that page instead.
                 */
                itemsz = IndexTupleDSize(*itup);
                itemsz = MAXALIGN(itemsz);

                if (PageGetFreeSpace(npage) < itemsz)
                {
                    /* write out nbuf and drop lock, but keep pin */
                    _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                    /* chain to a new overflow page */
                    nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                    npage = BufferGetPage(nbuf);
                    /* we don't need nblkno or nopaque within the loop */
                }

                /*
                 * Insert tuple on new page, using _hash_pgaddtup to ensure
                 * correct ordering by hashkey.  This is a tad inefficient
                 * since we may have to shuffle itempointers repeatedly.
                 * Possible future improvement: accumulate all the items for
                 * the new page and qsort them before insertion.
                 */
                (void) _hash_pgaddtup(rel, nbuf, itemsz, itup);

                /*
                 * Mark tuple for deletion from old page.
                 */
                deletable[ndeletable++] = ooffnum;
            }
            else
            {
                /*
                 * the tuple stays on this page, so nothing to do.
                 */
                Assert(bucket == obucket);
            }
        }

        oblkno = oopaque->hasho_nextblkno;

        /*
         * Done scanning this old page.  If we moved any tuples, delete them
         * from the old page.
         */
        if (ndeletable > 0)
        {
            PageIndexMultiDelete(opage, deletable, ndeletable);
            _hash_wrtbuf(rel, obuf);
        }
        else
            _hash_relbuf(rel, obuf);

        /* Exit loop if no more overflow pages in old bucket */
        if (!BlockNumberIsValid(oblkno))
            break;

        /* Else, advance to next old page */
        obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
        opage = BufferGetPage(obuf);
        oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}
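/*
 * Reference sketch (an assumption based on hashutil.c, not part of this
 * file): the destination bucket for each tuple during a split is decided
 * solely by its hash key and the current maxbucket/highmask/lowmask.  The
 * hypothetical helper below mirrors what _hash_hashkey2bucket effectively
 * does, so in _hash_splitbucket a tuple moves to 'nbucket' exactly when this
 * computation, using the post-split masks, yields the new bucket number;
 * otherwise it must still map to 'obucket'.
 */
static Bucket
example_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                       uint32 highmask, uint32 lowmask)
{
    Bucket bucket;

    /* take the low-order bits selected by the larger mask ... */
    bucket = hashkey & highmask;

    /* ... and fall back to the smaller mask if that bucket doesn't exist yet */
    if (bucket > maxbucket)
        bucket = bucket & lowmask;

    return bucket;
}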
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashbulkdelete(PG_FUNCTION_ARGS)
{
    Relation rel = (Relation) PG_GETARG_POINTER(0);
    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
    void *callback_state = (void *) PG_GETARG_POINTER(2);
    IndexBulkDeleteResult *result;
    BlockNumber num_pages;
    double tuples_removed;
    double num_index_tuples;
    double orig_ntuples;
    Bucket orig_maxbucket;
    Bucket cur_maxbucket;
    Bucket cur_bucket;
    Buffer metabuf;
    HashMetaPage metap;
    HashMetaPageData local_metapage;

    tuples_removed = 0;
    num_index_tuples = 0;

    /*
     * Read the metapage to fetch original bucket and tuple counts.  Also,
     * we keep a copy of the last-seen metapage so that we can use its
     * hashm_spares[] values to compute bucket page addresses.  This is a
     * bit hokey but perfectly safe, since the interesting entries in the
     * spares array cannot change under us; and it beats rereading the
     * metapage for each bucket.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
    metap = (HashMetaPage) BufferGetPage(metabuf);
    _hash_checkpage(rel, (Page) metap, LH_META_PAGE);
    orig_maxbucket = metap->hashm_maxbucket;
    orig_ntuples = metap->hashm_ntuples;
    memcpy(&local_metapage, metap, sizeof(local_metapage));
    _hash_relbuf(rel, metabuf);

    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

loop_top:
    while (cur_bucket <= cur_maxbucket)
    {
        BlockNumber bucket_blkno;
        BlockNumber blkno;
        bool bucket_dirty = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

        /* Exclusive-lock the bucket so we can shrink it */
        _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Shouldn't have any active scans locally, either */
        if (_hash_has_active_scan(rel, cur_bucket))
            elog(ERROR, "hash index has active scan during VACUUM");

        /* Scan each page in bucket */
        blkno = bucket_blkno;
        while (BlockNumberIsValid(blkno))
        {
            Buffer buf;
            Page page;
            HashPageOpaque opaque;
            OffsetNumber offno;
            OffsetNumber maxoffno;
            bool page_dirty = false;

            buf = _hash_getbuf(rel, blkno, HASH_WRITE);
            page = BufferGetPage(buf);
            _hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
            opaque = (HashPageOpaque) PageGetSpecialPointer(page);
            Assert(opaque->hasho_bucket == cur_bucket);

            /* Scan each tuple in page */
            offno = FirstOffsetNumber;
            maxoffno = PageGetMaxOffsetNumber(page);
            while (offno <= maxoffno)
            {
                HashItem hitem;
                ItemPointer htup;

                hitem = (HashItem) PageGetItem(page,
                                               PageGetItemId(page, offno));
                htup = &(hitem->hash_itup.t_tid);
                if (callback(htup, callback_state))
                {
                    /* delete the item from the page */
                    PageIndexTupleDelete(page, offno);
                    bucket_dirty = page_dirty = true;
                    /* don't increment offno, instead decrement maxoffno */
                    maxoffno = OffsetNumberPrev(maxoffno);
                    tuples_removed += 1;
                }
                else
                {
                    offno = OffsetNumberNext(offno);
                    num_index_tuples += 1;
                }
            }

            /*
             * Write page if needed, advance to next page.
             */
            blkno = opaque->hasho_nextblkno;

            if (page_dirty)
                _hash_wrtbuf(rel, buf);
            else
                _hash_relbuf(rel, buf);
        }

        /* If we deleted anything, try to compact free space */
        if (bucket_dirty)
            _hash_squeezebucket(rel, cur_bucket, bucket_blkno);

        /* Release bucket lock */
        _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Advance to next bucket */
        cur_bucket++;
    }

    /* Write-lock metapage and check for split since we started */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
    metap = (HashMetaPage) BufferGetPage(metabuf);
    _hash_checkpage(rel, (Page) metap, LH_META_PAGE);

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        cur_maxbucket = metap->hashm_maxbucket;
        memcpy(&local_metapage, metap, sizeof(local_metapage));
        _hash_relbuf(rel, metabuf);
        goto loop_top;
    }

    /* Okay, we're really done.  Update tuple count in metapage. */

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan,
         * so believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets.  Proceed by
         * dead-reckoning.
         */
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    _hash_wrtbuf(rel, metabuf);

    /* return statistics */
    num_pages = RelationGetNumberOfBlocks(rel);

    result = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    result->num_pages = num_pages;
    result->num_index_tuples = num_index_tuples;
    result->tuples_removed = tuples_removed;

    PG_RETURN_POINTER(result);
}