/* * _hash_addovflpage * * Add an overflow page to the bucket whose last page is pointed to by 'buf'. * * On entry, the caller must hold a pin but no lock on 'buf'. The pin is * dropped before exiting (we assume the caller is not interested in 'buf' * anymore). The returned overflow page will be pinned and write-locked; * it is guaranteed to be empty. * * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is returned in the same state. * * The caller must hold at least share lock on the bucket, to ensure that * no one else tries to compact the bucket meanwhile. This guarantees that * 'buf' won't stop being part of the bucket while it's unlocked. * * NB: since this could be executed concurrently by multiple processes, * one should not assume that the returned overflow page will be the * immediate successor of the originally passed 'buf'. Additional overflow * pages might have been added to the bucket chain in between. */ Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) { Buffer ovflbuf; Page page; Page ovflpage; HashPageOpaque pageopaque; HashPageOpaque ovflopaque; /* allocate and lock an empty overflow page */ ovflbuf = _hash_getovflpage(rel, metabuf); /* * Write-lock the tail page. It is okay to hold two buffer locks here * since there cannot be anyone else contending for access to ovflbuf. */ _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE); /* probably redundant... */ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); /* loop to find current tail page, in case someone else inserted too */ for (;;) { BlockNumber nextblkno; page = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); nextblkno = pageopaque->hasho_nextblkno; if (!BlockNumberIsValid(nextblkno)) break; /* we assume we do not need to write the unmodified page */ _hash_relbuf(rel, buf); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); } /* now that we have correct backlink, initialize new overflow page */ ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); ovflopaque->hasho_nextblkno = InvalidBlockNumber; ovflopaque->hasho_bucket = pageopaque->hasho_bucket; ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; ovflopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(ovflbuf); /* logically chain overflow page to previous page */ pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); _hash_wrtbuf(rel, buf); return ovflbuf; }
/* * Advance to next page in a bucket, if any. */ static void _hash_readnext(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep) { BlockNumber blkno; blkno = (*opaquep)->hasho_nextblkno; _hash_relbuf(rel, *bufp); *bufp = InvalidBuffer; if (BlockNumberIsValid(blkno)) { *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); *pagep = BufferGetPage(*bufp); *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); } }
/* * Advance to next page in a bucket, if any. */ static void _hash_readnext(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep) { BlockNumber blkno; blkno = (*opaquep)->hasho_nextblkno; _hash_relbuf(rel, *bufp); *bufp = InvalidBuffer; /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); if (BlockNumberIsValid(blkno)) { *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE); *pagep = BufferGetPage(*bufp); *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); } }
/* * Advance to previous page in a bucket, if any. */ static void _hash_readprev(Relation rel, Buffer *bufp, Page *pagep, HashPageOpaque *opaquep) { BlockNumber blkno; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; blkno = (*opaquep)->hasho_prevblkno; _hash_relbuf(rel, *bufp); *bufp = InvalidBuffer; /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); if (BlockNumberIsValid(blkno)) { *bufp = _hash_getbuf(rel, blkno, HASH_READ); _hash_checkpage(rel, *bufp, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); *pagep = BufferGetPage(*bufp); *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); } }
/* * _hash_freeovflpage() - * * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. * * Returns the block number of the page that followed the given page * in the bucket, or InvalidBlockNumber if no following page. * * NB: caller must not hold lock on metapage, nor on either page that's * adjacent in the bucket chain. The caller had better hold exclusive lock * on the bucket, too. */ BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy) { HashMetaPage metap; Buffer metabuf; Buffer mapbuf; BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; HashPageOpaque ovflopaque; Page ovflpage; Page mappage; uint32 *freep; uint32 ovflbitno; int32 bitmappage, bitmapbit; /*CS3223*/ int index; int bitIndexInElement; uint32 ovflElement; uint32 temp, temp2; int i; BlockNumber nextblkno_temp; HashPageOpaque pageopaque; Page page; uint32 *tempPointer; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); ovflblkno = BufferGetBlockNumber(ovflbuf); ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); nextblkno = ovflopaque->hasho_nextblkno; prevblkno = ovflopaque->hasho_prevblkno; bucket = ovflopaque->hasho_bucket; /*CS3223*/ /* find the length of the bucket chain*/ while (i>=0) { //nextblkno_temp; page = BufferGetPage(ovflbuf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); nextblkno_temp = pageopaque->hasho_nextblkno; if (!BlockNumberIsValid(nextblkno_temp)) break; /* we assume we do not need to write the unmodified page */ _hash_relbuf(rel, ovflbuf); ovflbuf = _hash_getbuf(rel, nextblkno_temp, HASH_WRITE, LH_OVERFLOW_PAGE); /*CS3223*/ i++; } /* * Zero the page for debugging's sake; then write and release it. (Note: * if we failed to zero the page here, we'd have problems with the Assert * in _hash_pageinit() when the page is reused.) */ MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf)); _hash_wrtbuf(rel, ovflbuf); /* * Fix up the bucket chain. this is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being * deleted. No concurrency issues since we hold exclusive lock on the * entire bucket. */ if (BlockNumberIsValid(prevblkno)) { Buffer prevbuf = _hash_getbuf_with_strategy(rel, prevblkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, bstrategy); Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); Assert(prevopaque->hasho_bucket == bucket); prevopaque->hasho_nextblkno = nextblkno; _hash_wrtbuf(rel, prevbuf); } if (BlockNumberIsValid(nextblkno)) { Buffer nextbuf = _hash_getbuf_with_strategy(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); Assert(nextopaque->hasho_bucket == bucket); nextopaque->hasho_prevblkno = prevblkno; _hash_wrtbuf(rel, nextbuf); } /*CS3223*/ if (i == 0) { //length of the bucket chain is 0, no overflow bucket for that primary bucket index = bucket / 32; bitIndexInElement = bucket % 32; ovflElement = metap->ovflBkts[index]; temp = ovflElement >> bitIndexInElement; temp -= 1; //bit changed from 1 to 0 temp2 = temp << bitIndexInElement; ovflElement = ovflElement | temp2; tempPointer = &(metap->ovflBkts[index]); *tempPointer = ovflElement; }
/* * _hash_freeovflpage() - * * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. * * Returns the block number of the page that followed the given page * in the bucket, or InvalidBlockNumber if no following page. * * NB: caller must not hold lock on metapage, nor on either page that's * adjacent in the bucket chain. The caller had better hold exclusive lock * on the bucket, too. */ BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy) { HashMetaPage metap; Buffer metabuf; Buffer mapbuf; BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; HashPageOpaque ovflopaque; Page ovflpage; Page mappage; uint32 *freep; uint32 ovflbitno; int32 bitmappage, bitmapbit; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); ovflblkno = BufferGetBlockNumber(ovflbuf); ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); nextblkno = ovflopaque->hasho_nextblkno; prevblkno = ovflopaque->hasho_prevblkno; bucket = ovflopaque->hasho_bucket; /* * Zero the page for debugging's sake; then write and release it. (Note: * if we failed to zero the page here, we'd have problems with the Assert * in _hash_pageinit() when the page is reused.) */ MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf)); _hash_wrtbuf(rel, ovflbuf); /* * Fix up the bucket chain. this is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being * deleted. No concurrency issues since we hold exclusive lock on the * entire bucket. */ if (BlockNumberIsValid(prevblkno)) { Buffer prevbuf = _hash_getbuf_with_strategy(rel, prevblkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, bstrategy); Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); Assert(prevopaque->hasho_bucket == bucket); prevopaque->hasho_nextblkno = nextblkno; _hash_wrtbuf(rel, prevbuf); } if (BlockNumberIsValid(nextblkno)) { Buffer nextbuf = _hash_getbuf_with_strategy(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); Assert(nextopaque->hasho_bucket == bucket); nextopaque->hasho_prevblkno = prevblkno; _hash_wrtbuf(rel, nextbuf); } /* Note: bstrategy is intentionally not used for metapage and bitmap */ /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* Identify which bit to set */ ovflbitno = blkno_to_bitno(metap, ovflblkno); bitmappage = ovflbitno >> BMPG_SHIFT(metap); bitmapbit = ovflbitno & BMPG_MASK(metap); if (bitmappage >= metap->hashm_nmaps) elog(ERROR, "invalid overflow bit number %u", ovflbitno); blkno = metap->hashm_mapp[bitmappage]; /* Release metapage lock while we access the bitmap page */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); /* Clear the bitmap bit to indicate that this overflow page is free */ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); Assert(ISSET(freep, bitmapbit)); CLRBIT(freep, bitmapbit); _hash_wrtbuf(rel, mapbuf); /* Get write-lock on metapage to update firstfree */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); /* if this is now the first free page, update hashm_firstfree */ if (ovflbitno < metap->hashm_firstfree) { metap->hashm_firstfree = ovflbitno; _hash_wrtbuf(rel, metabuf); } else { /* no need to change metapage */ _hash_relbuf(rel, metabuf); } return nextblkno; }
/* * _hash_first() -- Find the first item in a scan. * * Find the first item in the index that * satisfies the qualification associated with the scan descriptor. On * success, the page containing the current index tuple is read locked * and pinned, and the scan's opaque data entry is updated to * include the buffer. */ bool _hash_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; uint32 hashkey; Bucket bucket; BlockNumber blkno; Buffer buf; Buffer metabuf; Page page; HashPageOpaque opaque; HashMetaPage metap; IndexTuple itup; ItemPointer current; OffsetNumber offnum; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; pgstat_count_index_scan(rel); current = &(scan->currentItemData); ItemPointerSetInvalid(current); /* * We do not support hash scans with no index qualification, because we * would have to read the whole index rather than just one bucket. That * creates a whole raft of problems, since we haven't got a practical way * to lock all the buckets against splits or compactions. */ if (scan->numberOfKeys < 1) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("hash indexes do not support whole-index scans"))); /* * If the constant in the index qual is NULL, assume it cannot match any * items in the index. */ if (scan->keyData[0].sk_flags & SK_ISNULL) return false; /* * Okay to compute the hash key. We want to do this before acquiring any * locks, in case a user-defined hash function happens to be slow. */ hashkey = _hash_datum2hashkey(rel, scan->keyData[0].sk_argument); /* * Acquire shared split lock so we can compute the target bucket safely * (see README). */ _hash_getlock(rel, 0, HASH_SHARE); /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = (HashMetaPage) BufferGetPage(metabuf); /* * Compute the target bucket number, and convert to block number. */ bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); blkno = BUCKET_TO_BLKNO(metap, bucket); /* done with the metapage */ _hash_relbuf(rel, metabuf); /* * Acquire share lock on target bucket; then we can release split lock. */ _hash_getlock(rel, blkno, HASH_SHARE); _hash_droplock(rel, 0, HASH_SHARE); /* Update scan opaque state to show we have lock on the bucket */ so->hashso_bucket = bucket; so->hashso_bucket_valid = true; so->hashso_bucket_blkno = blkno; /* Fetch the primary bucket page for the bucket */ buf = _hash_getbuf(rel, blkno, HASH_READ); _hash_checkpage(rel, buf, LH_BUCKET_PAGE); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == bucket); /* If a backwards scan is requested, move to the end of the chain */ if (ScanDirectionIsBackward(dir)) { while (BlockNumberIsValid(opaque->hasho_nextblkno)) _hash_readnext(rel, &buf, &page, &opaque); } /* Now find the first tuple satisfying the qualification */ if (!_hash_step(scan, &buf, dir)) return false; /* if we're here, _hash_step found a valid tuple */ offnum = ItemPointerGetOffsetNumber(current); _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); scan->xs_ctup.t_self = itup->t_tid; return true; }
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket, and compress out any free space in the old * bucket. * * The caller must hold exclusive locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, BlockNumber start_oblkno, BlockNumber start_nblkno, uint32 maxbucket, uint32 highmask, uint32 lowmask) { Bucket bucket; Buffer obuf; Buffer nbuf; BlockNumber oblkno; BlockNumber nblkno; bool null; Datum datum; HashItem hitem; HashPageOpaque oopaque; HashPageOpaque nopaque; IndexTuple itup; Size itemsz; OffsetNumber ooffnum; OffsetNumber noffnum; OffsetNumber omaxoffnum; Page opage; Page npage; TupleDesc itupdesc = RelationGetDescr(rel); /* * It should be okay to simultaneously write-lock pages from each * bucket, since no one else can be trying to acquire buffer lock * on pages of either bucket. */ oblkno = start_oblkno; nblkno = start_nblkno; obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE); opage = BufferGetPage(obuf); npage = BufferGetPage(nbuf); _hash_checkpage(rel, opage, LH_BUCKET_PAGE); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* initialize the new bucket's primary page */ _hash_pageinit(npage, BufferGetPageSize(nbuf)); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = InvalidBlockNumber; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = nbucket; nopaque->hasho_flag = LH_BUCKET_PAGE; nopaque->hasho_filler = HASHO_FILL; /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain * and adding overflow pages to the new bucket as needed. */ ooffnum = FirstOffsetNumber; omaxoffnum = PageGetMaxOffsetNumber(opage); for (;;) { /* * at each iteration through this loop, each of these variables * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum */ /* check if we're at the end of the page */ if (ooffnum > omaxoffnum) { /* at end of page, but check for an(other) overflow page */ oblkno = oopaque->hasho_nextblkno; if (!BlockNumberIsValid(oblkno)) break; /* * we ran out of tuples on this particular page, but we * have more overflow pages; advance to next page. */ _hash_wrtbuf(rel, obuf); obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); opage = BufferGetPage(obuf); _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); ooffnum = FirstOffsetNumber; omaxoffnum = PageGetMaxOffsetNumber(opage); continue; } /* * Re-hash the tuple to determine which bucket it now belongs in. * * It is annoying to call the hash function while holding locks, * but releasing and relocking the page for each tuple is unappealing * too. */ hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum)); itup = &(hitem->hash_itup); datum = index_getattr(itup, 1, itupdesc, &null); Assert(!null); bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum), maxbucket, highmask, lowmask); if (bucket == nbucket) { /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleDSize(hitem->hash_itup) + (sizeof(HashItemData) - sizeof(IndexTupleData)); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpace(npage) < itemsz) { /* write out nbuf and drop lock, but keep pin */ _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf); npage = BufferGetPage(nbuf); _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE); /* we don't need nopaque within the loop */ } noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); /* * now delete the tuple from the old bucket. after this * section of code, 'ooffnum' will actually point to the * ItemId to which we would point if we had advanced it before * the deletion (PageIndexTupleDelete repacks the ItemId * array). this also means that 'omaxoffnum' is exactly one * less than it used to be, so we really can just decrement it * instead of calling PageGetMaxOffsetNumber. */ PageIndexTupleDelete(opage, ooffnum); omaxoffnum = OffsetNumberPrev(omaxoffnum); } else { /* * the tuple stays on this page. we didn't move anything, so * we didn't delete anything and therefore we don't have to * change 'omaxoffnum'. */ Assert(bucket == obucket); ooffnum = OffsetNumberNext(ooffnum); } } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Before quitting, call _hash_squeezebucket to ensure the * tuples remaining in the old bucket (including the overflow pages) are * packed as tightly as possible. The new bucket is already tight. */ _hash_wrtbuf(rel, obuf); _hash_wrtbuf(rel, nbuf); _hash_squeezebucket(rel, obucket, start_oblkno); }
/* * Helper function to perform deletion of index entries from a bucket. * * This function expects that the caller has acquired a cleanup lock on the * primary bucket page, and will return with a write lock again held on the * primary bucket page. The lock won't necessarily be held continuously, * though, because we'll release it when visiting overflow pages. * * It would be very bad if this function cleaned a page while some other * backend was in the midst of scanning it, because hashgettuple assumes * that the next valid TID will be greater than or equal to the current * valid TID. There can't be any concurrent scans in progress when we first * enter this function because of the cleanup lock we hold on the primary * bucket page, but as soon as we release that lock, there might be. We * handle that by conspiring to prevent those scans from passing our cleanup * scan. To do that, we lock the next page in the bucket chain before * releasing the lock on the previous page. (This type of lock chaining is * not ideal, so we might want to look for a better solution at some point.) * * We need to retain a pin on the primary bucket to ensure that no concurrent * split can start. */ void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state) { BlockNumber blkno; Buffer buf; Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; bool bucket_dirty = false; blkno = bucket_blkno; buf = bucket_buf; if (split_cleanup) new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, lowmask, maxbucket); /* Scan each page in bucket */ for (;;) { HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; Buffer next_buf; Page page; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; bool retain_pin = false; bool curr_page_dirty = false; vacuum_delay_point(); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { ItemPointer htup; IndexTuple itup; Bucket bucket; bool kill_tuple = false; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); /* * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. */ if (callback && callback(htup, callback_state)) { kill_tuple = true; if (tuples_removed) *tuples_removed += 1; } else if (split_cleanup) { /* delete the tuples that are moved by split. */ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); /* mark the item for deletion */ if (bucket != cur_bucket) { /* * We expect tuples to either belong to curent bucket or * new_bucket. This is ensured because we don't allow * further splits from bucket that contains garbage. See * comments in _hash_expandtable. */ Assert(bucket == new_bucket); kill_tuple = true; } } if (kill_tuple) { /* mark the item for deletion */ deletable[ndeletable++] = offno; } else { /* we're keeping it, so count it */ if (num_index_tuples) *num_index_tuples += 1; } } /* retain the pin on primary bucket page till end of bucket scan */ if (blkno == bucket_blkno) retain_pin = true; else retain_pin = false; blkno = opaque->hasho_nextblkno; /* * Apply deletions, advance to next page and write page if needed. */ if (ndeletable > 0) { PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; curr_page_dirty = true; } /* bail out if there are no more pages to scan. */ if (!BlockNumberIsValid(blkno)) break; next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); /* * release the lock on previous page after acquiring the lock on next * page */ if (curr_page_dirty) { if (retain_pin) _hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK); else _hash_wrtbuf(rel, buf); curr_page_dirty = false; } else if (retain_pin) _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); else _hash_relbuf(rel, buf); buf = next_buf; } /* * lock the bucket page to clear the garbage flag and squeeze the bucket. * if the current buffer is same as bucket buffer, then we already have * lock on bucket page. */ if (buf != bucket_buf) { _hash_relbuf(rel, buf); _hash_chgbufaccess(rel, bucket_buf, HASH_NOLOCK, HASH_WRITE); } /* * Clear the garbage flag from bucket after deleting the tuples that are * moved by split. We purposefully clear the flag before squeeze bucket, * so that after restart, vacuum shouldn't again try to delete the moved * by split tuples. */ if (split_cleanup) { HashPageOpaque bucket_opaque; Page page; page = BufferGetPage(bucket_buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; } /* * If we have deleted anything, try to compact free space. For squeezing * the bucket, we must have a cleanup lock, else it can impact the * ordering of tuples for a scan that has started before it. */ if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy); else _hash_chgbufaccess(rel, bucket_buf, HASH_WRITE, HASH_NOLOCK); }
/* * _hash_addovflpage * * Add an overflow page to the bucket whose last page is pointed to by 'buf'. * * On entry, the caller must hold a pin but no lock on 'buf'. The pin is * dropped before exiting (we assume the caller is not interested in 'buf' * anymore) if not asked to retain. The pin will be retained only for the * primary bucket. The returned overflow page will be pinned and * write-locked; it is guaranteed to be empty. * * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is returned in the same state. * * NB: since this could be executed concurrently by multiple processes, * one should not assume that the returned overflow page will be the * immediate successor of the originally passed 'buf'. Additional overflow * pages might have been added to the bucket chain in between. */ Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) { Buffer ovflbuf; Page page; Page ovflpage; HashPageOpaque pageopaque; HashPageOpaque ovflopaque; HashMetaPage metap; Buffer mapbuf = InvalidBuffer; Buffer newmapbuf = InvalidBuffer; BlockNumber blkno; uint32 orig_firstfree; uint32 splitnum; uint32 *freep = NULL; uint32 max_ovflpg; uint32 bit; uint32 bitmap_page_bit; uint32 first_page; uint32 last_bit; uint32 last_page; uint32 i, j; bool page_found = false; /* * Write-lock the tail page. Here, we need to maintain locking order such * that, first acquire the lock on tail page of bucket, then on meta page * to find and lock the bitmap page and if it is found, then lock on meta * page is released, then finally acquire the lock on new overflow buffer. * We need this locking order to avoid deadlock with backends that are * doing inserts. * * Note: We could have avoided locking many buffers here if we made two * WAL records for acquiring an overflow page (one to allocate an overflow * page and another to add it to overflow bucket chain). However, doing * so can leak an overflow page, if the system crashes after allocation. * Needless to say, it is better to have a single record from a * performance point of view as well. */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* probably redundant... */ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); /* loop to find current tail page, in case someone else inserted too */ for (;;) { BlockNumber nextblkno; page = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); nextblkno = pageopaque->hasho_nextblkno; if (!BlockNumberIsValid(nextblkno)) break; /* we assume we do not need to write the unmodified page */ if (retain_pin) { /* pin will be retained only for the primary bucket page */ Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else _hash_relbuf(rel, buf); retain_pin = false; buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); } /* Get exclusive lock on the meta page */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* start search at hashm_firstfree */ orig_firstfree = metap->hashm_firstfree; first_page = orig_firstfree >> BMPG_SHIFT(metap); bit = orig_firstfree & BMPG_MASK(metap); i = first_page; j = bit / BITS_PER_MAP; bit &= ~(BITS_PER_MAP - 1); /* outer loop iterates once per bitmap page */ for (;;) { BlockNumber mapblkno; Page mappage; uint32 last_inpage; /* want to end search with the last existing overflow page */ splitnum = metap->hashm_ovflpoint; max_ovflpg = metap->hashm_spares[splitnum] - 1; last_page = max_ovflpg >> BMPG_SHIFT(metap); last_bit = max_ovflpg & BMPG_MASK(metap); if (i > last_page) break; Assert(i < metap->hashm_nmaps); mapblkno = metap->hashm_mapp[i]; if (i == last_page) last_inpage = last_bit; else last_inpage = BMPGSZ_BIT(metap) - 1; /* Release exclusive lock on metapage while reading bitmap page */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) { if (freep[j] != ALL_SET) { page_found = true; /* Reacquire exclusive lock on the meta page */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* convert bit to bit number within page */ bit += _hash_firstfreebit(freep[j]); bitmap_page_bit = bit; /* convert bit to absolute bit number */ bit += (i << BMPG_SHIFT(metap)); /* Calculate address of the recycled overflow page */ blkno = bitno_to_blkno(metap, bit); /* Fetch and init the recycled page */ ovflbuf = _hash_getinitbuf(rel, blkno); goto found; } } /* No free space here, try to advance to next map page */ _hash_relbuf(rel, mapbuf); mapbuf = InvalidBuffer; i++; j = 0; /* scan from start of next map page */ bit = 0; /* Reacquire exclusive lock on the meta page */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); } /* * No free pages --- have to extend the relation to add an overflow page. * First, check to see if we have to add a new bitmap page too. */ if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1)) { /* * We create the new bitmap page with all pages marked "in use". * Actually two pages in the new bitmap's range will exist * immediately: the bitmap page itself, and the following page which * is the one we return to the caller. Both of these are correctly * marked "in use". Subsequent pages do not exist yet, but it is * convenient to pre-mark them as "in use" too. */ bit = metap->hashm_spares[splitnum]; /* metapage already has a write lock */ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("out of overflow pages in hash index \"%s\"", RelationGetRelationName(rel)))); newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM); } else { /* * Nothing to do here; since the page will be past the last used page, * we know its bitmap bit was preinitialized to "in use". */ } /* Calculate address of the new overflow page */ bit = BufferIsValid(newmapbuf) ? metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum]; blkno = bitno_to_blkno(metap, bit); /* * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the * relation length stays in sync with ours. XXX It's annoying to do this * with metapage write lock held; would be better to use a lock that * doesn't block incoming searches. * * It is okay to hold two buffer locks here (one on tail page of bucket * and other on new overflow page) since there cannot be anyone else * contending for access to ovflbuf. */ ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); found: /* * Do the update. No ereport(ERROR) until changes are logged. We want to * log the changes for bitmap page and overflow page together to avoid * loss of pages in case the new page is added. */ START_CRIT_SECTION(); if (page_found) { Assert(BufferIsValid(mapbuf)); /* mark page "in use" in the bitmap */ SETBIT(freep, bitmap_page_bit); MarkBufferDirty(mapbuf); } else { /* update the count to indicate new overflow page is added */ metap->hashm_spares[splitnum]++; if (BufferIsValid(newmapbuf)) { _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false); MarkBufferDirty(newmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf); metap->hashm_nmaps++; metap->hashm_spares[splitnum]++; } MarkBufferDirty(metabuf); /* * for new overflow page, we don't need to explicitly set the bit in * bitmap page, as by default that will be set to "in use". */ } /* * Adjust hashm_firstfree to avoid redundant searches. But don't risk * changing it if someone moved it while we were searching bitmap pages. */ if (metap->hashm_firstfree == orig_firstfree) { metap->hashm_firstfree = bit + 1; MarkBufferDirty(metabuf); } /* initialize new overflow page */ ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); ovflopaque->hasho_nextblkno = InvalidBlockNumber; ovflopaque->hasho_bucket = pageopaque->hasho_bucket; ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; ovflopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(ovflbuf); /* logically chain overflow page to previous page */ pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); MarkBufferDirty(buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; xl_hash_add_ovfl_page xlrec; xlrec.bmpage_found = page_found; xlrec.bmsize = metap->hashm_bmsize; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage); XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT); XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket)); XLogRegisterBuffer(1, buf, REGBUF_STANDARD); if (BufferIsValid(mapbuf)) { XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32)); } if (BufferIsValid(newmapbuf)) XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT); XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD); XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32)); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE); PageSetLSN(BufferGetPage(ovflbuf), recptr); PageSetLSN(BufferGetPage(buf), recptr); if (BufferIsValid(mapbuf)) PageSetLSN(BufferGetPage(mapbuf), recptr); if (BufferIsValid(newmapbuf)) PageSetLSN(BufferGetPage(newmapbuf), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); if (retain_pin) LockBuffer(buf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, buf); if (BufferIsValid(mapbuf)) _hash_relbuf(rel, mapbuf); LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newmapbuf)) _hash_relbuf(rel, newmapbuf); return ovflbuf; }
static void btree_xlog_vacuum(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; Buffer buffer; Page page; BTPageOpaque opaque; #ifdef UNUSED xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); /* * This section of code is thought to be no longer needed, after analysis * of the calling paths. It is retained to allow the code to be reinstated * if a flaw is revealed in that thinking. * * If we are running non-MVCC scans using this index we need to do some * additional work to ensure correctness, which is known as a "pin scan" * described in more detail in next paragraphs. We used to do the extra * work in all cases, whereas we now avoid that work in most cases. If * lastBlockVacuumed is set to InvalidBlockNumber then we skip the * additional work required for the pin scan. * * Avoiding this extra work is important since it requires us to touch * every page in the index, so is an O(N) operation. Worse, it is an * operation performed in the foreground during redo, so it delays * replication directly. * * If queries might be active then we need to ensure every leaf page is * unpinned between the lastBlockVacuumed and the current block, if there * are any. This prevents replay of the VACUUM from reaching the stage of * removing heap tuples while there could still be indexscans "in flight" * to those particular tuples for those scans which could be confused by * finding new tuples at the old TID locations (see nbtree/README). * * It might be worth checking if there are actually any backends running; * if not, we could just skip this. * * Since VACUUM can visit leaf pages out-of-order, it might issue records * with lastBlockVacuumed >= block; that's not an error, it just means * nothing to do now. * * Note: since we touch all pages in the range, we will lock non-leaf * pages, and also any empty (all-zero) pages that may be in the index. It * doesn't seem worth the complexity to avoid that. But it's important * that HotStandbyActiveInReplay() will not return true if the database * isn't yet consistent; so we need not fear reading still-corrupt blocks * here during crash recovery. */ if (HotStandbyActiveInReplay() && BlockNumberIsValid(xlrec->lastBlockVacuumed)) { RelFileNode thisrnode; BlockNumber thisblkno; BlockNumber blkno; XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno); for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) { /* * We use RBM_NORMAL_NO_LOG mode because it's not an error * condition to see all-zero pages. The original btvacuumpage * scan would have skipped over all-zero pages, noting them in FSM * but not bothering to initialize them just yet; so we mustn't * throw an error here. (We could skip acquiring the cleanup lock * if PageIsNew, but it's probably not worth the cycles to test.) * * XXX we don't actually need to read the block, we just need to * confirm it is unpinned. If we had a special call into the * buffer manager we could optimise this so that if the block is * not in shared_buffers we confirm it as unpinned. Optimizing * this is now moot, since in most cases we avoid the scan. */ buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, RBM_NORMAL_NO_LOG); if (BufferIsValid(buffer)) { LockBufferForCleanup(buffer); UnlockReleaseBuffer(buffer); } } } #endif /* * Like in btvacuumpage(), we need to take a cleanup lock on every leaf * page. See nbtree/README for details. */ if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 0, &len); page = (Page) BufferGetPage(buffer); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items --- see comments * in _bt_delitems_vacuum(). */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * _hash_squeezebucket(rel, bucket) * * Try to squeeze the tuples onto pages occurring earlier in the * bucket chain in an attempt to free overflow pages. When we start * the "squeezing", the page from which we start taking tuples (the * "read" page) is the last bucket in the bucket chain and the page * onto which we start squeezing tuples (the "write" page) is the * first page in the bucket chain. The read page works backward and * the write page works forward; the procedure terminates when the * read page and write page are the same page. * * At completion of this procedure, it is guaranteed that all pages in * the bucket are nonempty, unless the bucket is totally empty (in * which case all overflow pages will be freed). The original implementation * required that to be true on entry as well, but it's a lot easier for * callers to leave empty overflow pages and let this guy clean it up. * * Caller must hold exclusive lock on the target bucket. This allows * us to safely lock multiple pages in the bucket. */ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno) { Buffer wbuf; Buffer rbuf = 0; BlockNumber wblkno; BlockNumber rblkno; Page wpage; Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; OffsetNumber woffnum; OffsetNumber roffnum; IndexTuple itup; Size itemsz; /* * start squeezing into the base bucket page. */ wblkno = bucket_blkno; wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE); _hash_checkpage(rel, wbuf, LH_BUCKET_PAGE); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); /* * if there aren't any overflow pages, there's nothing to squeeze. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) { _hash_relbuf(rel, wbuf); return; } /* * find the last page in the bucket chain by starting at the base bucket * page and working forward. */ ropaque = wopaque; do { rblkno = ropaque->hasho_nextblkno; if (ropaque != wopaque) _hash_relbuf(rel, rbuf); rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE); _hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); /* * squeeze the tuples. */ roffnum = FirstOffsetNumber; for (;;) { /* this test is needed in case page is empty on entry */ if (roffnum <= PageGetMaxOffsetNumber(rpage)) { itup = (IndexTuple) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* * Walk up the bucket chain, looking for a page big enough for * this item. Exit if we reach the read page. */ while (PageGetFreeSpace(wpage) < itemsz) { Assert(!PageIsEmpty(wpage)); wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); _hash_wrtbuf(rel, wbuf); if (rblkno == wblkno) { /* wbuf is already released */ _hash_wrtbuf(rel, rbuf); return; } wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE); _hash_checkpage(rel, wbuf, LH_OVERFLOW_PAGE); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); } /* * we have found room so insert on the "write" page. */ woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage)); if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); /* * delete the tuple from the "read" page. PageIndexTupleDelete * repacks the ItemId array, so 'roffnum' will be "advanced" to * the "next" ItemId. */ PageIndexTupleDelete(rpage, roffnum); } /* * if the "read" page is now empty because of the deletion (or because * it was empty when we got to it), free it. * * Tricky point here: if our read and write pages are adjacent in the * bucket chain, our write lock on wbuf will conflict with * _hash_freeovflpage's attempt to update the sibling links of the * removed page. However, in that case we are done anyway, so we can * simply drop the write lock before calling _hash_freeovflpage. */ if (PageIsEmpty(rpage)) { rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); /* are we freeing the page adjacent to wbuf? */ if (rblkno == wblkno) { /* yes, so release wbuf lock first */ _hash_wrtbuf(rel, wbuf); /* free this overflow page (releases rbuf) */ _hash_freeovflpage(rel, rbuf); /* done */ return; } /* free this overflow page, then get the previous one */ _hash_freeovflpage(rel, rbuf); rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE); _hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); roffnum = FirstOffsetNumber; } } /* NOTREACHED */ }
/* * _hash_doinsert() -- Handle insertion of a single index tuple. * * This routine is called by the public interface routines, hashbuild * and hashinsert. By here, itup is completely filled in. */ void _hash_doinsert(Relation rel, IndexTuple itup) { Buffer buf; Buffer metabuf; HashMetaPage metap; BlockNumber blkno; Page page; HashPageOpaque pageopaque; Size itemsz; bool do_expand; uint32 hashkey; Bucket bucket; /* * Get the hash key for the item (it's stored in the index tuple itself). */ hashkey = _hash_get_indextuple_hashkey(itup); /* compute item size too */ itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ /* * Acquire shared split lock so we can compute the target bucket safely * (see README). */ _hash_getlock(rel, 0, HASH_SHARE); /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check whether the item can fit on a hash page at all. (Eventually, we * ought to try to apply TOAST methods if not.) Note that at this point, * itemsz doesn't include the ItemId. * * XXX this is useless code if we are only storing hash keys. */ if (itemsz > HashMaxItemSize((Page) metap)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds hash maximum %lu", (unsigned long) itemsz, (unsigned long) HashMaxItemSize((Page) metap)), errhint("Values larger than a buffer page cannot be indexed."))); /* * Compute the target bucket number, and convert to block number. */ bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); blkno = BUCKET_TO_BLKNO(metap, bucket); /* release lock on metapage, but keep pin since we'll need it again */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); /* * Acquire share lock on target bucket; then we can release split lock. */ _hash_getlock(rel, blkno, HASH_SHARE); _hash_droplock(rel, 0, HASH_SHARE); /* Fetch the primary bucket page for the bucket */ buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE); page = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(pageopaque->hasho_bucket == bucket); /* Do the insertion */ while (PageGetFreeSpace(page) < itemsz) { /* * no space on this page; check for an overflow page */ BlockNumber nextblkno = pageopaque->hasho_nextblkno; if (BlockNumberIsValid(nextblkno)) { /* * ovfl page exists; go get it. if it doesn't have room, we'll * find out next pass through the loop test above. */ _hash_relbuf(rel, buf); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); page = BufferGetPage(buf); } else { /* * we're at the end of the bucket chain and we haven't found a * page with enough room. allocate a new overflow page. */ /* release our write lock without modifying buffer */ _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); /* chain to a new overflow page */ buf = _hash_addovflpage(rel, metabuf, buf); page = BufferGetPage(buf); /* should fit now, given test above */ Assert(PageGetFreeSpace(page) >= itemsz); } pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE); Assert(pageopaque->hasho_bucket == bucket); } /* found page with enough space, so add the item here */ (void) _hash_pgaddtup(rel, buf, itemsz, itup); /* write and release the modified page */ _hash_wrtbuf(rel, buf); /* We can drop the bucket lock now */ _hash_droplock(rel, blkno, HASH_SHARE); /* * Write-lock the metapage so we can increment the tuple count. After * incrementing it, check to see if it's time for a split. */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ do_expand = metap->hashm_ntuples > (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* Attempt to split if a split is needed */ if (do_expand) _hash_expandtable(rel, metabuf); /* Finally drop our pin on the metapage */ _hash_dropbuf(rel, metabuf); }
/* * _hash_finish_split() -- Finish the previously interrupted split operation * * To complete the split operation, we form the hash table of TIDs in new * bucket which is then used by split operation to skip tuples that are * already moved before the split operation was previously interrupted. * * The caller must hold a pin, but no lock, on the metapage and old bucket's * primary page buffer. The buffers are returned in the same state. (The * metapage is only touched if it becomes necessary to add or remove overflow * pages.) */ void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, uint32 maxbucket, uint32 highmask, uint32 lowmask) { HASHCTL hash_ctl; HTAB *tidhtab; Buffer bucket_nbuf = InvalidBuffer; Buffer nbuf; Page npage; BlockNumber nblkno; BlockNumber bucket_nblkno; HashPageOpaque npageopaque; Bucket nbucket; bool found; /* Initialize hash tables used to track TIDs */ memset(&hash_ctl, 0, sizeof(hash_ctl)); hash_ctl.keysize = sizeof(ItemPointerData); hash_ctl.entrysize = sizeof(ItemPointerData); hash_ctl.hcxt = CurrentMemoryContext; tidhtab = hash_create("bucket ctids", 256, /* arbitrary initial size */ &hash_ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket); /* * Scan the new bucket and build hash table of TIDs */ for (;;) { OffsetNumber noffnum; OffsetNumber nmaxoffnum; nbuf = _hash_getbuf(rel, nblkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); /* remember the primary bucket buffer to acquire cleanup lock on it. */ if (nblkno == bucket_nblkno) bucket_nbuf = nbuf; npage = BufferGetPage(nbuf); npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); /* Scan each tuple in new page */ nmaxoffnum = PageGetMaxOffsetNumber(npage); for (noffnum = FirstOffsetNumber; noffnum <= nmaxoffnum; noffnum = OffsetNumberNext(noffnum)) { IndexTuple itup; /* Fetch the item's TID and insert it in hash table. */ itup = (IndexTuple) PageGetItem(npage, PageGetItemId(npage, noffnum)); (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found); Assert(!found); } nblkno = npageopaque->hasho_nextblkno; /* * release our write lock without modifying buffer and ensure to * retain the pin on primary bucket. */ if (nbuf == bucket_nbuf) LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, nbuf); /* Exit loop if no more overflow pages in new bucket */ if (!BlockNumberIsValid(nblkno)) break; } /* * Conditionally get the cleanup lock on old and new buckets to perform * the split operation. If we don't get the cleanup locks, silently give * up and next insertion on old bucket will try again to complete the * split. */ if (!ConditionalLockBufferForCleanup(obuf)) { hash_destroy(tidhtab); return; } if (!ConditionalLockBufferForCleanup(bucket_nbuf)) { LockBuffer(obuf, BUFFER_LOCK_UNLOCK); hash_destroy(tidhtab); return; } npage = BufferGetPage(bucket_nbuf); npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nbucket = npageopaque->hasho_bucket; _hash_splitbucket(rel, metabuf, obucket, nbucket, obuf, bucket_nbuf, tidhtab, maxbucket, highmask, lowmask); _hash_dropbuf(rel, bucket_nbuf); hash_destroy(tidhtab); }
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * This routine is used to partition the tuples between old and new bucket and * is used to finish the incomplete split operations. To finish the previously * interrupted split operation, the caller needs to fill htab. If htab is set, * then we skip the movement of tuples that exists in htab, otherwise NULL * value of htab indicates movement of all the tuples that belong to the new * bucket. * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket. * * The caller must hold cleanup locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) * * Split needs to retain pin on primary bucket pages of both old and new * buckets till end of operation. This is to prevent vacuum from starting * while a split is in progress. * * In addition, the caller must have created the new bucket's base page, * which is passed in buffer nbuf, pinned and write-locked. The lock will be * released here and pin must be released by the caller. (The API is set up * this way because we must do _hash_getnewbuf() before releasing the metapage * write lock. So instead of passing the new bucket's start block number, we * pass an actual buffer.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, Buffer obuf, Buffer nbuf, HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask) { Buffer bucket_obuf; Buffer bucket_nbuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; IndexTuple itups[MaxIndexTuplesPerPage]; Size all_tups_size = 0; int i; uint16 nitups = 0; bucket_obuf = obuf; opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); bucket_nbuf = nbuf; npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain and * adding overflow pages to the new bucket as needed. Outer loop iterates * once per page in old bucket. */ for (;;) { BlockNumber oblkno; OffsetNumber ooffnum; OffsetNumber omaxoffnum; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); for (ooffnum = FirstOffsetNumber; ooffnum <= omaxoffnum; ooffnum = OffsetNumberNext(ooffnum)) { IndexTuple itup; Size itemsz; Bucket bucket; bool found = false; /* skip dead tuples */ if (ItemIdIsDead(PageGetItemId(opage, ooffnum))) continue; /* * Before inserting a tuple, probe the hash table containing TIDs * of tuples belonging to new bucket, if we find a match, then * skip that tuple, else fetch the item's hash key (conveniently * stored in the item) and determine which bucket it now belongs * in. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); if (htab) (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); if (found) continue; bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) { IndexTuple new_itup; /* * make a copy of index tuple as we have to scribble on it. */ new_itup = CopyIndexTuple(itup); /* * mark the index tuple as moved by split, such tuples are * skipped by scan if there is split in progress for a bucket. */ new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleDSize(*new_itup); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) { /* * Change the shared buffer state in critical section, * otherwise any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); /* drop lock, but keep pin */ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); nitups = 0; all_tups_size = 0; /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false); npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); } itups[nitups++] = new_itup; all_tups_size += itemsz; } else { /* * the tuple stays on this page, so nothing to do. */ Assert(bucket == obucket); } } oblkno = oopaque->hasho_nextblkno; /* retain the pin on the old primary bucket */ if (obuf == bucket_obuf) LockBuffer(obuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, obuf); /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) { /* * Change the shared buffer state in critical section, otherwise * any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); if (nbuf == bucket_nbuf) LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, nbuf); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); break; } /* Else, advance to next old page */ obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Mark the old and new buckets to indicate split is * finished. * * To avoid deadlocks due to locking order of buckets, first lock the old * bucket and then the new bucket. */ LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); opage = BufferGetPage(bucket_obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE); npage = BufferGetPage(bucket_nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); START_CRIT_SECTION(); oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; /* * After the split is finished, mark the old bucket to indicate that it * contains deletable tuples. We will clear split-cleanup flag after * deleting such tuples either at the end of split or at the next split * from old bucket or at the time of vacuum. */ oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP; /* * now write the buffers, here we don't release the locks as caller is * responsible to release locks. */ MarkBufferDirty(bucket_obuf); MarkBufferDirty(bucket_nbuf); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; xl_hash_split_complete xlrec; xlrec.old_bucket_flag = oopaque->hasho_flag; xlrec.new_bucket_flag = nopaque->hasho_flag; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); PageSetLSN(BufferGetPage(bucket_obuf), recptr); PageSetLSN(BufferGetPage(bucket_nbuf), recptr); } END_CRIT_SECTION(); /* * If possible, clean up the old bucket. We might not be able to do this * if someone else has a pin on it, but if not then we can go ahead. This * isn't absolutely necessary, but it reduces bloat; if we don't do it * now, VACUUM will do it eventually, but maybe not until new overflow * pages have been allocated. Note that there's no need to clean up the * new bucket. */ if (IsBufferCleanupOK(bucket_obuf)) { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); hashbucketcleanup(rel, obucket, bucket_obuf, BufferGetBlockNumber(bucket_obuf), NULL, maxbucket, highmask, lowmask, NULL, NULL, true, NULL, NULL); } else { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK); } }
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket, and compress out any free space in the old * bucket. * * The caller must hold exclusive locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, BlockNumber start_oblkno, BlockNumber start_nblkno, uint32 maxbucket, uint32 highmask, uint32 lowmask) { BlockNumber oblkno; BlockNumber nblkno; Buffer obuf; Buffer nbuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; /* * It should be okay to simultaneously write-lock pages from each bucket, * since no one else can be trying to acquire buffer lock on pages of * either bucket. */ oblkno = start_oblkno; obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); nblkno = start_nblkno; nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM); npage = BufferGetPage(nbuf); /* initialize the new bucket's primary page */ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = InvalidBlockNumber; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = nbucket; nopaque->hasho_flag = LH_BUCKET_PAGE; nopaque->hasho_page_id = HASHO_PAGE_ID; /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain and * adding overflow pages to the new bucket as needed. Outer loop iterates * once per page in old bucket. */ for (;;) { OffsetNumber ooffnum; OffsetNumber omaxoffnum; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); for (ooffnum = FirstOffsetNumber; ooffnum <= omaxoffnum; ooffnum = OffsetNumberNext(ooffnum)) { IndexTuple itup; Size itemsz; Bucket bucket; /* * Fetch the item's hash key (conveniently stored in the item) and * determine which bucket it now belongs in. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) { /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpace(npage) < itemsz) { /* write out nbuf and drop lock, but keep pin */ _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf); npage = BufferGetPage(nbuf); /* we don't need nblkno or nopaque within the loop */ } /* * Insert tuple on new page, using _hash_pgaddtup to ensure * correct ordering by hashkey. This is a tad inefficient * since we may have to shuffle itempointers repeatedly. * Possible future improvement: accumulate all the items for * the new page and qsort them before insertion. */ (void) _hash_pgaddtup(rel, nbuf, itemsz, itup); /* * Mark tuple for deletion from old page. */ deletable[ndeletable++] = ooffnum; } else { /* * the tuple stays on this page, so nothing to do. */ Assert(bucket == obucket); } } oblkno = oopaque->hasho_nextblkno; /* * Done scanning this old page. If we moved any tuples, delete them * from the old page. */ if (ndeletable > 0) { PageIndexMultiDelete(opage, deletable, ndeletable); _hash_wrtbuf(rel, obuf); } else _hash_relbuf(rel, obuf); /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) break; /* Else, advance to next old page */ obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Before quitting, call _hash_squeezebucket to ensure the * tuples remaining in the old bucket (including the overflow pages) are * packed as tightly as possible. The new bucket is already tight. */ _hash_wrtbuf(rel, nbuf); _hash_squeezebucket(rel, obucket, start_oblkno, NULL); }
/* * Update tuple origtup (size origsz), located in offset oldoff of buffer * oldbuf, to newtup (size newsz) as summary tuple for the page range starting * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit. * * If samepage is true, attempt to put the new tuple in the same page, but if * there's no room, use some other one. * * If the update is successful, return true; the revmap is updated to point to * the new tuple. If the update is not done for whatever reason, return false. * Caller may retry the update if this happens. */ bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage) { Page oldpage; ItemId oldlp; BrinTuple *oldtup; Size oldsz; Buffer newbuf; bool extended; Assert(newsz == MAXALIGN(newsz)); /* If the item is oversized, don't bother. */ if (newsz > BrinMaxItemSize) { ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsz, (unsigned long) BrinMaxItemSize, RelationGetRelationName(idxrel)))); return false; /* keep compiler quiet */ } /* make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); if (!samepage) { /* need a page on which to put the item */ newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended); if (!BufferIsValid(newbuf)) { Assert(!extended); return false; } /* * Note: it's possible (though unlikely) that the returned newbuf is * the same as oldbuf, if brin_getinsertbuffer determined that the old * buffer does in fact have enough space. */ if (newbuf == oldbuf) { Assert(!extended); newbuf = InvalidBuffer; } } else { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); newbuf = InvalidBuffer; extended = false; } oldpage = BufferGetPage(oldbuf); oldlp = PageGetItemId(oldpage, oldoff); /* * Check that the old tuple wasn't updated concurrently: it might have * moved someplace else entirely ... */ if (!ItemIdIsNormal(oldlp)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* * If this happens, and the new buffer was obtained by extending the * relation, then we need to ensure we don't leave it uninitialized or * forget about it. */ if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } oldsz = ItemIdGetLength(oldlp); oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp); /* * ... or it might have been updated in place to different contents. */ if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } /* * Great, the old tuple is intact. We can proceed with the update. * * If there's enough room in the old page for the new tuple, replace it. * * Note that there might now be enough space on the page even though the * caller told us there isn't, if a concurrent update moved another tuple * elsewhere or replaced a tuple with a smaller one. */ if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) && brin_can_do_samepage_update(oldbuf, origsz, newsz)) { if (BufferIsValid(newbuf)) { /* as above */ if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); } START_CRIT_SECTION(); if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz)) elog(ERROR, "failed to replace BRIN tuple"); MarkBufferDirty(oldbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_samepage_update xlrec; XLogRecPtr recptr; uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE; xlrec.offnum = oldoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate); XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) newtup, newsz); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); } END_CRIT_SECTION(); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (extended) FreeSpaceMapVacuum(idxrel); return true; } else if (newbuf == InvalidBuffer) { /* * Not enough space, but caller said that there was. Tell them to * start over. */ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); return false; } else { /* * Not enough free space on the oldpage. Put the new tuple on the new * page, and update the revmap. */ Page newpage = BufferGetPage(newbuf); Buffer revmapbuf; ItemPointerData newtid; OffsetNumber newoff; BlockNumber newblk = InvalidBlockNumber; Size freespace = 0; revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); START_CRIT_SECTION(); /* * We need to initialize the page if it's newly obtained. Note we * will WAL-log the initialization as part of the update, so we don't * need to do that here. */ if (extended) brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR); PageIndexTupleDeleteNoCompact(oldpage, oldoff); newoff = PageAddItem(newpage, (Item) newtup, newsz, InvalidOffsetNumber, false, false); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple to new page"); MarkBufferDirty(oldbuf); MarkBufferDirty(newbuf); /* needed to update FSM below */ if (extended) { newblk = BufferGetBlockNumber(newbuf); freespace = br_page_get_freespace(newpage); } ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_update xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.insert.offnum = newoff; xlrec.insert.heapBlk = heapBlk; xlrec.insert.pagesPerRange = pagesPerRange; xlrec.oldOffnum = oldoff; XLogBeginInsert(); /* new page */ XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate); XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) newtup, newsz); /* revmap page */ XLogRegisterBuffer(1, revmapbuf, 0); /* old page */ XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); PageSetLSN(newpage, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(newbuf); if (extended) { Assert(BlockNumberIsValid(newblk)); RecordPageWithFreeSpace(idxrel, newblk, freespace); FreeSpaceMapVacuum(idxrel); } return true; } }
/* * _hash_freeovflpage() - * * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. * * Add the tuples (itups) to wbuf in this function. We could do that in the * caller as well, but the advantage of doing it here is we can easily write * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and * removal of overflow page has to done as an atomic operation, otherwise * during replay on standby users might find duplicate records. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. * * Returns the block number of the page that followed the given page * in the bucket, or InvalidBlockNumber if no following page. * * NB: caller must not hold lock on metapage, nor on page, that's next to * ovflbuf in the bucket chain. We don't acquire the lock on page that's * prior to ovflbuf in chain if it is same as wbuf because the caller already * has a lock on same. */ BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy) { HashMetaPage metap; Buffer metabuf; Buffer mapbuf; BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; BlockNumber writeblkno; HashPageOpaque ovflopaque; Page ovflpage; Page mappage; uint32 *freep; uint32 ovflbitno; int32 bitmappage, bitmapbit; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; Buffer prevbuf = InvalidBuffer; Buffer nextbuf = InvalidBuffer; bool update_metap = false; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); ovflblkno = BufferGetBlockNumber(ovflbuf); ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); nextblkno = ovflopaque->hasho_nextblkno; prevblkno = ovflopaque->hasho_prevblkno; writeblkno = BufferGetBlockNumber(wbuf); bucket = ovflopaque->hasho_bucket; /* * Fix up the bucket chain. this is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being * deleted. Concurrency issues are avoided by using lock chaining as * described atop hashbucketcleanup. */ if (BlockNumberIsValid(prevblkno)) { if (prevblkno == writeblkno) prevbuf = wbuf; else prevbuf = _hash_getbuf_with_strategy(rel, prevblkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, bstrategy); } if (BlockNumberIsValid(nextblkno)) nextbuf = _hash_getbuf_with_strategy(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); /* Note: bstrategy is intentionally not used for metapage and bitmap */ /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* Identify which bit to set */ ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno); bitmappage = ovflbitno >> BMPG_SHIFT(metap); bitmapbit = ovflbitno & BMPG_MASK(metap); if (bitmappage >= metap->hashm_nmaps) elog(ERROR, "invalid overflow bit number %u", ovflbitno); blkno = metap->hashm_mapp[bitmappage]; /* Release metapage lock while we access the bitmap page */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* read the bitmap page to clear the bitmap bit */ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); Assert(ISSET(freep, bitmapbit)); /* Get write-lock on metapage to update firstfree */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* This operation needs to log multiple tuples, prepare WAL for that */ if (RelationNeedsWAL(rel)) XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups); START_CRIT_SECTION(); /* * we have to insert tuples on the "write" page, being careful to preserve * hashkey ordering. (If we insert many tuples into the same "write" page * it would be worth qsort'ing them). */ if (nitups > 0) { _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); MarkBufferDirty(wbuf); } /* * Reinitialize the freed overflow page. Just zeroing the page won't * work, because WAL replay routines expect pages to be initialized. See * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are * careful to make the special space valid here so that tools like * pageinspect won't get confused. */ _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = InvalidBlockNumber; ovflopaque->hasho_nextblkno = InvalidBlockNumber; ovflopaque->hasho_bucket = -1; ovflopaque->hasho_flag = LH_UNUSED_PAGE; ovflopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(ovflbuf); if (BufferIsValid(prevbuf)) { Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); Assert(prevopaque->hasho_bucket == bucket); prevopaque->hasho_nextblkno = nextblkno; MarkBufferDirty(prevbuf); } if (BufferIsValid(nextbuf)) { Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); Assert(nextopaque->hasho_bucket == bucket); nextopaque->hasho_prevblkno = prevblkno; MarkBufferDirty(nextbuf); } /* Clear the bitmap bit to indicate that this overflow page is free */ CLRBIT(freep, bitmapbit); MarkBufferDirty(mapbuf); /* if this is now the first free page, update hashm_firstfree */ if (ovflbitno < metap->hashm_firstfree) { metap->hashm_firstfree = ovflbitno; update_metap = true; MarkBufferDirty(metabuf); } /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_squeeze_page xlrec; XLogRecPtr recptr; int i; xlrec.prevblkno = prevblkno; xlrec.nextblkno = nextblkno; xlrec.ntups = nitups; xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf); xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage); /* * bucket buffer needs to be registered to ensure that we can acquire * a cleanup lock on it during replay. */ if (!xlrec.is_prim_bucket_same_wrt) XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); if (xlrec.ntups > 0) { XLogRegisterBufData(1, (char *) itup_offsets, nitups * sizeof(OffsetNumber)); for (i = 0; i < nitups; i++) XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); } XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD); /* * If prevpage and the writepage (block in which we are moving tuples * from overflow) are same, then no need to separately register * prevpage. During replay, we can directly update the nextblock in * writepage. */ if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD); if (BufferIsValid(nextbuf)) XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD); XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD); XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32)); if (update_metap) { XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD); XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32)); } recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE); PageSetLSN(BufferGetPage(wbuf), recptr); PageSetLSN(BufferGetPage(ovflbuf), recptr); if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) PageSetLSN(BufferGetPage(prevbuf), recptr); if (BufferIsValid(nextbuf)) PageSetLSN(BufferGetPage(nextbuf), recptr); PageSetLSN(BufferGetPage(mapbuf), recptr); if (update_metap) PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); /* release previous bucket if it is not same as write bucket */ if (BufferIsValid(prevbuf) && prevblkno != writeblkno) _hash_relbuf(rel, prevbuf); if (BufferIsValid(ovflbuf)) _hash_relbuf(rel, ovflbuf); if (BufferIsValid(nextbuf)) _hash_relbuf(rel, nextbuf); _hash_relbuf(rel, mapbuf); _hash_relbuf(rel, metabuf); return nextblkno; }
/* * replay addition of overflow page for hash index */ static void hash_xlog_add_ovfl_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record); Buffer leftbuf; Buffer ovflbuf; Buffer metabuf; BlockNumber leftblk; BlockNumber rightblk; BlockNumber newmapblk = InvalidBlockNumber; Page ovflpage; HashPageOpaque ovflopaque; uint32 *num_bucket; char *data; Size datalen PG_USED_FOR_ASSERTS_ONLY; bool new_bmpage = false; XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); ovflbuf = XLogInitBufferForRedo(record, 0); Assert(BufferIsValid(ovflbuf)); data = XLogRecGetBlockData(record, 0, &datalen); num_bucket = (uint32 *) data; Assert(datalen == sizeof(uint32)); _hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE, true); /* update backlink */ ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = leftblk; PageSetLSN(ovflpage, lsn); MarkBufferDirty(ovflbuf); if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) { Page leftpage; HashPageOpaque leftopaque; leftpage = BufferGetPage(leftbuf); leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); leftopaque->hasho_nextblkno = rightblk; PageSetLSN(leftpage, lsn); MarkBufferDirty(leftbuf); } if (BufferIsValid(leftbuf)) UnlockReleaseBuffer(leftbuf); UnlockReleaseBuffer(ovflbuf); /* * Note: in normal operation, we'd update the bitmap and meta page while * still holding lock on the overflow pages. But during replay it's not * necessary to hold those locks, since no other index updates can be * happening concurrently. */ if (XLogRecHasBlockRef(record, 2)) { Buffer mapbuffer; if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) { Page mappage = (Page) BufferGetPage(mapbuffer); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; freep = HashPageGetBitmap(mappage); data = XLogRecGetBlockData(record, 2, &datalen); bitmap_page_bit = (uint32 *) data; SETBIT(freep, *bitmap_page_bit); PageSetLSN(mappage, lsn); MarkBufferDirty(mapbuffer); } if (BufferIsValid(mapbuffer)) UnlockReleaseBuffer(mapbuffer); } if (XLogRecHasBlockRef(record, 3)) { Buffer newmapbuf; newmapbuf = XLogInitBufferForRedo(record, 3); _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true); new_bmpage = true; newmapblk = BufferGetBlockNumber(newmapbuf); MarkBufferDirty(newmapbuf); PageSetLSN(BufferGetPage(newmapbuf), lsn); UnlockReleaseBuffer(newmapbuf); } if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) { HashMetaPage metap; Page page; uint32 *firstfree_ovflpage; data = XLogRecGetBlockData(record, 4, &datalen); firstfree_ovflpage = (uint32 *) data; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_firstfree = *firstfree_ovflpage; if (!xlrec->bmpage_found) { metap->hashm_spares[metap->hashm_ovflpoint]++; if (new_bmpage) { Assert(BlockNumberIsValid(newmapblk)); metap->hashm_mapp[metap->hashm_nmaps] = newmapblk; metap->hashm_nmaps++; metap->hashm_spares[metap->hashm_ovflpoint]++; } } PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
/* * _hash_squeezebucket(rel, bucket) * * Try to squeeze the tuples onto pages occurring earlier in the * bucket chain in an attempt to free overflow pages. When we start * the "squeezing", the page from which we start taking tuples (the * "read" page) is the last bucket in the bucket chain and the page * onto which we start squeezing tuples (the "write" page) is the * first page in the bucket chain. The read page works backward and * the write page works forward; the procedure terminates when the * read page and write page are the same page. * * At completion of this procedure, it is guaranteed that all pages in * the bucket are nonempty, unless the bucket is totally empty (in * which case all overflow pages will be freed). The original implementation * required that to be true on entry as well, but it's a lot easier for * callers to leave empty overflow pages and let this guy clean it up. * * Caller must acquire cleanup lock on the primary page of the target * bucket to exclude any scans that are in progress, which could easily * be confused into returning the same tuple more than once or some tuples * not at all by the rearrangement we are performing here. To prevent * any concurrent scan to cross the squeeze scan we use lock chaining * similar to hasbucketcleanup. Refer comments atop hashbucketcleanup. * * We need to retain a pin on the primary bucket to ensure that no concurrent * split can start. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. */ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy) { BlockNumber wblkno; BlockNumber rblkno; Buffer wbuf; Buffer rbuf; Page wpage; Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; /* * start squeezing into the primary bucket page. */ wblkno = bucket_blkno; wbuf = bucket_buf; wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); /* * if there aren't any overflow pages, there's nothing to squeeze. caller * is responsible for releasing the pin on primary bucket page. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) { LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); return; } /* * Find the last page in the bucket chain by starting at the base bucket * page and working forward. Note: we assume that a hash bucket chain is * usually smaller than the buffer ring being used by VACUUM, else using * the access strategy here would be counterproductive. */ rbuf = InvalidBuffer; ropaque = wopaque; do { rblkno = ropaque->hasho_nextblkno; if (rbuf != InvalidBuffer) _hash_relbuf(rel, rbuf); rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); /* * squeeze the tuples. */ for (;;) { OffsetNumber roffnum; OffsetNumber maxroffnum; OffsetNumber deletable[MaxOffsetNumber]; IndexTuple itups[MaxIndexTuplesPerPage]; Size tups_size[MaxIndexTuplesPerPage]; OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; uint16 ndeletable = 0; uint16 nitups = 0; Size all_tups_size = 0; int i; bool retain_pin = false; readpage: /* Scan each tuple in "read" page */ maxroffnum = PageGetMaxOffsetNumber(rpage); for (roffnum = FirstOffsetNumber; roffnum <= maxroffnum; roffnum = OffsetNumberNext(roffnum)) { IndexTuple itup; Size itemsz; /* skip dead tuples */ if (ItemIdIsDead(PageGetItemId(rpage, roffnum))) continue; itup = (IndexTuple) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* * Walk up the bucket chain, looking for a page big enough for * this item and all other accumulated items. Exit if we reach * the read page. */ while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz)) { Buffer next_wbuf = InvalidBuffer; bool tups_moved = false; Assert(!PageIsEmpty(wpage)); if (wblkno == bucket_blkno) retain_pin = true; wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); /* don't need to move to next page if we reached the read page */ if (wblkno != rblkno) next_wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); if (nitups > 0) { Assert(nitups == ndeletable); /* * This operation needs to log multiple tuples, prepare * WAL for that. */ if (RelationNeedsWAL(rel)) XLogEnsureRecordSpace(0, 3 + nitups); START_CRIT_SECTION(); /* * we have to insert tuples on the "write" page, being * careful to preserve hashkey ordering. (If we insert * many tuples into the same "write" page it would be * worth qsort'ing them). */ _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); MarkBufferDirty(wbuf); /* Delete tuples we already moved off read page */ PageIndexMultiDelete(rpage, deletable, ndeletable); MarkBufferDirty(rbuf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; xl_hash_move_page_contents xlrec; xlrec.ntups = nitups; xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents); /* * bucket buffer needs to be registered to ensure that * we can acquire a cleanup lock on it during replay. */ if (!xlrec.is_prim_bucket_same_wrt) XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); XLogRegisterBufData(1, (char *) itup_offsets, nitups * sizeof(OffsetNumber)); for (i = 0; i < nitups; i++) XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); XLogRegisterBufData(2, (char *) deletable, ndeletable * sizeof(OffsetNumber)); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS); PageSetLSN(BufferGetPage(wbuf), recptr); PageSetLSN(BufferGetPage(rbuf), recptr); } END_CRIT_SECTION(); tups_moved = true; } /* * release the lock on previous page after acquiring the lock * on next page */ if (retain_pin) LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, wbuf); /* nothing more to do if we reached the read page */ if (rblkno == wblkno) { _hash_relbuf(rel, rbuf); return; } wbuf = next_wbuf; wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); retain_pin = false; /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); nitups = 0; all_tups_size = 0; ndeletable = 0; /* * after moving the tuples, rpage would have been compacted, * so we need to rescan it. */ if (tups_moved) goto readpage; } /* remember tuple for deletion from "read" page */ deletable[ndeletable++] = roffnum; /* * we need a copy of index tuples as they can be freed as part of * overflow page, however we need them to write a WAL record in * _hash_freeovflpage. */ itups[nitups] = CopyIndexTuple(itup); tups_size[nitups++] = itemsz; all_tups_size += itemsz; } /* * If we reach here, there are no live tuples on the "read" page --- * it was empty when we got to it, or we moved them all. So we can * just free the page without bothering with deleting tuples * individually. Then advance to the previous "read" page. * * Tricky point here: if our read and write pages are adjacent in the * bucket chain, our write lock on wbuf will conflict with * _hash_freeovflpage's attempt to update the sibling links of the * removed page. In that case, we don't need to lock it again. */ rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); /* free this overflow page (releases rbuf) */ _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets, tups_size, nitups, bstrategy); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); /* are we freeing the page adjacent to wbuf? */ if (rblkno == wblkno) { /* retain the pin on primary bucket page till end of bucket scan */ if (wblkno == bucket_blkno) LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, wbuf); return; } rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } /* NOTREACHED */ }
/* * _hash_first() -- Find the first item in a scan. * * Find the first item in the index that * satisfies the qualification associated with the scan descriptor. On * success, the page containing the current index tuple is read locked * and pinned, and the scan's opaque data entry is updated to * include the buffer. */ bool _hash_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; ScanKey cur; uint32 hashkey; Bucket bucket; BlockNumber blkno; Buffer buf; Buffer metabuf; Page page; HashPageOpaque opaque; HashMetaPage metap; IndexTuple itup; ItemPointer current; OffsetNumber offnum; pgstat_count_index_scan(rel); current = &(so->hashso_curpos); ItemPointerSetInvalid(current); /* * We do not support hash scans with no index qualification, because we * would have to read the whole index rather than just one bucket. That * creates a whole raft of problems, since we haven't got a practical way * to lock all the buckets against splits or compactions. */ if (scan->numberOfKeys < 1) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("hash indexes do not support whole-index scans"))); /* There may be more than one index qual, but we hash only the first */ cur = &scan->keyData[0]; /* We support only single-column hash indexes */ Assert(cur->sk_attno == 1); /* And there's only one operator strategy, too */ Assert(cur->sk_strategy == HTEqualStrategyNumber); /* * If the constant in the index qual is NULL, assume it cannot match any * items in the index. */ if (cur->sk_flags & SK_ISNULL) return false; /* * Okay to compute the hash key. We want to do this before acquiring any * locks, in case a user-defined hash function happens to be slow. * * If scankey operator is not a cross-type comparison, we can use the * cached hash function; otherwise gotta look it up in the catalogs. * * We support the convention that sk_subtype == InvalidOid means the * opclass input type; this is a hack to simplify life for ScanKeyInit(). */ if (cur->sk_subtype == rel->rd_opcintype[0] || cur->sk_subtype == InvalidOid) hashkey = _hash_datum2hashkey(rel, cur->sk_argument); else hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument, cur->sk_subtype); so->hashso_sk_hash = hashkey; /* * Acquire shared split lock so we can compute the target bucket safely * (see README). */ _hash_getlock(rel, 0, HASH_SHARE); /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Compute the target bucket number, and convert to block number. */ bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); blkno = BUCKET_TO_BLKNO(metap, bucket); /* done with the metapage */ _hash_relbuf(rel, metabuf); /* * Acquire share lock on target bucket; then we can release split lock. */ _hash_getlock(rel, blkno, HASH_SHARE); _hash_droplock(rel, 0, HASH_SHARE); /* Update scan opaque state to show we have lock on the bucket */ so->hashso_bucket = bucket; so->hashso_bucket_valid = true; so->hashso_bucket_blkno = blkno; /* Fetch the primary bucket page for the bucket */ buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == bucket); /* If a backwards scan is requested, move to the end of the chain */ if (ScanDirectionIsBackward(dir)) { while (BlockNumberIsValid(opaque->hasho_nextblkno)) _hash_readnext(rel, &buf, &page, &opaque); } /* Now find the first tuple satisfying the qualification */ if (!_hash_step(scan, &buf, dir)) return false; /* if we're here, _hash_step found a valid tuple */ offnum = ItemPointerGetOffsetNumber(current); _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); so->hashso_heappos = itup->t_tid; return true; }
static void gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) { gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); PageSplitRecord xlrec; Buffer firstbuffer = InvalidBuffer; Buffer buffer; Page page; int i; bool isrootsplit = false; decodePageSplitRecord(&xlrec, record); /* * We must hold lock on the first-listed page throughout the action, * including while updating the left child page (if any). We can unlock * remaining pages in the list as soon as they've been written, because * there is no path for concurrent queries to reach those pages without * first visiting the first-listed page. */ /* loop around all pages */ for (i = 0; i < xlrec.data->npage; i++) { NewPage *newpage = xlrec.page + i; int flags; if (newpage->header->blkno == GIST_ROOT_BLKNO) { Assert(i == 0); isrootsplit = true; } buffer = XLogReadBuffer(xlrec.data->node, newpage->header->blkno, true); Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); /* ok, clear buffer */ if (xlrec.data->origleaf && newpage->header->blkno != GIST_ROOT_BLKNO) flags = F_LEAF; else flags = 0; GISTInitBuffer(buffer, flags); /* and fill it */ gistfillbuffer(page, newpage->itup, newpage->header->num, FirstOffsetNumber); if (newpage->header->blkno == GIST_ROOT_BLKNO) { GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; GistPageSetNSN(page, xldata->orignsn); GistClearFollowRight(page); } else { if (i < xlrec.data->npage - 1) GistPageGetOpaque(page)->rightlink = xlrec.page[i + 1].header->blkno; else GistPageGetOpaque(page)->rightlink = xldata->origrlink; GistPageSetNSN(page, xldata->orignsn); if (i < xlrec.data->npage - 1 && !isrootsplit && xldata->markfollowright) GistMarkFollowRight(page); else GistClearFollowRight(page); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); if (i == 0) firstbuffer = buffer; else UnlockReleaseBuffer(buffer); } /* Fix follow-right data on left child page, if any */ if (BlockNumberIsValid(xldata->leftchild)) gistRedoClearFollowRight(lsn, record, 0, xldata->node, xldata->leftchild); /* Finally, release lock on the first page */ UnlockReleaseBuffer(firstbuffer); }
/* * Helper function to perform deletion of index entries from a bucket. * * This function expects that the caller has acquired a cleanup lock on the * primary bucket page, and will return with a write lock again held on the * primary bucket page. The lock won't necessarily be held continuously, * though, because we'll release it when visiting overflow pages. * * It would be very bad if this function cleaned a page while some other * backend was in the midst of scanning it, because hashgettuple assumes * that the next valid TID will be greater than or equal to the current * valid TID. There can't be any concurrent scans in progress when we first * enter this function because of the cleanup lock we hold on the primary * bucket page, but as soon as we release that lock, there might be. We * handle that by conspiring to prevent those scans from passing our cleanup * scan. To do that, we lock the next page in the bucket chain before * releasing the lock on the previous page. (This type of lock chaining is * not ideal, so we might want to look for a better solution at some point.) * * We need to retain a pin on the primary bucket to ensure that no concurrent * split can start. */ void hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, uint32 maxbucket, uint32 highmask, uint32 lowmask, double *tuples_removed, double *num_index_tuples, bool split_cleanup, IndexBulkDeleteCallback callback, void *callback_state) { BlockNumber blkno; Buffer buf; Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; bool bucket_dirty = false; blkno = bucket_blkno; buf = bucket_buf; if (split_cleanup) new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, lowmask, maxbucket); /* Scan each page in bucket */ for (;;) { HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; Buffer next_buf; Page page; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; bool retain_pin = false; bool clear_dead_marking = false; vacuum_delay_point(); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { ItemPointer htup; IndexTuple itup; Bucket bucket; bool kill_tuple = false; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); /* * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. */ if (callback && callback(htup, callback_state)) { kill_tuple = true; if (tuples_removed) *tuples_removed += 1; } else if (split_cleanup) { /* delete the tuples that are moved by split. */ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); /* mark the item for deletion */ if (bucket != cur_bucket) { /* * We expect tuples to either belong to current bucket or * new_bucket. This is ensured because we don't allow * further splits from bucket that contains garbage. See * comments in _hash_expandtable. */ Assert(bucket == new_bucket); kill_tuple = true; } } if (kill_tuple) { /* mark the item for deletion */ deletable[ndeletable++] = offno; } else { /* we're keeping it, so count it */ if (num_index_tuples) *num_index_tuples += 1; } } /* retain the pin on primary bucket page till end of bucket scan */ if (blkno == bucket_blkno) retain_pin = true; else retain_pin = false; blkno = opaque->hasho_nextblkno; /* * Apply deletions, advance to next page and write page if needed. */ if (ndeletable > 0) { /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; /* * Let us mark the page as clean if vacuum removes the DEAD tuples * from an index page. We do this by clearing * LH_PAGE_HAS_DEAD_TUPLES flag. */ if (tuples_removed && *tuples_removed > 0 && H_HAS_DEAD_TUPLES(opaque)) { opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; clear_dead_marking = true; } MarkBufferDirty(buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_delete xlrec; XLogRecPtr recptr; xlrec.clear_dead_marking = clear_dead_marking; xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashDelete); /* * bucket buffer needs to be registered to ensure that we can * acquire a cleanup lock on it during replay. */ if (!xlrec.is_primary_bucket_page) XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, buf, REGBUF_STANDARD); XLogRegisterBufData(1, (char *) deletable, ndeletable * sizeof(OffsetNumber)); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); PageSetLSN(BufferGetPage(buf), recptr); } END_CRIT_SECTION(); } /* bail out if there are no more pages to scan. */ if (!BlockNumberIsValid(blkno)) break; next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); /* * release the lock on previous page after acquiring the lock on next * page */ if (retain_pin) LockBuffer(buf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, buf); buf = next_buf; } /* * lock the bucket page to clear the garbage flag and squeeze the bucket. * if the current buffer is same as bucket buffer, then we already have * lock on bucket page. */ if (buf != bucket_buf) { _hash_relbuf(rel, buf); LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); } /* * Clear the garbage flag from bucket after deleting the tuples that are * moved by split. We purposefully clear the flag before squeeze bucket, * so that after restart, vacuum shouldn't again try to delete the moved * by split tuples. */ if (split_cleanup) { HashPageOpaque bucket_opaque; Page page; page = BufferGetPage(bucket_buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; MarkBufferDirty(bucket_buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; XLogBeginInsert(); XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); PageSetLSN(page, recptr); } END_CRIT_SECTION(); } /* * If we have deleted anything, try to compact free space. For squeezing * the bucket, we must have a cleanup lock, else it can impact the * ordering of tuples for a scan that has started before it. */ if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, bstrategy); else LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); }
/* * redo any page update (except page split) */ static void gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) { char *begin = XLogRecGetData(record); gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin; Buffer buffer; Page page; char *data; /* * We need to acquire and hold lock on target page while updating the left * child page. If we have a full-page image of target page, getting the * lock is a side-effect of restoring that image. Note that even if the * target page no longer exists, we'll still attempt to replay the change * on the child page. */ if (record->xl_info & XLR_BKP_BLOCK(0)) buffer = RestoreBackupBlock(lsn, record, 0, false, true); else buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); /* Fix follow-right data on left child page */ if (BlockNumberIsValid(xldata->leftchild)) gistRedoClearFollowRight(lsn, record, 1, xldata->node, xldata->leftchild); /* Done if target page no longer exists */ if (!BufferIsValid(buffer)) return; /* nothing more to do if page was backed up (and no info to do it with) */ if (record->xl_info & XLR_BKP_BLOCK(0)) { UnlockReleaseBuffer(buffer); return; } page = (Page) BufferGetPage(buffer); /* nothing more to do if change already applied */ if (lsn <= PageGetLSN(page)) { UnlockReleaseBuffer(buffer); return; } data = begin + sizeof(gistxlogPageUpdate); /* Delete old tuples */ if (xldata->ntodelete > 0) { int i; OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; for (i = 0; i < xldata->ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* add tuples */ if (data - begin < record->xl_len) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); while (data - begin < record->xl_len) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); OffsetNumber l; data += sz; l = PageAddItem(page, (Item) itup, sz, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; } } else { /* * special case: leafpage, nothing to insert, nothing to delete, then * vacuum marks page */ if (GistPageIsLeaf(page) && xldata->ntodelete == 0) GistClearTuplesDeleted(page); } if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO) { /* * all links on non-leaf root page was deleted by vacuum full, so root * page becomes a leaf */ GistPageSetLeaf(page); } GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); }
/* * Insert value (stored in RumBtree) to tree described by stack * * During an index build, buildStats is non-null and the counters * it contains should be incremented as needed. * * NB: the passed-in stack is freed, as though by freeRumBtreeStack. */ void rumInsertValue(Relation index, RumBtree btree, RumBtreeStack * stack, GinStatsData *buildStats) { RumBtreeStack *parent; BlockNumber rootBlkno; Page page, rpage, lpage; GenericXLogState *state = NULL; /* extract root BlockNumber from stack */ Assert(stack != NULL); parent = stack; while (parent->parent) parent = parent->parent; rootBlkno = parent->blkno; Assert(BlockNumberIsValid(rootBlkno)); /* this loop crawls up the stack until the insertion is complete */ for (;;) { BlockNumber savedLeftLink, savedRightLink; page = BufferGetPage(stack->buffer); savedLeftLink = RumPageGetOpaque(page)->leftlink; savedRightLink = RumPageGetOpaque(page)->rightlink; if (btree->isEnoughSpace(btree, stack->buffer, stack->off)) { if (btree->rumstate->isBuild) { page = BufferGetPage(stack->buffer); START_CRIT_SECTION(); } else { state = GenericXLogStart(index); page = GenericXLogRegisterBuffer(state, stack->buffer, 0); } btree->placeToPage(btree, page, stack->off); if (btree->rumstate->isBuild) { MarkBufferDirty(stack->buffer); END_CRIT_SECTION(); } else GenericXLogFinish(state); LockBuffer(stack->buffer, RUM_UNLOCK); freeRumBtreeStack(stack); return; } else { Buffer rbuffer = RumNewBuffer(btree->index); Page newlpage; /* During index build, count the newly-split page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } parent = stack->parent; if (parent == NULL) { Buffer lbuffer; if (btree->rumstate->isBuild) { page = BufferGetPage(stack->buffer); rpage = BufferGetPage(rbuffer); } else { state = GenericXLogStart(index); page = GenericXLogRegisterBuffer(state, stack->buffer, 0); rpage = GenericXLogRegisterBuffer(state, rbuffer, GENERIC_XLOG_FULL_IMAGE); } /* * newlpage is a pointer to memory page, it doesn't associate * with buffer, stack->buffer should be untouched */ newlpage = btree->splitPage(btree, stack->buffer, rbuffer, page, rpage, stack->off); /* * split root, so we need to allocate new left page and place * pointer on root to left and right page */ lbuffer = RumNewBuffer(btree->index); if (btree->rumstate->isBuild) lpage = BufferGetPage(lbuffer); else lpage = GenericXLogRegisterBuffer(state, lbuffer, GENERIC_XLOG_FULL_IMAGE); RumPageGetOpaque(rpage)->rightlink = InvalidBlockNumber; RumPageGetOpaque(newlpage)->leftlink = InvalidBlockNumber; RumPageGetOpaque(rpage)->leftlink = BufferGetBlockNumber(lbuffer); RumPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); RumInitPage(page, RumPageGetOpaque(newlpage)->flags & ~RUM_LEAF, BufferGetPageSize(stack->buffer)); PageRestoreTempPage(newlpage, lpage); btree->fillRoot(btree, stack->buffer, lbuffer, rbuffer, page, lpage, rpage); PredicateLockPageSplit(btree->index, BufferGetBlockNumber(stack->buffer), BufferGetBlockNumber(lbuffer)); PredicateLockPageSplit(btree->index, BufferGetBlockNumber(stack->buffer), BufferGetBlockNumber(rbuffer)); if (btree->rumstate->isBuild) { START_CRIT_SECTION(); MarkBufferDirty(rbuffer); MarkBufferDirty(lbuffer); MarkBufferDirty(stack->buffer); } else GenericXLogFinish(state); UnlockReleaseBuffer(rbuffer); UnlockReleaseBuffer(lbuffer); LockBuffer(stack->buffer, RUM_UNLOCK); if (btree->rumstate->isBuild) END_CRIT_SECTION(); freeRumBtreeStack(stack); /* During index build, count the newly-added root page */ if (buildStats) { if (btree->isData) buildStats->nDataPages++; else buildStats->nEntryPages++; } return; } else { BlockNumber rightrightBlkno = InvalidBlockNumber; Buffer rightrightBuffer = InvalidBuffer; /* split non-root page */ if (btree->rumstate->isBuild) { lpage = BufferGetPage(stack->buffer); rpage = BufferGetPage(rbuffer); } else { state = GenericXLogStart(index); lpage = GenericXLogRegisterBuffer(state, stack->buffer, 0); rpage = GenericXLogRegisterBuffer(state, rbuffer, 0); } rightrightBlkno = RumPageGetOpaque(lpage)->rightlink; /* * newlpage is a pointer to memory page, it doesn't associate * with buffer, stack->buffer should be untouched */ newlpage = btree->splitPage(btree, stack->buffer, rbuffer, lpage, rpage, stack->off); RumPageGetOpaque(rpage)->rightlink = savedRightLink; RumPageGetOpaque(newlpage)->leftlink = savedLeftLink; RumPageGetOpaque(rpage)->leftlink = BufferGetBlockNumber(stack->buffer); RumPageGetOpaque(newlpage)->rightlink = BufferGetBlockNumber(rbuffer); PredicateLockPageSplit(btree->index, BufferGetBlockNumber(stack->buffer), BufferGetBlockNumber(rbuffer)); /* * it's safe because we don't have right-to-left walking * with locking bth pages except vacuum. But vacuum will * try to lock all pages with conditional lock */ if (rightrightBlkno != InvalidBlockNumber) { Page rightrightPage; rightrightBuffer = ReadBuffer(btree->index, rightrightBlkno); LockBuffer(rightrightBuffer, RUM_EXCLUSIVE); if (btree->rumstate->isBuild) rightrightPage = BufferGetPage(rightrightBuffer); else rightrightPage = GenericXLogRegisterBuffer(state, rightrightBuffer, 0); RumPageGetOpaque(rightrightPage)->leftlink = BufferGetBlockNumber(rbuffer); } if (btree->rumstate->isBuild) START_CRIT_SECTION(); PageRestoreTempPage(newlpage, lpage); if (btree->rumstate->isBuild) { MarkBufferDirty(rbuffer); MarkBufferDirty(stack->buffer); if (rightrightBlkno != InvalidBlockNumber) MarkBufferDirty(rightrightBuffer); END_CRIT_SECTION(); } else GenericXLogFinish(state); UnlockReleaseBuffer(rbuffer); if (rightrightBlkno != InvalidBlockNumber) UnlockReleaseBuffer(rightrightBuffer); } } btree->isDelete = false; /* search parent to lock */ LockBuffer(parent->buffer, RUM_EXCLUSIVE); /* move right if it's needed */ page = BufferGetPage(parent->buffer); while ((parent->off = btree->findChildPtr(btree, page, stack->blkno, parent->off)) == InvalidOffsetNumber) { BlockNumber rightlink = RumPageGetOpaque(page)->rightlink; if (rightlink == InvalidBlockNumber) { /* * rightmost page, but we don't find parent, we should use * plain search... */ LockBuffer(parent->buffer, RUM_UNLOCK); rumFindParents(btree, stack, rootBlkno); parent = stack->parent; Assert(parent != NULL); break; } parent->buffer = rumStep(parent->buffer, btree->index, RUM_EXCLUSIVE, ForwardScanDirection); parent->blkno = rightlink; page = BufferGetPage(parent->buffer); } UnlockReleaseBuffer(stack->buffer); pfree(stack); stack = parent; } }
static void gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record) { gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); PageSplitRecord xlrec; Buffer buffer; Page page; int i; bool isrootsplit = false; if (BlockNumberIsValid(xldata->leftchild)) gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild); decodePageSplitRecord(&xlrec, record); /* loop around all pages */ for (i = 0; i < xlrec.data->npage; i++) { NewPage *newpage = xlrec.page + i; int flags; if (newpage->header->blkno == GIST_ROOT_BLKNO) { Assert(i == 0); isrootsplit = true; } buffer = XLogReadBuffer(xlrec.data->node, newpage->header->blkno, true); Assert(BufferIsValid(buffer)); page = (Page) BufferGetPage(buffer); /* ok, clear buffer */ if (xlrec.data->origleaf && newpage->header->blkno != GIST_ROOT_BLKNO) flags = F_LEAF; else flags = 0; GISTInitBuffer(buffer, flags); /* and fill it */ gistfillbuffer(page, newpage->itup, newpage->header->num, FirstOffsetNumber); if (newpage->header->blkno == GIST_ROOT_BLKNO) { GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; GistPageGetOpaque(page)->nsn = xldata->orignsn; GistClearFollowRight(page); } else { if (i < xlrec.data->npage - 1) GistPageGetOpaque(page)->rightlink = xlrec.page[i + 1].header->blkno; else GistPageGetOpaque(page)->rightlink = xldata->origrlink; GistPageGetOpaque(page)->nsn = xldata->orignsn; if (i < xlrec.data->npage - 1 && !isrootsplit) GistMarkFollowRight(page); else GistClearFollowRight(page); } PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } }
/* * _hash_squeezebucket(rel, bucket) * * Try to squeeze the tuples onto pages occurring earlier in the * bucket chain in an attempt to free overflow pages. When we start * the "squeezing", the page from which we start taking tuples (the * "read" page) is the last bucket in the bucket chain and the page * onto which we start squeezing tuples (the "write" page) is the * first page in the bucket chain. The read page works backward and * the write page works forward; the procedure terminates when the * read page and write page are the same page. * * At completion of this procedure, it is guaranteed that all pages in * the bucket are nonempty, unless the bucket is totally empty (in * which case all overflow pages will be freed). The original implementation * required that to be true on entry as well, but it's a lot easier for * callers to leave empty overflow pages and let this guy clean it up. * * Caller must hold exclusive lock on the target bucket. This allows * us to safely lock multiple pages in the bucket. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. */ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy) { BlockNumber wblkno; BlockNumber rblkno; Buffer wbuf; Buffer rbuf; Page wpage; Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; bool wbuf_dirty; /* * start squeezing into the base bucket page. */ wblkno = bucket_blkno; wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE, bstrategy); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); /* * if there aren't any overflow pages, there's nothing to squeeze. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) { _hash_relbuf(rel, wbuf); return; } /* * Find the last page in the bucket chain by starting at the base bucket * page and working forward. Note: we assume that a hash bucket chain is * usually smaller than the buffer ring being used by VACUUM, else using * the access strategy here would be counterproductive. */ rbuf = InvalidBuffer; ropaque = wopaque; do { rblkno = ropaque->hasho_nextblkno; if (rbuf != InvalidBuffer) _hash_relbuf(rel, rbuf); rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); /* * squeeze the tuples. */ wbuf_dirty = false; for (;;) { OffsetNumber roffnum; OffsetNumber maxroffnum; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; /* Scan each tuple in "read" page */ maxroffnum = PageGetMaxOffsetNumber(rpage); for (roffnum = FirstOffsetNumber; roffnum <= maxroffnum; roffnum = OffsetNumberNext(roffnum)) { IndexTuple itup; Size itemsz; itup = (IndexTuple) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* * Walk up the bucket chain, looking for a page big enough for * this item. Exit if we reach the read page. */ while (PageGetFreeSpace(wpage) < itemsz) { Assert(!PageIsEmpty(wpage)); wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); if (wbuf_dirty) _hash_wrtbuf(rel, wbuf); else _hash_relbuf(rel, wbuf); /* nothing more to do if we reached the read page */ if (rblkno == wblkno) { if (ndeletable > 0) { /* Delete tuples we already moved off read page */ PageIndexMultiDelete(rpage, deletable, ndeletable); _hash_wrtbuf(rel, rbuf); } else _hash_relbuf(rel, rbuf); return; } wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); wbuf_dirty = false; } /* * we have found room so insert on the "write" page, being careful * to preserve hashkey ordering. (If we insert many tuples into * the same "write" page it would be worth qsort'ing instead of * doing repeated _hash_pgaddtup.) */ (void) _hash_pgaddtup(rel, wbuf, itemsz, itup); wbuf_dirty = true; /* remember tuple for deletion from "read" page */ deletable[ndeletable++] = roffnum; } /* * If we reach here, there are no live tuples on the "read" page --- * it was empty when we got to it, or we moved them all. So we can * just free the page without bothering with deleting tuples * individually. Then advance to the previous "read" page. * * Tricky point here: if our read and write pages are adjacent in the * bucket chain, our write lock on wbuf will conflict with * _hash_freeovflpage's attempt to update the sibling links of the * removed page. However, in that case we are done anyway, so we can * simply drop the write lock before calling _hash_freeovflpage. */ rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); /* are we freeing the page adjacent to wbuf? */ if (rblkno == wblkno) { /* yes, so release wbuf lock first */ if (wbuf_dirty) _hash_wrtbuf(rel, wbuf); else _hash_relbuf(rel, wbuf); /* free this overflow page (releases rbuf) */ _hash_freeovflpage(rel, rbuf, bstrategy); /* done */ return; } /* free this overflow page, then get the previous one */ _hash_freeovflpage(rel, rbuf, bstrategy); rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } /* NOTREACHED */ }
/* * redo any page update (except page split) */ static void gistRedoPageUpdateRecord(XLogRecPtr lsn, XLogRecord *record) { char *begin = XLogRecGetData(record); gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) begin; Buffer buffer; Page page; char *data; if (BlockNumberIsValid(xldata->leftchild)) gistRedoClearFollowRight(xldata->node, lsn, xldata->leftchild); /* nothing more to do if page was backed up (and no info to do it with) */ if (record->xl_info & XLR_BKP_BLOCK_1) return; buffer = XLogReadBuffer(xldata->node, xldata->blkno, false); if (!BufferIsValid(buffer)) return; page = (Page) BufferGetPage(buffer); if (XLByteLE(lsn, PageGetLSN(page))) { UnlockReleaseBuffer(buffer); return; } data = begin + sizeof(gistxlogPageUpdate); /* Delete old tuples */ if (xldata->ntodelete > 0) { int i; OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; for (i = 0; i < xldata->ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* add tuples */ if (data - begin < record->xl_len) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); while (data - begin < record->xl_len) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); OffsetNumber l; data += sz; l = PageAddItem(page, (Item) itup, sz, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; } } else { /* * special case: leafpage, nothing to insert, nothing to delete, then * vacuum marks page */ if (GistPageIsLeaf(page) && xldata->ntodelete == 0) GistClearTuplesDeleted(page); } if (!GistPageIsLeaf(page) && PageGetMaxOffsetNumber(page) == InvalidOffsetNumber && xldata->blkno == GIST_ROOT_BLKNO) /* * all links on non-leaf root page was deleted by vacuum full, so root * page becomes a leaf */ GistPageSetLeaf(page); GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); }
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf; HashMetaPage metap; HashMetaPageData local_metapage; tuples_removed = 0; num_index_tuples = 0; /* * Read the metapage to fetch original bucket and tuple counts. Also, we * keep a copy of the last-seen metapage so that we can use its * hashm_spares[] values to compute bucket page addresses. This is a bit * hokey but perfectly safe, since the interesting entries in the spares * array cannot change under us; and it beats rereading the metapage for * each bucket. */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); orig_maxbucket = metap->hashm_maxbucket; orig_ntuples = metap->hashm_ntuples; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; bool bucket_dirty = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket); /* Exclusive-lock the bucket so we can shrink it */ _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Shouldn't have any active scans locally, either */ if (_hash_has_active_scan(rel, cur_bucket)) elog(ERROR, "hash index has active scan during VACUUM"); /* Scan each page in bucket */ blkno = bucket_blkno; while (BlockNumberIsValid(blkno)) { Buffer buf; Page page; HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; vacuum_delay_point(); buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == cur_bucket); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); if (callback(htup, callback_state)) { /* mark the item for deletion */ deletable[ndeletable++] = offno; tuples_removed += 1; } else num_index_tuples += 1; } /* * Apply deletions and write page if needed, advance to next page. */ blkno = opaque->hasho_nextblkno; if (ndeletable > 0) { PageIndexMultiDelete(page, deletable, ndeletable); _hash_wrtbuf(rel, buf); bucket_dirty = true; } else _hash_relbuf(rel, buf); } /* If we deleted anything, try to compact free space */ if (bucket_dirty) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy); /* Release bucket lock */ _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Advance to next bucket */ cur_bucket++; } /* Write-lock metapage and check for split since we started */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ cur_maxbucket = metap->hashm_maxbucket; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } _hash_wrtbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ return stats; }
/* * _hash_addovflpage * * Add an overflow page to the bucket whose last page is pointed to by 'buf'. * * On entry, the caller must hold a pin but no lock on 'buf'. The pin is * dropped before exiting (we assume the caller is not interested in 'buf' * anymore). The returned overflow page will be pinned and write-locked; * it is guaranteed to be empty. * * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is returned in the same state. * * The caller must hold at least share lock on the bucket, to ensure that * no one else tries to compact the bucket meanwhile. This guarantees that * 'buf' won't stop being part of the bucket while it's unlocked. * * NB: since this could be executed concurrently by multiple processes, * one should not assume that the returned overflow page will be the * immediate successor of the originally passed 'buf'. Additional overflow * pages might have been added to the bucket chain in between. */ Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf) { Buffer ovflbuf; Page page; Page ovflpage; HashPageOpaque pageopaque; HashPageOpaque ovflopaque; /*CS3223*/ /*declare variables*/ HashMetaPage metap; Bucket bucket; int i; int index; int bitIndexInElement; uint32 ovflElement; uint32 *tempPointer; /* allocate and lock an empty overflow page */ ovflbuf = _hash_getovflpage(rel, metabuf); /*CS3223*/ metap = HashPageGetMeta(BufferGetPage(metabuf)); /* find bucket number of primary page */ page = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); bucket = pageopaque -> hasho_bucket; /* * Write-lock the tail page. It is okay to hold two buffer locks here * since there cannot be anyone else contending for access to ovflbuf. */ _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE); /* probably redundant... */ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); /* loop to find current tail page, in case someone else inserted too */ //for (;;) /*CS3223*/ while (i>=0) { BlockNumber nextblkno; page = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); nextblkno = pageopaque->hasho_nextblkno; if (!BlockNumberIsValid(nextblkno)) break; /* we assume we do not need to write the unmodified page */ _hash_relbuf(rel, buf); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); /*CS3223*/ i++; } /* now that we have correct backlink, initialize new overflow page */ ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); ovflopaque->hasho_nextblkno = InvalidBlockNumber; ovflopaque->hasho_bucket = pageopaque->hasho_bucket; ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; ovflopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(ovflbuf); /* logically chain overflow page to previous page */ pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); _hash_wrtbuf(rel, buf); /*CS3223*/ /* if length of the bucket chain is 1, only one ovflpage was added, which means the bucket was not split before */ if (i == 1) { index = bucket / 32; bitIndexInElement = bucket % 32; ovflElement = (uint32)metap->ovflBkts[index]; ovflElement = ovflElement | (1 << bitIndexInElement); tempPointer = &(metap->ovflBkts[index]); *tempPointer = ovflElement; } return ovflbuf; }