/*
 * _bt_restore_page -- re-enter all the index tuples on a page
 *
 * The page is freshly init'd, and *from (length len) is a copy of what
 * had been its upper part (pd_upper to pd_special).  We assume that the
 * tuples had been added to the page in item-number order, and therefore
 * the one with highest item number appears first (lowest on the page).
 */
static void
_bt_restore_page(Page page, char *from, int len)
{
    IndexTupleData itupdata;
    Size        itemsz;
    char       *end = from + len;
    Item        items[MaxIndexTuplesPerPage];
    uint16      itemsizes[MaxIndexTuplesPerPage];
    int         i;
    int         nitems;

    /*
     * To get the items back in the original order, we add them to the page
     * in reverse.  To figure out where one tuple ends and another begins, we
     * have to scan them in forward order first.
     */
    i = 0;
    while (from < end)
    {
        /* Need to copy tuple header due to alignment considerations */
        memcpy(&itupdata, from, sizeof(IndexTupleData));
        itemsz = IndexTupleDSize(itupdata);
        itemsz = MAXALIGN(itemsz);

        items[i] = (Item) from;
        itemsizes[i] = itemsz;
        i++;

        from += itemsz;
    }
    nitems = i;

    for (i = nitems - 1; i >= 0; i--)
    {
        if (PageAddItem(page, items[i], itemsizes[i], nitems - i,
                        false, false) == InvalidOffsetNumber)
            elog(PANIC, "_bt_restore_page: cannot add item to page");
    }
}
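/*
 * The forward scan above steps from one tuple to the next by its MAXALIGN'd
 * size, which is also the size the tuple occupied when it was stored.  A
 * minimal standalone sketch of that rounding, assuming 8-byte maximum
 * alignment (typical for 64-bit builds); the server's real TYPEALIGN/MAXALIGN
 * macros live in c.h.
 */
#include <stdio.h>
#include <stdint.h>

#define TYPEALIGN(ALIGNVAL, LEN) \
    (((uintptr_t) (LEN) + ((ALIGNVAL) - 1)) & ~((uintptr_t) ((ALIGNVAL) - 1)))
#define MAXALIGN(LEN)   TYPEALIGN(8, (LEN))

int
main(void)
{
    size_t      raw[] = {5, 8, 13, 20, 24};

    for (int i = 0; i < 5; i++)
        printf("raw item size %zu -> stored (MAXALIGN'd) size %zu\n",
               raw[i], (size_t) MAXALIGN(raw[i]));
    return 0;
}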
/* * _hash_squeezebucket(rel, bucket) * * Try to squeeze the tuples onto pages occurring earlier in the * bucket chain in an attempt to free overflow pages. When we start * the "squeezing", the page from which we start taking tuples (the * "read" page) is the last bucket in the bucket chain and the page * onto which we start squeezing tuples (the "write" page) is the * first page in the bucket chain. The read page works backward and * the write page works forward; the procedure terminates when the * read page and write page are the same page. * * At completion of this procedure, it is guaranteed that all pages in * the bucket are nonempty, unless the bucket is totally empty (in * which case all overflow pages will be freed). The original implementation * required that to be true on entry as well, but it's a lot easier for * callers to leave empty overflow pages and let this guy clean it up. * * Caller must hold exclusive lock on the target bucket. This allows * us to safely lock multiple pages in the bucket. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. */ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, BufferAccessStrategy bstrategy) { BlockNumber wblkno; BlockNumber rblkno; Buffer wbuf; Buffer rbuf; Page wpage; Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; bool wbuf_dirty; /* * start squeezing into the base bucket page. */ wblkno = bucket_blkno; wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_BUCKET_PAGE, bstrategy); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); /* * if there aren't any overflow pages, there's nothing to squeeze. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) { _hash_relbuf(rel, wbuf); return; } /* * Find the last page in the bucket chain by starting at the base bucket * page and working forward. Note: we assume that a hash bucket chain is * usually smaller than the buffer ring being used by VACUUM, else using * the access strategy here would be counterproductive. */ rbuf = InvalidBuffer; ropaque = wopaque; do { rblkno = ropaque->hasho_nextblkno; if (rbuf != InvalidBuffer) _hash_relbuf(rel, rbuf); rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); /* * squeeze the tuples. */ wbuf_dirty = false; for (;;) { OffsetNumber roffnum; OffsetNumber maxroffnum; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; /* Scan each tuple in "read" page */ maxroffnum = PageGetMaxOffsetNumber(rpage); for (roffnum = FirstOffsetNumber; roffnum <= maxroffnum; roffnum = OffsetNumberNext(roffnum)) { IndexTuple itup; Size itemsz; itup = (IndexTuple) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* * Walk up the bucket chain, looking for a page big enough for * this item. Exit if we reach the read page. 
*/ while (PageGetFreeSpace(wpage) < itemsz) { Assert(!PageIsEmpty(wpage)); wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); if (wbuf_dirty) _hash_wrtbuf(rel, wbuf); else _hash_relbuf(rel, wbuf); /* nothing more to do if we reached the read page */ if (rblkno == wblkno) { if (ndeletable > 0) { /* Delete tuples we already moved off read page */ PageIndexMultiDelete(rpage, deletable, ndeletable); _hash_wrtbuf(rel, rbuf); } else _hash_relbuf(rel, rbuf); return; } wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); wbuf_dirty = false; } /* * we have found room so insert on the "write" page, being careful * to preserve hashkey ordering. (If we insert many tuples into * the same "write" page it would be worth qsort'ing instead of * doing repeated _hash_pgaddtup.) */ (void) _hash_pgaddtup(rel, wbuf, itemsz, itup); wbuf_dirty = true; /* remember tuple for deletion from "read" page */ deletable[ndeletable++] = roffnum; } /* * If we reach here, there are no live tuples on the "read" page --- * it was empty when we got to it, or we moved them all. So we can * just free the page without bothering with deleting tuples * individually. Then advance to the previous "read" page. * * Tricky point here: if our read and write pages are adjacent in the * bucket chain, our write lock on wbuf will conflict with * _hash_freeovflpage's attempt to update the sibling links of the * removed page. However, in that case we are done anyway, so we can * simply drop the write lock before calling _hash_freeovflpage. */ rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); /* are we freeing the page adjacent to wbuf? */ if (rblkno == wblkno) { /* yes, so release wbuf lock first */ if (wbuf_dirty) _hash_wrtbuf(rel, wbuf); else _hash_relbuf(rel, wbuf); /* free this overflow page (releases rbuf) */ _hash_freeovflpage(rel, rbuf, bstrategy); /* done */ return; } /* free this overflow page, then get the previous one */ _hash_freeovflpage(rel, rbuf, bstrategy); rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } /* NOTREACHED */ }
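/*
 * A toy model of the squeeze loop above, to show how the "read" and "write"
 * positions converge.  This is purely illustrative: pages are just counters
 * with an invented fixed capacity, and buffer locking, WAL, hashkey ordering,
 * and the actual on-page layout are all ignored.
 */
#include <stdio.h>

#define CHAIN_LEN   4               /* primary page plus 3 overflow pages */
#define PAGE_CAP    4               /* pretend each page holds 4 tuples */

int
main(void)
{
    int     used[CHAIN_LEN] = {3, 1, 0, 2};     /* tuples per page */
    int     w = 0;                  /* "write" page: front of the chain */
    int     r = CHAIN_LEN - 1;      /* "read" page: end of the chain */

    while (w < r)
    {
        if (used[r] == 0)
        {
            r--;                    /* read page drained: free it, step back */
            continue;
        }
        if (used[w] == PAGE_CAP)
        {
            w++;                    /* write page full: step forward */
            continue;
        }
        used[w]++;                  /* move one tuple toward the front */
        used[r]--;
    }

    for (int i = 0; i < CHAIN_LEN; i++)
        printf("page %d: %d tuples%s\n", i, used[i],
               used[i] == 0 ? "  (empty, would be freed)" : "");
    return 0;
}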
/*---------- * Add an item to a disk page from the sort output. * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. * - on non-leaf pages, the key portion of the first item need not be * stored, we should store only the link. * * A leaf page being built looks like: * * +----------------+---------------------------------+ * | PageHeaderData | linp0 linp1 linp2 ... | * +-----------+----+---------------------------------+ * | ... linpN | | * +-----------+--------------------------------------+ * | ^ last | * | | * +-------------+------------------------------------+ * | | itemN ... | * +-------------+------------------+-----------------+ * | ... item3 item2 item1 | "special space" | * +--------------------------------+-----------------+ * * Contrast this with the diagram in bufpage.h; note the mismatch * between linps and items. This is because we reserve linp0 as a * placeholder for the pointer to the "high key" item; when we have * filled up the page, we will set linp0 to point to itemN and clear * linpN. On the other hand, if we find this is the last (rightmost) * page, we leave the items alone and slide the linp array over. * * 'last' pointer indicates the last offset added to the page. *---------- */ static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) { Page npage; BlockNumber nblkno; OffsetNumber last_off; Size pgspc; Size itupsz; /* * This is a handy place to check for cancel interrupts during the btree * load phase of index creation. */ CHECK_FOR_INTERRUPTS(); npage = state->btps_page; nblkno = state->btps_blkno; last_off = state->btps_lastoff; pgspc = PageGetFreeSpace(npage); itupsz = IndexTupleDSize(*itup); itupsz = MAXALIGN(itupsz); /* * Check whether the item can fit on a btree page at all. (Eventually, we * ought to try to apply TOAST methods if not.) We actually need to be * able to fit three items on every page, so restrict any one item to 1/3 * the per-page available space. Note that at this point, itupsz doesn't * include the ItemId. * * NOTE: similar code appears in _bt_insertonpg() to defend against * oversize items being inserted into an already-existing index. But * during creation of an index, we don't go through there. */ if (itupsz > BTMaxItemSize(npage)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", itupsz, BTMaxItemSize(npage), RelationGetRelationName(wstate->index)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " "or use full text indexing."), errtableconstraint(wstate->heap, RelationGetRelationName(wstate->index)))); /* * Check to see if page is "full". It's definitely full if the item won't * fit. Otherwise, compare to the target freespace derived from the * fillfactor. However, we must put at least two items on each page, so * disregard fillfactor if we don't have that many. */ if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY)) { /* * Finish off the page and write it out. 
*/ Page opage = npage; BlockNumber oblkno = nblkno; ItemId ii; ItemId hii; IndexTuple oitup; /* Create new page of same level */ npage = _bt_blnewpage(state->btps_level); /* and assign it a page position */ nblkno = wstate->btws_pages_alloced++; /* * We copy the last item on the page into the new page, and then * rearrange the old page so that the 'last item' becomes its high key * rather than a true data item. There had better be at least two * items on the page already, else the page would be empty of useful * data. */ Assert(last_off > P_FIRSTKEY); ii = PageGetItemId(opage, last_off); oitup = (IndexTuple) PageGetItem(opage, ii); _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY); /* * Move 'last' into the high key position on opage */ hii = PageGetItemId(opage, P_HIKEY); *hii = *ii; ItemIdSetUnused(ii); /* redundant */ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); /* * Link the old page into its parent, using its minimum key. If we * don't have a parent, we have to create one; this adds a new btree * level. */ if (state->btps_next == NULL) state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); Assert(state->btps_minkey != NULL); ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY); _bt_buildadd(wstate, state->btps_next, state->btps_minkey); pfree(state->btps_minkey); /* * Save a copy of the minimum key for the new page. We have to copy * it off the old page, not the new one, in case we are not at leaf * level. */ state->btps_minkey = CopyIndexTuple(oitup); /* * Set the sibling links for both pages. */ { BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); oopaque->btpo_next = nblkno; nopaque->btpo_prev = oblkno; nopaque->btpo_next = P_NONE; /* redundant */ } /* * Write out the old page. We never need to touch it again, so we can * free the opage workspace too. */ _bt_blwritepage(wstate, opage, oblkno); /* * Reset last_off to point to new page */ last_off = P_FIRSTKEY; } /* * If the new item is the first for its page, stash a copy for later. Note * this will only happen for the first item on a level; on later pages, * the first item for a page is copied from the prior page in the code * above. */ if (last_off == P_HIKEY) { Assert(state->btps_minkey == NULL); state->btps_minkey = CopyIndexTuple(itup); } /* * Add the new item into the current page. */ last_off = OffsetNumberNext(last_off); _bt_sortaddtup(npage, itupsz, itup, last_off); state->btps_page = npage; state->btps_blkno = nblkno; state->btps_lastoff = last_off; }
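/*
 * Rough arithmetic behind the "1/3 of a page" limit checked above.  Every
 * size below is an assumption made for illustration (8192-byte blocks,
 * 24-byte page header, 16-byte btree special area, 4-byte line pointers);
 * the real BTMaxItemSize macro in nbtree.h differs in detail, but the shape
 * is the same: any single tuple must leave room for three items and their
 * line pointers on a page.
 */
#include <stdio.h>
#include <stddef.h>

int
main(void)
{
    size_t      blcksz = 8192;
    size_t      page_header = 24;
    size_t      special = 16;
    size_t      line_pointer = 4;
    size_t      usable = blcksz - page_header - special;
    size_t      per_item = usable / 3 - line_pointer;

    /* round down to 8-byte alignment, since stored sizes are MAXALIGN'd */
    per_item &= ~(size_t) 7;

    printf("approximate maximum index tuple size: %zu bytes\n", per_item);
    return 0;
}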
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket, and compress out any free space in the old * bucket. * * The caller must hold exclusive locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, BlockNumber start_oblkno, BlockNumber start_nblkno, uint32 maxbucket, uint32 highmask, uint32 lowmask) { Bucket bucket; Buffer obuf; Buffer nbuf; BlockNumber oblkno; BlockNumber nblkno; bool null; Datum datum; HashItem hitem; HashPageOpaque oopaque; HashPageOpaque nopaque; IndexTuple itup; Size itemsz; OffsetNumber ooffnum; OffsetNumber noffnum; OffsetNumber omaxoffnum; Page opage; Page npage; TupleDesc itupdesc = RelationGetDescr(rel); /* * It should be okay to simultaneously write-lock pages from each * bucket, since no one else can be trying to acquire buffer lock * on pages of either bucket. */ oblkno = start_oblkno; nblkno = start_nblkno; obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE); opage = BufferGetPage(obuf); npage = BufferGetPage(nbuf); _hash_checkpage(rel, opage, LH_BUCKET_PAGE); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* initialize the new bucket's primary page */ _hash_pageinit(npage, BufferGetPageSize(nbuf)); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = InvalidBlockNumber; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = nbucket; nopaque->hasho_flag = LH_BUCKET_PAGE; nopaque->hasho_filler = HASHO_FILL; /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain * and adding overflow pages to the new bucket as needed. */ ooffnum = FirstOffsetNumber; omaxoffnum = PageGetMaxOffsetNumber(opage); for (;;) { /* * at each iteration through this loop, each of these variables * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum */ /* check if we're at the end of the page */ if (ooffnum > omaxoffnum) { /* at end of page, but check for an(other) overflow page */ oblkno = oopaque->hasho_nextblkno; if (!BlockNumberIsValid(oblkno)) break; /* * we ran out of tuples on this particular page, but we * have more overflow pages; advance to next page. */ _hash_wrtbuf(rel, obuf); obuf = _hash_getbuf(rel, oblkno, HASH_WRITE); opage = BufferGetPage(obuf); _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); ooffnum = FirstOffsetNumber; omaxoffnum = PageGetMaxOffsetNumber(opage); continue; } /* * Re-hash the tuple to determine which bucket it now belongs in. * * It is annoying to call the hash function while holding locks, * but releasing and relocking the page for each tuple is unappealing * too. */ hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum)); itup = &(hitem->hash_itup); datum = index_getattr(itup, 1, itupdesc, &null); Assert(!null); bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum), maxbucket, highmask, lowmask); if (bucket == nbucket) { /* * insert the tuple into the new bucket. 
if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleDSize(hitem->hash_itup) + (sizeof(HashItemData) - sizeof(IndexTupleData)); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpace(npage) < itemsz) { /* write out nbuf and drop lock, but keep pin */ _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf); npage = BufferGetPage(nbuf); _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE); /* we don't need nopaque within the loop */ } noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage)); if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); /* * now delete the tuple from the old bucket. after this * section of code, 'ooffnum' will actually point to the * ItemId to which we would point if we had advanced it before * the deletion (PageIndexTupleDelete repacks the ItemId * array). this also means that 'omaxoffnum' is exactly one * less than it used to be, so we really can just decrement it * instead of calling PageGetMaxOffsetNumber. */ PageIndexTupleDelete(opage, ooffnum); omaxoffnum = OffsetNumberPrev(omaxoffnum); } else { /* * the tuple stays on this page. we didn't move anything, so * we didn't delete anything and therefore we don't have to * change 'omaxoffnum'. */ Assert(bucket == obucket); ooffnum = OffsetNumberNext(ooffnum); } } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Before quitting, call _hash_squeezebucket to ensure the * tuples remaining in the old bucket (including the overflow pages) are * packed as tightly as possible. The new bucket is already tight. */ _hash_wrtbuf(rel, obuf); _hash_wrtbuf(rel, nbuf); _hash_squeezebucket(rel, obucket, start_oblkno); }
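/*
 * The bucket choice above comes from _hash_hashkey2bucket (hashutil.c): mask
 * the hash value with highmask, and if that names a bucket that does not
 * exist yet, fall back to its "parent" bucket via lowmask.  A standalone
 * sketch of that mapping; the mask and maxbucket values in main() are made
 * up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t
hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
               uint32_t highmask, uint32_t lowmask)
{
    uint32_t    bucket = hashkey & highmask;

    if (bucket > maxbucket)
        bucket = bucket & lowmask;  /* bucket not created yet: use parent */

    return bucket;
}

int
main(void)
{
    /*
     * Example state: buckets 0..5 exist, so lowmask = 3 and highmask = 7.
     * Keys that would map to the not-yet-created buckets 6 and 7 land in
     * buckets 2 and 3 instead, until the table is split further.
     */
    uint32_t    maxbucket = 5, highmask = 7, lowmask = 3;

    for (uint32_t key = 0; key < 8; key++)
        printf("hashkey %u -> bucket %u\n",
               (unsigned) key,
               (unsigned) hashkey2bucket(key, maxbucket, highmask, lowmask));
    return 0;
}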
/* * replay move of page contents for squeeze operation of hash index */ static void hash_xlog_move_page_contents(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record); Buffer bucketbuf = InvalidBuffer; Buffer writebuf = InvalidBuffer; Buffer deletebuf = InvalidBuffer; XLogRedoAction action; /* * Ensure we have a cleanup lock on primary bucket page before we start * with the actual replay operation. This is to ensure that neither a * scan can start nor a scan can be already-in-progress during the replay * of this operation. If we allow scans during this operation, then they * can miss some records or show the same record multiple times. */ if (xldata->is_prim_bucket_same_wrt) action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); else { /* * we don't care for return value as the purpose of reading bucketbuf * is to ensure a cleanup lock on primary bucket page. */ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); action = XLogReadBufferForRedo(record, 1, &writebuf); } /* replay the record for adding entries in overflow buffer */ if (action == BLK_NEEDS_REDO) { Page writepage; char *begin; char *data; Size datalen; uint16 ninserted = 0; data = begin = XLogRecGetBlockData(record, 1, &datalen); writepage = (Page) BufferGetPage(writebuf); if (xldata->ntups > 0) { OffsetNumber *towrite = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntups; while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size itemsz; OffsetNumber l; itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); data += itemsz; l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); if (l == InvalidOffsetNumber) elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes", (int) itemsz); ninserted++; } } /* * number of tuples inserted must be same as requested in REDO record. */ Assert(ninserted == xldata->ntups); PageSetLSN(writepage, lsn); MarkBufferDirty(writebuf); } /* replay the record for deleting entries from overflow buffer */ if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) { Page page; char *ptr; Size len; ptr = XLogRecGetBlockData(record, 2, &len); page = (Page) BufferGetPage(deletebuf); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } PageSetLSN(page, lsn); MarkBufferDirty(deletebuf); } /* * Replay is complete, now we can release the buffers. We release locks at * end of replay operation to ensure that we hold lock on primary bucket * page till end of operation. We can optimize by releasing the lock on * write buffer as soon as the operation for same is complete, if it is * not same as primary bucket page, but that doesn't seem to be worth * complicating the code. */ if (BufferIsValid(deletebuf)) UnlockReleaseBuffer(deletebuf); if (BufferIsValid(writebuf)) UnlockReleaseBuffer(writebuf); if (BufferIsValid(bucketbuf)) UnlockReleaseBuffer(bucketbuf); }
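/*
 * The redo loop above assumes block 1's data is laid out as an array of
 * ntups target offsets followed by the tuples themselves, each skipped over
 * by its MAXALIGN'd size.  A standalone sketch of walking such a buffer; the
 * tuple type is a stand-in (a real IndexTuple keeps its length in the low
 * bits of t_info), and the page insertion itself is omitted.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define MAXALIGN(LEN)   (((uintptr_t) (LEN) + 7) & ~(uintptr_t) 7)

typedef uint16_t OffsetNumber;

typedef struct
{
    uint16_t    len;                /* total tuple length, unaligned */
} FakeTuple;

static void
walk_block_data(const char *begin, size_t datalen, int ntups)
{
    const char *data = begin + sizeof(OffsetNumber) * ntups;
    int         ninserted = 0;

    while ((size_t) (data - begin) < datalen)
    {
        FakeTuple       tup;
        OffsetNumber    targ;

        /* copy fields out of the byte stream to avoid alignment assumptions */
        memcpy(&tup, data, sizeof(tup));
        memcpy(&targ, begin + ninserted * sizeof(OffsetNumber), sizeof(targ));
        printf("tuple %d: %u bytes, goes to offset %u\n",
               ninserted, (unsigned) tup.len, (unsigned) targ);
        data += MAXALIGN(tup.len);
        ninserted++;
    }
}

int
main(void)
{
    /* build a little buffer: two offsets, then two 10-byte "tuples" */
    char            buf[64];
    OffsetNumber    offs[2] = {1, 2};
    FakeTuple       t = {10};
    size_t          pos = 0;

    memcpy(buf + pos, offs, sizeof(offs));
    pos += sizeof(offs);
    memcpy(buf + pos, &t, sizeof(t));
    pos += MAXALIGN(t.len);
    memcpy(buf + pos, &t, sizeof(t));
    pos += MAXALIGN(t.len);

    walk_block_data(buf, pos, 2);
    return 0;
}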
/* * replay squeeze page operation of hash index */ static void hash_xlog_squeeze_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); Buffer bucketbuf = InvalidBuffer; Buffer writebuf; Buffer ovflbuf; Buffer prevbuf = InvalidBuffer; Buffer mapbuf; XLogRedoAction action; /* * Ensure we have a cleanup lock on primary bucket page before we start * with the actual replay operation. This is to ensure that neither a * scan can start nor a scan can be already-in-progress during the replay * of this operation. If we allow scans during this operation, then they * can miss some records or show the same record multiple times. */ if (xldata->is_prim_bucket_same_wrt) action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); else { /* * we don't care for return value as the purpose of reading bucketbuf * is to ensure a cleanup lock on primary bucket page. */ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); action = XLogReadBufferForRedo(record, 1, &writebuf); } /* replay the record for adding entries in overflow buffer */ if (action == BLK_NEEDS_REDO) { Page writepage; char *begin; char *data; Size datalen; uint16 ninserted = 0; data = begin = XLogRecGetBlockData(record, 1, &datalen); writepage = (Page) BufferGetPage(writebuf); if (xldata->ntups > 0) { OffsetNumber *towrite = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntups; while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size itemsz; OffsetNumber l; itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); data += itemsz; l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); if (l == InvalidOffsetNumber) elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", (int) itemsz); ninserted++; } } /* * number of tuples inserted must be same as requested in REDO record. */ Assert(ninserted == xldata->ntups); /* * if the page on which we are adding tuples is the page previous to the * freed overflow page, then update its nextblkno.
*/ if (xldata->is_prev_bucket_same_wrt) { HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage); writeopaque->hasho_nextblkno = xldata->nextblkno; } PageSetLSN(writepage, lsn); MarkBufferDirty(writebuf); } /* replay the record for initializing overflow buffer */ if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) { Page ovflpage; ovflpage = BufferGetPage(ovflbuf); _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); PageSetLSN(ovflpage, lsn); MarkBufferDirty(ovflbuf); } if (BufferIsValid(ovflbuf)) UnlockReleaseBuffer(ovflbuf); /* replay the record for page previous to the freed overflow page */ if (!xldata->is_prev_bucket_same_wrt && XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) { Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); prevopaque->hasho_nextblkno = xldata->nextblkno; PageSetLSN(prevpage, lsn); MarkBufferDirty(prevbuf); } if (BufferIsValid(prevbuf)) UnlockReleaseBuffer(prevbuf); /* replay the record for page next to the freed overflow page */ if (XLogRecHasBlockRef(record, 4)) { Buffer nextbuf; if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) { Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); nextopaque->hasho_prevblkno = xldata->prevblkno; PageSetLSN(nextpage, lsn); MarkBufferDirty(nextbuf); } if (BufferIsValid(nextbuf)) UnlockReleaseBuffer(nextbuf); } if (BufferIsValid(writebuf)) UnlockReleaseBuffer(writebuf); if (BufferIsValid(bucketbuf)) UnlockReleaseBuffer(bucketbuf); /* * Note: in normal operation, we'd update the bitmap and meta page while * still holding lock on the primary bucket page and overflow pages. But * during replay it's not necessary to hold those locks, since no other * index updates can be happening concurrently. */ /* replay the record for bitmap page */ if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) { Page mappage = (Page) BufferGetPage(mapbuf); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; Size datalen; freep = HashPageGetBitmap(mappage); data = XLogRecGetBlockData(record, 5, &datalen); bitmap_page_bit = (uint32 *) data; CLRBIT(freep, *bitmap_page_bit); PageSetLSN(mappage, lsn); MarkBufferDirty(mapbuf); } if (BufferIsValid(mapbuf)) UnlockReleaseBuffer(mapbuf); /* replay the record for meta page */ if (XLogRecHasBlockRef(record, 6)) { Buffer metabuf; if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) { HashMetaPage metap; Page page; char *data; uint32 *firstfree_ovflpage; Size datalen; data = XLogRecGetBlockData(record, 6, &datalen); firstfree_ovflpage = (uint32 *) data; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_firstfree = *firstfree_ovflpage; PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); } }
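/*
 * The bitmap-page redo above boils down to clearing one bit in an array of
 * 32-bit words.  This is only the general shape of that operation; the
 * server's actual CLRBIT macro and bitmap layout are defined in hash.h and
 * are not reproduced here.
 */
#include <stdio.h>
#include <stdint.h>

#define BITS_PER_WORD   32

static void
clear_bit(uint32_t *bitmap, uint32_t bit)
{
    bitmap[bit / BITS_PER_WORD] &= ~((uint32_t) 1 << (bit % BITS_PER_WORD));
}

int
main(void)
{
    uint32_t    freep[2] = {0xffffffff, 0xffffffff};

    clear_bit(freep, 3);            /* clear bit 3 in the first word */
    clear_bit(freep, 33);           /* bit 1 of the second word */
    printf("word 0 = 0x%08x, word 1 = 0x%08x\n",
           (unsigned) freep[0], (unsigned) freep[1]);
    return 0;
}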
/* * _hash_squeezebucket(rel, bucket) * * Try to squeeze the tuples onto pages occurring earlier in the * bucket chain in an attempt to free overflow pages. When we start * the "squeezing", the page from which we start taking tuples (the * "read" page) is the last bucket in the bucket chain and the page * onto which we start squeezing tuples (the "write" page) is the * first page in the bucket chain. The read page works backward and * the write page works forward; the procedure terminates when the * read page and write page are the same page. * * At completion of this procedure, it is guaranteed that all pages in * the bucket are nonempty, unless the bucket is totally empty (in * which case all overflow pages will be freed). The original implementation * required that to be true on entry as well, but it's a lot easier for * callers to leave empty overflow pages and let this guy clean it up. * * Caller must acquire cleanup lock on the primary page of the target * bucket to exclude any scans that are in progress, which could easily * be confused into returning the same tuple more than once or some tuples * not at all by the rearrangement we are performing here. To prevent * any concurrent scan from crossing the squeeze scan we use lock chaining * similar to hashbucketcleanup. Refer to the comments atop hashbucketcleanup. * * We need to retain a pin on the primary bucket to ensure that no concurrent * split can start. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. */ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, BufferAccessStrategy bstrategy) { BlockNumber wblkno; BlockNumber rblkno; Buffer wbuf; Buffer rbuf; Page wpage; Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; /* * start squeezing into the primary bucket page. */ wblkno = bucket_blkno; wbuf = bucket_buf; wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); /* * if there aren't any overflow pages, there's nothing to squeeze. caller * is responsible for releasing the pin on primary bucket page. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) { LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); return; } /* * Find the last page in the bucket chain by starting at the base bucket * page and working forward. Note: we assume that a hash bucket chain is * usually smaller than the buffer ring being used by VACUUM, else using * the access strategy here would be counterproductive. */ rbuf = InvalidBuffer; ropaque = wopaque; do { rblkno = ropaque->hasho_nextblkno; if (rbuf != InvalidBuffer) _hash_relbuf(rel, rbuf); rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); /* * squeeze the tuples.
*/ for (;;) { OffsetNumber roffnum; OffsetNumber maxroffnum; OffsetNumber deletable[MaxOffsetNumber]; IndexTuple itups[MaxIndexTuplesPerPage]; Size tups_size[MaxIndexTuplesPerPage]; OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; uint16 ndeletable = 0; uint16 nitups = 0; Size all_tups_size = 0; int i; bool retain_pin = false; readpage: /* Scan each tuple in "read" page */ maxroffnum = PageGetMaxOffsetNumber(rpage); for (roffnum = FirstOffsetNumber; roffnum <= maxroffnum; roffnum = OffsetNumberNext(roffnum)) { IndexTuple itup; Size itemsz; /* skip dead tuples */ if (ItemIdIsDead(PageGetItemId(rpage, roffnum))) continue; itup = (IndexTuple) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* * Walk up the bucket chain, looking for a page big enough for * this item and all other accumulated items. Exit if we reach * the read page. */ while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz)) { Buffer next_wbuf = InvalidBuffer; bool tups_moved = false; Assert(!PageIsEmpty(wpage)); if (wblkno == bucket_blkno) retain_pin = true; wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); /* don't need to move to next page if we reached the read page */ if (wblkno != rblkno) next_wbuf = _hash_getbuf_with_strategy(rel, wblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); if (nitups > 0) { Assert(nitups == ndeletable); /* * This operation needs to log multiple tuples, prepare * WAL for that. */ if (RelationNeedsWAL(rel)) XLogEnsureRecordSpace(0, 3 + nitups); START_CRIT_SECTION(); /* * we have to insert tuples on the "write" page, being * careful to preserve hashkey ordering. (If we insert * many tuples into the same "write" page it would be * worth qsort'ing them). */ _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); MarkBufferDirty(wbuf); /* Delete tuples we already moved off read page */ PageIndexMultiDelete(rpage, deletable, ndeletable); MarkBufferDirty(rbuf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; xl_hash_move_page_contents xlrec; xlrec.ntups = nitups; xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents); /* * bucket buffer needs to be registered to ensure that * we can acquire a cleanup lock on it during replay. 
*/ if (!xlrec.is_prim_bucket_same_wrt) XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); XLogRegisterBufData(1, (char *) itup_offsets, nitups * sizeof(OffsetNumber)); for (i = 0; i < nitups; i++) XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); XLogRegisterBufData(2, (char *) deletable, ndeletable * sizeof(OffsetNumber)); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS); PageSetLSN(BufferGetPage(wbuf), recptr); PageSetLSN(BufferGetPage(rbuf), recptr); } END_CRIT_SECTION(); tups_moved = true; } /* * release the lock on previous page after acquiring the lock * on next page */ if (retain_pin) LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, wbuf); /* nothing more to do if we reached the read page */ if (rblkno == wblkno) { _hash_relbuf(rel, rbuf); return; } wbuf = next_wbuf; wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); retain_pin = false; /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); nitups = 0; all_tups_size = 0; ndeletable = 0; /* * after moving the tuples, rpage would have been compacted, * so we need to rescan it. */ if (tups_moved) goto readpage; } /* remember tuple for deletion from "read" page */ deletable[ndeletable++] = roffnum; /* * we need a copy of index tuples as they can be freed as part of * overflow page, however we need them to write a WAL record in * _hash_freeovflpage. */ itups[nitups] = CopyIndexTuple(itup); tups_size[nitups++] = itemsz; all_tups_size += itemsz; } /* * If we reach here, there are no live tuples on the "read" page --- * it was empty when we got to it, or we moved them all. So we can * just free the page without bothering with deleting tuples * individually. Then advance to the previous "read" page. * * Tricky point here: if our read and write pages are adjacent in the * bucket chain, our write lock on wbuf will conflict with * _hash_freeovflpage's attempt to update the sibling links of the * removed page. In that case, we don't need to lock it again. */ rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); /* free this overflow page (releases rbuf) */ _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets, tups_size, nitups, bstrategy); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); /* are we freeing the page adjacent to wbuf? */ if (rblkno == wblkno) { /* retain the pin on primary bucket page till end of bucket scan */ if (wblkno == bucket_blkno) LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, wbuf); return; } rbuf = _hash_getbuf_with_strategy(rel, rblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } /* NOTREACHED */ }
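/*
 * The batching test above asks whether the write page can absorb all of the
 * accumulated tuples plus the next one, charging one line pointer per tuple.
 * A rough standalone stand-in for that accounting; the 4-byte line-pointer
 * size is an assumption, and the real PageGetFreeSpaceForMultipleTuples
 * computes the raw free space from the page's pd_lower/pd_upper.
 */
#include <stdio.h>
#include <stddef.h>

#define LINE_POINTER_SZ 4

static size_t
free_space_for_tuples(size_t raw_free, int ntups)
{
    size_t      reserved = (size_t) ntups * LINE_POINTER_SZ;

    return (raw_free > reserved) ? raw_free - reserved : 0;
}

int
main(void)
{
    size_t      raw_free = 200;         /* free space on the write page */
    size_t      all_tups_size = 144;    /* accumulated MAXALIGN'd tuples */
    size_t      next_itemsz = 48;       /* the tuple we are about to add */
    int         nitups = 3;

    if (free_space_for_tuples(raw_free, nitups + 1) <
        all_tups_size + next_itemsz)
        printf("write page is full: flush the batch and advance\n");
    else
        printf("keep accumulating on this write page\n");
    return 0;
}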
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket, and compress out any free space in the old * bucket. * * The caller must hold exclusive locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, BlockNumber start_oblkno, BlockNumber start_nblkno, uint32 maxbucket, uint32 highmask, uint32 lowmask) { BlockNumber oblkno; BlockNumber nblkno; Buffer obuf; Buffer nbuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; /* * It should be okay to simultaneously write-lock pages from each bucket, * since no one else can be trying to acquire buffer lock on pages of * either bucket. */ oblkno = start_oblkno; obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); nblkno = start_nblkno; nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM); npage = BufferGetPage(nbuf); /* initialize the new bucket's primary page */ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = InvalidBlockNumber; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = nbucket; nopaque->hasho_flag = LH_BUCKET_PAGE; nopaque->hasho_page_id = HASHO_PAGE_ID; /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain and * adding overflow pages to the new bucket as needed. Outer loop iterates * once per page in old bucket. */ for (;;) { OffsetNumber ooffnum; OffsetNumber omaxoffnum; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); for (ooffnum = FirstOffsetNumber; ooffnum <= omaxoffnum; ooffnum = OffsetNumberNext(ooffnum)) { IndexTuple itup; Size itemsz; Bucket bucket; /* * Fetch the item's hash key (conveniently stored in the item) and * determine which bucket it now belongs in. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) { /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpace(npage) < itemsz) { /* write out nbuf and drop lock, but keep pin */ _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK); /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf); npage = BufferGetPage(nbuf); /* we don't need nblkno or nopaque within the loop */ } /* * Insert tuple on new page, using _hash_pgaddtup to ensure * correct ordering by hashkey. This is a tad inefficient * since we may have to shuffle itempointers repeatedly. * Possible future improvement: accumulate all the items for * the new page and qsort them before insertion. */ (void) _hash_pgaddtup(rel, nbuf, itemsz, itup); /* * Mark tuple for deletion from old page. 
*/ deletable[ndeletable++] = ooffnum; } else { /* * the tuple stays on this page, so nothing to do. */ Assert(bucket == obucket); } } oblkno = oopaque->hasho_nextblkno; /* * Done scanning this old page. If we moved any tuples, delete them * from the old page. */ if (ndeletable > 0) { PageIndexMultiDelete(opage, deletable, ndeletable); _hash_wrtbuf(rel, obuf); } else _hash_relbuf(rel, obuf); /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) break; /* Else, advance to next old page */ obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Before quitting, call _hash_squeezebucket to ensure the * tuples remaining in the old bucket (including the overflow pages) are * packed as tightly as possible. The new bucket is already tight. */ _hash_wrtbuf(rel, nbuf); _hash_squeezebucket(rel, obucket, start_oblkno, NULL); }
/* * _hash_squeezebucket(rel, bucket) * * Try to squeeze the tuples onto pages occurring earlier in the * bucket chain in an attempt to free overflow pages. When we start * the "squeezing", the page from which we start taking tuples (the * "read" page) is the last bucket in the bucket chain and the page * onto which we start squeezing tuples (the "write" page) is the * first page in the bucket chain. The read page works backward and * the write page works forward; the procedure terminates when the * read page and write page are the same page. * * At completion of this procedure, it is guaranteed that all pages in * the bucket are nonempty, unless the bucket is totally empty (in * which case all overflow pages will be freed). The original implementation * required that to be true on entry as well, but it's a lot easier for * callers to leave empty overflow pages and let this guy clean it up. * * Caller must hold exclusive lock on the target bucket. This allows * us to safely lock multiple pages in the bucket. */ void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno) { Buffer wbuf; Buffer rbuf = 0; BlockNumber wblkno; BlockNumber rblkno; Page wpage; Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; OffsetNumber woffnum; OffsetNumber roffnum; IndexTuple itup; Size itemsz; /* * start squeezing into the base bucket page. */ wblkno = bucket_blkno; wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE); _hash_checkpage(rel, wbuf, LH_BUCKET_PAGE); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); /* * if there aren't any overflow pages, there's nothing to squeeze. */ if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) { _hash_relbuf(rel, wbuf); return; } /* * find the last page in the bucket chain by starting at the base bucket * page and working forward. */ ropaque = wopaque; do { rblkno = ropaque->hasho_nextblkno; if (ropaque != wopaque) _hash_relbuf(rel, rbuf); rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE); _hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); } while (BlockNumberIsValid(ropaque->hasho_nextblkno)); /* * squeeze the tuples. */ roffnum = FirstOffsetNumber; for (;;) { /* this test is needed in case page is empty on entry */ if (roffnum <= PageGetMaxOffsetNumber(rpage)) { itup = (IndexTuple) PageGetItem(rpage, PageGetItemId(rpage, roffnum)); itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* * Walk up the bucket chain, looking for a page big enough for * this item. Exit if we reach the read page. */ while (PageGetFreeSpace(wpage) < itemsz) { Assert(!PageIsEmpty(wpage)); wblkno = wopaque->hasho_nextblkno; Assert(BlockNumberIsValid(wblkno)); _hash_wrtbuf(rel, wbuf); if (rblkno == wblkno) { /* wbuf is already released */ _hash_wrtbuf(rel, rbuf); return; } wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE); _hash_checkpage(rel, wbuf, LH_OVERFLOW_PAGE); wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); } /* * we have found room so insert on the "write" page. */ woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage)); if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(rel)); /* * delete the tuple from the "read" page. PageIndexTupleDelete * repacks the ItemId array, so 'roffnum' will be "advanced" to * the "next" ItemId. 
*/ PageIndexTupleDelete(rpage, roffnum); } /* * if the "read" page is now empty because of the deletion (or because * it was empty when we got to it), free it. * * Tricky point here: if our read and write pages are adjacent in the * bucket chain, our write lock on wbuf will conflict with * _hash_freeovflpage's attempt to update the sibling links of the * removed page. However, in that case we are done anyway, so we can * simply drop the write lock before calling _hash_freeovflpage. */ if (PageIsEmpty(rpage)) { rblkno = ropaque->hasho_prevblkno; Assert(BlockNumberIsValid(rblkno)); /* are we freeing the page adjacent to wbuf? */ if (rblkno == wblkno) { /* yes, so release wbuf lock first */ _hash_wrtbuf(rel, wbuf); /* free this overflow page (releases rbuf) */ _hash_freeovflpage(rel, rbuf); /* done */ return; } /* free this overflow page, then get the previous one */ _hash_freeovflpage(rel, rbuf); rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE); _hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE); rpage = BufferGetPage(rbuf); ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage); Assert(ropaque->hasho_bucket == bucket); roffnum = FirstOffsetNumber; } } /* NOTREACHED */ }
/* * _hash_doinsert() -- Handle insertion of a single index tuple. * * This routine is called by the public interface routines, hashbuild * and hashinsert. By here, itup is completely filled in. */ void _hash_doinsert(Relation rel, IndexTuple itup) { Buffer buf; Buffer metabuf; HashMetaPage metap; BlockNumber blkno; Page page; HashPageOpaque pageopaque; Size itemsz; bool do_expand; uint32 hashkey; Bucket bucket; /* * Get the hash key for the item (it's stored in the index tuple itself). */ hashkey = _hash_get_indextuple_hashkey(itup); /* compute item size too */ itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ /* * Acquire shared split lock so we can compute the target bucket safely * (see README). */ _hash_getlock(rel, 0, HASH_SHARE); /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check whether the item can fit on a hash page at all. (Eventually, we * ought to try to apply TOAST methods if not.) Note that at this point, * itemsz doesn't include the ItemId. * * XXX this is useless code if we are only storing hash keys. */ if (itemsz > HashMaxItemSize((Page) metap)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds hash maximum %lu", (unsigned long) itemsz, (unsigned long) HashMaxItemSize((Page) metap)), errhint("Values larger than a buffer page cannot be indexed."))); /* * Compute the target bucket number, and convert to block number. */ bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); blkno = BUCKET_TO_BLKNO(metap, bucket); /* release lock on metapage, but keep pin since we'll need it again */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); /* * Acquire share lock on target bucket; then we can release split lock. */ _hash_getlock(rel, blkno, HASH_SHARE); _hash_droplock(rel, 0, HASH_SHARE); /* Fetch the primary bucket page for the bucket */ buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE); page = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(pageopaque->hasho_bucket == bucket); /* Do the insertion */ while (PageGetFreeSpace(page) < itemsz) { /* * no space on this page; check for an overflow page */ BlockNumber nextblkno = pageopaque->hasho_nextblkno; if (BlockNumberIsValid(nextblkno)) { /* * ovfl page exists; go get it. if it doesn't have room, we'll * find out next pass through the loop test above. */ _hash_relbuf(rel, buf); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); page = BufferGetPage(buf); } else { /* * we're at the end of the bucket chain and we haven't found a * page with enough room. allocate a new overflow page. 
*/ /* release our write lock without modifying buffer */ _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); /* chain to a new overflow page */ buf = _hash_addovflpage(rel, metabuf, buf); page = BufferGetPage(buf); /* should fit now, given test above */ Assert(PageGetFreeSpace(page) >= itemsz); } pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE); Assert(pageopaque->hasho_bucket == bucket); } /* found page with enough space, so add the item here */ (void) _hash_pgaddtup(rel, buf, itemsz, itup); /* write and release the modified page */ _hash_wrtbuf(rel, buf); /* We can drop the bucket lock now */ _hash_droplock(rel, blkno, HASH_SHARE); /* * Write-lock the metapage so we can increment the tuple count. After * incrementing it, check to see if it's time for a split. */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ do_expand = metap->hashm_ntuples > (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* Attempt to split if a split is needed */ if (do_expand) _hash_expandtable(rel, metabuf); /* Finally drop our pin on the metapage */ _hash_dropbuf(rel, metabuf); }
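/*
 * The split trigger above compares the tuple count against the fill factor
 * times the number of buckets.  A small numeric sketch of that test; the
 * values are invented, and hashm_ffactor is treated here simply as "target
 * tuples per bucket".
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

int
main(void)
{
    double      ntuples = 24500;    /* hashm_ntuples after the insert */
    uint32_t    ffactor = 80;       /* hashm_ffactor */
    uint32_t    maxbucket = 299;    /* highest bucket number in use */
    bool        do_expand;

    /* same shape as the test in _hash_doinsert() */
    do_expand = ntuples > (double) ffactor * (maxbucket + 1);

    printf("%.0f tuples across %u buckets (target %u per bucket): %s\n",
           ntuples, (unsigned) (maxbucket + 1), (unsigned) ffactor,
           do_expand ? "attempt a split" : "no split needed");
    return 0;
}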
/* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * * This routine is used to partition the tuples between old and new bucket and * is used to finish the incomplete split operations. To finish the previously * interrupted split operation, the caller needs to fill htab. If htab is set, * then we skip the movement of tuples that exists in htab, otherwise NULL * value of htab indicates movement of all the tuples that belong to the new * bucket. * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that * belong in the new bucket. * * The caller must hold cleanup locks on both buckets to ensure that * no one else is trying to access them (see README). * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. (The metapage is only * touched if it becomes necessary to add or remove overflow pages.) * * Split needs to retain pin on primary bucket pages of both old and new * buckets till end of operation. This is to prevent vacuum from starting * while a split is in progress. * * In addition, the caller must have created the new bucket's base page, * which is passed in buffer nbuf, pinned and write-locked. The lock will be * released here and pin must be released by the caller. (The API is set up * this way because we must do _hash_getnewbuf() before releasing the metapage * write lock. So instead of passing the new bucket's start block number, we * pass an actual buffer.) */ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, Buffer obuf, Buffer nbuf, HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask) { Buffer bucket_obuf; Buffer bucket_nbuf; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; IndexTuple itups[MaxIndexTuplesPerPage]; Size all_tups_size = 0; int i; uint16 nitups = 0; bucket_obuf = obuf; opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); bucket_nbuf = nbuf; npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); /* * Partition the tuples in the old bucket between the old bucket and the * new bucket, advancing along the old bucket's overflow bucket chain and * adding overflow pages to the new bucket as needed. Outer loop iterates * once per page in old bucket. */ for (;;) { BlockNumber oblkno; OffsetNumber ooffnum; OffsetNumber omaxoffnum; /* Scan each tuple in old page */ omaxoffnum = PageGetMaxOffsetNumber(opage); for (ooffnum = FirstOffsetNumber; ooffnum <= omaxoffnum; ooffnum = OffsetNumberNext(ooffnum)) { IndexTuple itup; Size itemsz; Bucket bucket; bool found = false; /* skip dead tuples */ if (ItemIdIsDead(PageGetItemId(opage, ooffnum))) continue; /* * Before inserting a tuple, probe the hash table containing TIDs * of tuples belonging to new bucket, if we find a match, then * skip that tuple, else fetch the item's hash key (conveniently * stored in the item) and determine which bucket it now belongs * in. */ itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum)); if (htab) (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); if (found) continue; bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), maxbucket, highmask, lowmask); if (bucket == nbucket) { IndexTuple new_itup; /* * make a copy of index tuple as we have to scribble on it. 
*/ new_itup = CopyIndexTuple(itup); /* * mark the index tuple as moved by split, such tuples are * skipped by scan if there is split in progress for a bucket. */ new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; /* * insert the tuple into the new bucket. if it doesn't fit on * the current page in the new bucket, we must allocate a new * overflow page and place the tuple on that page instead. */ itemsz = IndexTupleDSize(*new_itup); itemsz = MAXALIGN(itemsz); if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) { /* * Change the shared buffer state in critical section, * otherwise any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); /* drop lock, but keep pin */ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); nitups = 0; all_tups_size = 0; /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false); npage = BufferGetPage(nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); } itups[nitups++] = new_itup; all_tups_size += itemsz; } else { /* * the tuple stays on this page, so nothing to do. */ Assert(bucket == obucket); } } oblkno = oopaque->hasho_nextblkno; /* retain the pin on the old primary bucket */ if (obuf == bucket_obuf) LockBuffer(obuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, obuf); /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) { /* * Change the shared buffer state in critical section, otherwise * any error could make it unrecoverable. */ START_CRIT_SECTION(); _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); MarkBufferDirty(nbuf); /* log the split operation before releasing the lock */ log_split_page(rel, nbuf); END_CRIT_SECTION(); if (nbuf == bucket_nbuf) LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, nbuf); /* be tidy */ for (i = 0; i < nitups; i++) pfree(itups[i]); break; } /* Else, advance to next old page */ obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); opage = BufferGetPage(obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); } /* * We're at the end of the old bucket chain, so we're done partitioning * the tuples. Mark the old and new buckets to indicate split is * finished. * * To avoid deadlocks due to locking order of buckets, first lock the old * bucket and then the new bucket. */ LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); opage = BufferGetPage(bucket_obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE); npage = BufferGetPage(bucket_nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); START_CRIT_SECTION(); oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; /* * After the split is finished, mark the old bucket to indicate that it * contains deletable tuples. We will clear split-cleanup flag after * deleting such tuples either at the end of split or at the next split * from old bucket or at the time of vacuum. */ oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP; /* * now write the buffers, here we don't release the locks as caller is * responsible to release locks. 
*/ MarkBufferDirty(bucket_obuf); MarkBufferDirty(bucket_nbuf); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; xl_hash_split_complete xlrec; xlrec.old_bucket_flag = oopaque->hasho_flag; xlrec.new_bucket_flag = nopaque->hasho_flag; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); PageSetLSN(BufferGetPage(bucket_obuf), recptr); PageSetLSN(BufferGetPage(bucket_nbuf), recptr); } END_CRIT_SECTION(); /* * If possible, clean up the old bucket. We might not be able to do this * if someone else has a pin on it, but if not then we can go ahead. This * isn't absolutely necessary, but it reduces bloat; if we don't do it * now, VACUUM will do it eventually, but maybe not until new overflow * pages have been allocated. Note that there's no need to clean up the * new bucket. */ if (IsBufferCleanupOK(bucket_obuf)) { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); hashbucketcleanup(rel, obucket, bucket_obuf, BufferGetBlockNumber(bucket_obuf), NULL, maxbucket, highmask, lowmask, NULL, NULL, true, NULL, NULL); } else { LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK); } }
/* * _hash_doinsert() -- Handle insertion of a single HashItem in the table. * * This routine is called by the public interface routines, hashbuild * and hashinsert. By here, hashitem is completely filled in. * The datum to be used as a "key" is in the hashitem. */ InsertIndexResult _hash_doinsert(Relation rel, HashItem hitem) { Buffer buf; Buffer metabuf; HashMetaPage metap; IndexTuple itup; BlockNumber itup_blkno; OffsetNumber itup_off; InsertIndexResult res; BlockNumber blkno; Page page; HashPageOpaque pageopaque; Size itemsz; bool do_expand; uint32 hashkey; Bucket bucket; Datum datum; bool isnull; /* * Compute the hash key for the item. We do this first so as not to * need to hold any locks while running the hash function. */ itup = &(hitem->hash_itup); if (rel->rd_rel->relnatts != 1) elog(ERROR, "hash indexes support only one index key"); datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull); Assert(!isnull); hashkey = _hash_datum2hashkey(rel, datum); /* compute item size too */ itemsz = IndexTupleDSize(hitem->hash_itup) + (sizeof(HashItemData) - sizeof(IndexTupleData)); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but * we need to be consistent */ /* * Acquire shared split lock so we can compute the target bucket * safely (see README). */ _hash_getlock(rel, 0, HASH_SHARE); /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ); metap = (HashMetaPage) BufferGetPage(metabuf); _hash_checkpage(rel, (Page) metap, LH_META_PAGE); /* * Check whether the item can fit on a hash page at all. (Eventually, * we ought to try to apply TOAST methods if not.) Note that at this * point, itemsz doesn't include the ItemId. */ if (itemsz > HashMaxItemSize((Page) metap)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds hash maximum %lu", (unsigned long) itemsz, (unsigned long) HashMaxItemSize((Page) metap)))); /* * Compute the target bucket number, and convert to block number. */ bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); blkno = BUCKET_TO_BLKNO(metap, bucket); /* release lock on metapage, but keep pin since we'll need it again */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); /* * Acquire share lock on target bucket; then we can release split lock. */ _hash_getlock(rel, blkno, HASH_SHARE); _hash_droplock(rel, 0, HASH_SHARE); /* Fetch the primary bucket page for the bucket */ buf = _hash_getbuf(rel, blkno, HASH_WRITE); page = BufferGetPage(buf); _hash_checkpage(rel, page, LH_BUCKET_PAGE); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(pageopaque->hasho_bucket == bucket); /* Do the insertion */ while (PageGetFreeSpace(page) < itemsz) { /* * no space on this page; check for an overflow page */ BlockNumber nextblkno = pageopaque->hasho_nextblkno; if (BlockNumberIsValid(nextblkno)) { /* * ovfl page exists; go get it. if it doesn't have room, * we'll find out next pass through the loop test above. */ _hash_relbuf(rel, buf); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE); page = BufferGetPage(buf); } else { /* * we're at the end of the bucket chain and we haven't found a * page with enough room. allocate a new overflow page. 
             */

            /* release our write lock without modifying buffer */
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf);
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        _hash_checkpage(rel, page, LH_OVERFLOW_PAGE);
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /* found page with enough space, so add the item here */
    itup_off = _hash_pgaddtup(rel, buf, itemsz, hitem);
    itup_blkno = BufferGetBlockNumber(buf);

    /* write and release the modified page */
    _hash_wrtbuf(rel, buf);

    /* We can drop the bucket lock now */
    _hash_droplock(rel, blkno, HASH_SHARE);

    /*
     * Write-lock the metapage so we can increment the tuple count.
     * After incrementing it, check to see if it's time for a split.
     */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Attempt to split if a split is needed */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);

    /* Create the return data structure */
    res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));

    ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);

    return res;
}
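/*
 * A minimal sketch of the split-decision arithmetic used above (and kept in
 * sync with _hash_expandtable): a split is requested once the tuple count
 * exceeds the fill factor times the number of buckets currently in use.
 * The helper name is hypothetical; the real code computes this inline while
 * holding the metapage write lock.
 */
static inline bool
hash_should_expand(double ntuples, uint16 ffactor, uint32 maxbucket)
{
    /* maxbucket is the highest bucket number, so bucket count is maxbucket + 1 */
    return ntuples > (double) ffactor * (maxbucket + 1);
}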
/*
 *  _hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *      This routine is called by the public interface routines, hashbuild
 *      and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup)
{
    Buffer      buf = InvalidBuffer;
    Buffer      bucket_buf;
    Buffer      metabuf;
    HashMetaPage metap;
    BlockNumber blkno;
    BlockNumber oldblkno;
    bool        retry;
    Page        page;
    HashPageOpaque pageopaque;
    Size        itemsz;
    bool        do_expand;
    uint32      hashkey;
    Bucket      bucket;
    uint32      maxbucket;
    uint32      highmask;
    uint32      lowmask;

    /*
     * Get the hash key for the item (it's stored in the index tuple itself).
     */
    hashkey = _hash_get_indextuple_hashkey(itup);

    /* compute item size too */
    itemsz = IndexTupleDSize(*itup);
    itemsz = MAXALIGN(itemsz);  /* be safe, PageAddItem will do this but we
                                 * need to be consistent */

restart_insert:
    /* Read the metapage */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /*
     * Check whether the item can fit on a hash page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.)  Note that at this point,
     * itemsz doesn't include the ItemId.
     *
     * XXX this is useless code if we are only storing hash keys.
     */
    if (itemsz > HashMaxItemSize((Page) metap))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row size %zu exceeds hash maximum %zu",
                        itemsz, HashMaxItemSize((Page) metap)),
                 errhint("Values larger than a buffer page cannot be indexed.")));

    oldblkno = InvalidBlockNumber;
    retry = false;

    /*
     * Loop until we get a lock on the correct target bucket.
     */
    for (;;)
    {
        /*
         * Compute the target bucket number, and convert to block number.
         */
        bucket = _hash_hashkey2bucket(hashkey,
                                      metap->hashm_maxbucket,
                                      metap->hashm_highmask,
                                      metap->hashm_lowmask);

        blkno = BUCKET_TO_BLKNO(metap, bucket);

        /*
         * Copy bucket mapping info now; refer to the comment in
         * _hash_expandtable where we copy this information before calling
         * _hash_splitbucket to see why this is okay.
         */
        maxbucket = metap->hashm_maxbucket;
        highmask = metap->hashm_highmask;
        lowmask = metap->hashm_lowmask;

        /* Release metapage lock, but keep pin. */
        _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

        /*
         * If the previous iteration of this loop locked the primary page of
         * what is still the correct target bucket, we are done.  Otherwise,
         * drop any old lock before acquiring the new one.
         */
        if (retry)
        {
            if (oldblkno == blkno)
                break;
            _hash_relbuf(rel, buf);
        }

        /* Fetch and lock the primary bucket page for the target bucket */
        buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);

        /*
         * Reacquire metapage lock and check that no bucket split has taken
         * place while we were awaiting the bucket lock.
         */
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
        oldblkno = blkno;
        retry = true;
    }

    /* remember the primary bucket buffer to release the pin on it at end. */
    bucket_buf = buf;

    page = BufferGetPage(buf);
    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    Assert(pageopaque->hasho_bucket == bucket);

    /*
     * If this bucket is in the process of being split, try to finish the
     * split before inserting, because that might create room for the
     * insertion to proceed without allocating an additional overflow page.
     * It's only interesting to finish the split if we're trying to insert
     * into the bucket from which we're removing tuples (the "old" bucket),
     * not if we're trying to insert into the bucket into which tuples are
     * being moved (the "new" bucket).
     */
    if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
    {
        /* release the lock on bucket buffer, before completing the split. */
        _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

        _hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
                           maxbucket, highmask, lowmask);

        /* release the pin on old and meta buffer.  retry for insert. */
        _hash_dropbuf(rel, buf);
        _hash_dropbuf(rel, metabuf);
        goto restart_insert;
    }

    /* Do the insertion */
    while (PageGetFreeSpace(page) < itemsz)
    {
        /*
         * no space on this page; check for an overflow page
         */
        BlockNumber nextblkno = pageopaque->hasho_nextblkno;

        if (BlockNumberIsValid(nextblkno))
        {
            /*
             * ovfl page exists; go get it.  if it doesn't have room, we'll
             * find out next pass through the loop test above.  we always
             * release both the lock and pin if this is an overflow page, but
             * only the lock if this is the primary bucket page, since the pin
             * on the primary bucket must be retained throughout the scan.
             */
            if (buf != bucket_buf)
                _hash_relbuf(rel, buf);
            else
                _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
            buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
            page = BufferGetPage(buf);
        }
        else
        {
            /*
             * we're at the end of the bucket chain and we haven't found a
             * page with enough room.  allocate a new overflow page.
             */

            /* release our write lock without modifying buffer */
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

            /* chain to a new overflow page */
            buf = _hash_addovflpage(rel, metabuf, buf,
                                    (buf == bucket_buf) ? true : false);
            page = BufferGetPage(buf);

            /* should fit now, given test above */
            Assert(PageGetFreeSpace(page) >= itemsz);
        }
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
        Assert(pageopaque->hasho_bucket == bucket);
    }

    /* found page with enough space, so add the item here */
    (void) _hash_pgaddtup(rel, buf, itemsz, itup);

    /*
     * dirty and release the modified page.  if the page we modified was an
     * overflow page, we also need to separately drop the pin we retained on
     * the primary bucket page.
     */
    MarkBufferDirty(buf);
    _hash_relbuf(rel, buf);
    if (buf != bucket_buf)
        _hash_dropbuf(rel, bucket_buf);

    /*
     * Write-lock the metapage so we can increment the tuple count.  After
     * incrementing it, check to see if it's time for a split.
     */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    metap->hashm_ntuples += 1;

    /* Make sure this stays in sync with _hash_expandtable() */
    do_expand = metap->hashm_ntuples >
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Attempt to split if a split is needed */
    if (do_expand)
        _hash_expandtable(rel, metabuf);

    /* Finally drop our pin on the metapage */
    _hash_dropbuf(rel, metabuf);
}
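/*
 * For context, a sketch of the bucket-mapping arithmetic that the
 * _hash_hashkey2bucket() calls above rely on: mask the hash key down to the
 * current (doubled) table size, and fall back to the smaller mask when the
 * result points past the last bucket actually created.  This mirrors the
 * scheme described in the hash AM README; the function name is a placeholder
 * and the body is an illustration rather than the authoritative definition.
 * Assumes the usual hash AM declarations (access/hash.h) for Bucket/uint32.
 */
static inline Bucket
sketch_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
                      uint32 highmask, uint32 lowmask)
{
    Bucket      bucket;

    bucket = hashkey & highmask;
    if (bucket > maxbucket)
        bucket = bucket & lowmask;

    return bucket;
}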