/*
 * _hash_getovflpage()
 *
 * Find an available overflow page and return it.  The returned buffer
 * is pinned and write-locked, and has had _hash_pageinit() applied,
 * but it is caller's responsibility to fill the special space.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * That buffer is left in the same state at exit.
 */
static Buffer
_hash_getovflpage(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    Buffer mapbuf = 0;
    Buffer newbuf;
    BlockNumber blkno;
    uint32 orig_firstfree;
    uint32 splitnum;
    uint32 *freep = NULL;
    uint32 max_ovflpg;
    uint32 bit;
    uint32 first_page;
    uint32 last_bit;
    uint32 last_page;
    uint32 i,
           j;

    /* Get exclusive lock on the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* start search at hashm_firstfree */
    orig_firstfree = metap->hashm_firstfree;
    first_page = orig_firstfree >> BMPG_SHIFT(metap);
    bit = orig_firstfree & BMPG_MASK(metap);
    i = first_page;
    j = bit / BITS_PER_MAP;
    bit &= ~(BITS_PER_MAP - 1);

    /* outer loop iterates once per bitmap page */
    for (;;)
    {
        BlockNumber mapblkno;
        Page mappage;
        uint32 last_inpage;

        /* want to end search with the last existing overflow page */
        splitnum = metap->hashm_ovflpoint;
        max_ovflpg = metap->hashm_spares[splitnum] - 1;
        last_page = max_ovflpg >> BMPG_SHIFT(metap);
        last_bit = max_ovflpg & BMPG_MASK(metap);

        if (i > last_page)
            break;

        Assert(i < metap->hashm_nmaps);
        mapblkno = metap->hashm_mapp[i];

        if (i == last_page)
            last_inpage = last_bit;
        else
            last_inpage = BMPGSZ_BIT(metap) - 1;

        /* Release exclusive lock on metapage while reading bitmap page */
        _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

        mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
        mappage = BufferGetPage(mapbuf);
        freep = HashPageGetBitmap(mappage);

        for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
        {
            if (freep[j] != ALL_SET)
                goto found;
        }

        /* No free space here, try to advance to next map page */
        _hash_relbuf(rel, mapbuf);
        i++;
        j = 0;                  /* scan from start of next map page */
        bit = 0;

        /* Reacquire exclusive lock on the meta page */
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
    }

    /*
     * No free pages --- have to extend the relation to add an overflow page.
     * First, check to see if we have to add a new bitmap page too.
     */
    if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
    {
        /*
         * We create the new bitmap page with all pages marked "in use".
         * Actually two pages in the new bitmap's range will exist
         * immediately: the bitmap page itself, and the following page which
         * is the one we return to the caller.  Both of these are correctly
         * marked "in use".  Subsequent pages do not exist yet, but it is
         * convenient to pre-mark them as "in use" too.
         */
        bit = metap->hashm_spares[splitnum];
        _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
        metap->hashm_spares[splitnum]++;
    }
    else
    {
        /*
         * Nothing to do here; since the page will be past the last used page,
         * we know its bitmap bit was preinitialized to "in use".
         */
    }

    /* Calculate address of the new overflow page */
    bit = metap->hashm_spares[splitnum];
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
     * relation length stays in sync with ours.  XXX It's annoying to do this
     * with metapage write lock held; would be better to use a lock that
     * doesn't block incoming searches.
     */
    newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

    metap->hashm_spares[splitnum]++;

    /*
     * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
        metap->hashm_firstfree = bit + 1;

    /* Write updated metapage and release lock, but not pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    return newbuf;

found:
    /* convert bit to bit number within page */
    bit += _hash_firstfreebit(freep[j]);

    /* mark page "in use" in the bitmap */
    SETBIT(freep, bit);
    _hash_wrtbuf(rel, mapbuf);

    /* Reacquire exclusive lock on the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    /* convert bit to absolute bit number */
    bit += (i << BMPG_SHIFT(metap));

    /* Calculate address of the recycled overflow page */
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
    {
        metap->hashm_firstfree = bit + 1;

        /* Write updated metapage and release lock, but not pin */
        _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
    }
    else
    {
        /* We didn't change the metapage, so no need to write */
        _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
    }

    /* Fetch, init, and return the recycled page */
    return _hash_getinitbuf(rel, blkno);
}
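/*
 * Illustrative sketch (not part of the index code): how the bitmap search
 * above decomposes a free-bit number such as hashm_firstfree.  The constants
 * below are assumptions for an 8K block size; in the real code BMPG_SHIFT,
 * BMPG_MASK and BITS_PER_MAP come from the metapage and hash.h.
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_BMPG_SHIFT   15  /* assumed: 2^15 bits per bitmap page */
#define SKETCH_BMPG_MASK    ((1U << SKETCH_BMPG_SHIFT) - 1)
#define SKETCH_BITS_PER_MAP 32  /* bits per uint32 bitmap word */

int
main(void)
{
    uint32_t firstfree = 70000; /* hypothetical hashm_firstfree value */
    uint32_t map_page = firstfree >> SKETCH_BMPG_SHIFT; /* which bitmap page */
    uint32_t in_page = firstfree & SKETCH_BMPG_MASK;    /* bit within that page */
    uint32_t map_word = in_page / SKETCH_BITS_PER_MAP;  /* uint32 word to test */

    /* prints: bitmap page 2, word 139, bit-in-page 4464 */
    printf("bitmap page %u, word %u, bit-in-page %u\n",
           map_page, map_word, in_page);
    return 0;
}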
/*
 * _hash_freeovflpage() -
 *
 * Remove this overflow page from its bucket's chain, and mark the page as
 * free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 * Since this function is invoked in VACUUM, we provide an access strategy
 * parameter that controls fetches of the bucket pages.
 *
 * Returns the block number of the page that followed the given page
 * in the bucket, or InvalidBlockNumber if no following page.
 *
 * NB: caller must not hold lock on metapage, nor on either page that's
 * adjacent in the bucket chain.  The caller had better hold exclusive lock
 * on the bucket, too.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
                   BufferAccessStrategy bstrategy)
{
    HashMetaPage metap;
    Buffer metabuf;
    Buffer mapbuf;
    BlockNumber ovflblkno;
    BlockNumber prevblkno;
    BlockNumber blkno;
    BlockNumber nextblkno;
    HashPageOpaque ovflopaque;
    Page ovflpage;
    Page mappage;
    uint32 *freep;
    uint32 ovflbitno;
    int32 bitmappage,
          bitmapbit;
    Bucket bucket PG_USED_FOR_ASSERTS_ONLY;

    /* Get information from the doomed page */
    _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
    ovflblkno = BufferGetBlockNumber(ovflbuf);
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
    nextblkno = ovflopaque->hasho_nextblkno;
    prevblkno = ovflopaque->hasho_prevblkno;
    bucket = ovflopaque->hasho_bucket;

    /*
     * Zero the page for debugging's sake; then write and release it.  (Note:
     * if we failed to zero the page here, we'd have problems with the Assert
     * in _hash_pageinit() when the page is reused.)
     */
    MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
    _hash_wrtbuf(rel, ovflbuf);

    /*
     * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
     * up the bucket chain members behind and ahead of the overflow page being
     * deleted.  No concurrency issues since we hold exclusive lock on the
     * entire bucket.
     */
    if (BlockNumberIsValid(prevblkno))
    {
        Buffer prevbuf = _hash_getbuf_with_strategy(rel,
                                                    prevblkno,
                                                    HASH_WRITE,
                                                    LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                                    bstrategy);
        Page prevpage = BufferGetPage(prevbuf);
        HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

        Assert(prevopaque->hasho_bucket == bucket);
        prevopaque->hasho_nextblkno = nextblkno;
        _hash_wrtbuf(rel, prevbuf);
    }
    if (BlockNumberIsValid(nextblkno))
    {
        Buffer nextbuf = _hash_getbuf_with_strategy(rel,
                                                    nextblkno,
                                                    HASH_WRITE,
                                                    LH_OVERFLOW_PAGE,
                                                    bstrategy);
        Page nextpage = BufferGetPage(nextbuf);
        HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

        Assert(nextopaque->hasho_bucket == bucket);
        nextopaque->hasho_prevblkno = prevblkno;
        _hash_wrtbuf(rel, nextbuf);
    }

    /* Note: bstrategy is intentionally not used for metapage and bitmap */

    /* Read the metapage so we can determine which bitmap page to use */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* Identify which bit to set */
    ovflbitno = blkno_to_bitno(metap, ovflblkno);

    bitmappage = ovflbitno >> BMPG_SHIFT(metap);
    bitmapbit = ovflbitno & BMPG_MASK(metap);

    if (bitmappage >= metap->hashm_nmaps)
        elog(ERROR, "invalid overflow bit number %u", ovflbitno);
    blkno = metap->hashm_mapp[bitmappage];

    /* Release metapage lock while we access the bitmap page */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Clear the bitmap bit to indicate that this overflow page is free */
    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
    mappage = BufferGetPage(mapbuf);
    freep = HashPageGetBitmap(mappage);
    Assert(ISSET(freep, bitmapbit));
    CLRBIT(freep, bitmapbit);
    _hash_wrtbuf(rel, mapbuf);

    /* Get write-lock on metapage to update firstfree */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    /* if this is now the first free page, update hashm_firstfree */
    if (ovflbitno < metap->hashm_firstfree)
    {
        metap->hashm_firstfree = ovflbitno;
        _hash_wrtbuf(rel, metabuf);
    }
    else
    {
        /* no need to change metapage */
        _hash_relbuf(rel, metabuf);
    }

    return nextblkno;
}
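/*
 * Illustrative sketch (an assumption, not the server's definitions): the
 * bitmap operations used above behave like the helpers below, treating the
 * page's bitmap as an array of 32-bit words with bit N living in word N/32.
 * A set bit means the overflow page is in use; freeing a page clears it.
 */
#include <stdint.h>
#include <assert.h>

static int  sketch_isset(const uint32_t *map, uint32_t n) { return (map[n / 32] >> (n % 32)) & 1; }
static void sketch_setbit(uint32_t *map, uint32_t n)      { map[n / 32] |= (uint32_t) 1 << (n % 32); }
static void sketch_clrbit(uint32_t *map, uint32_t n)      { map[n / 32] &= ~((uint32_t) 1 << (n % 32)); }

static void
sketch_bitmap_demo(void)
{
    uint32_t map[4] = {0};

    sketch_setbit(map, 37);         /* allocate overflow bit 37 */
    assert(sketch_isset(map, 37));
    sketch_clrbit(map, 37);         /* free it again, as the code above does */
    assert(!sketch_isset(map, 37));
}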
/*
 * _hash_metapinit() -- Initialize the metadata page of a hash index,
 *              the two buckets that we begin with and the initial
 *              bitmap page.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
void
_hash_metapinit(Relation rel)
{
    HashMetaPage metap;
    HashPageOpaque pageopaque;
    Buffer metabuf;
    Buffer buf;
    Page pg;
    int32 data_width;
    int32 item_width;
    int32 ffactor;
    uint16 i;

    /* safety check */
    if (RelationGetNumberOfBlocks(rel) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * Determine the target fill factor (tuples per bucket) for this index.
     * The idea is to make the fill factor correspond to pages about 3/4ths
     * full.  We can compute it exactly if the index datatype is fixed-width,
     * but for var-width there's some guessing involved.
     */
    data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
                                 RelationGetDescr(rel)->attrs[0]->atttypmod);
    item_width = MAXALIGN(sizeof(HashItemData)) + MAXALIGN(data_width) +
        sizeof(ItemIdData);     /* include the line pointer */
    ffactor = (BLCKSZ * 3 / 4) / item_width;
    /* keep to a sane range */
    if (ffactor < 10)
        ffactor = 10;

    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
    pg = BufferGetPage(metabuf);
    _hash_pageinit(pg, BufferGetPageSize(metabuf));

    pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
    pageopaque->hasho_prevblkno = InvalidBlockNumber;
    pageopaque->hasho_nextblkno = InvalidBlockNumber;
    pageopaque->hasho_bucket = -1;
    pageopaque->hasho_flag = LH_META_PAGE;
    pageopaque->hasho_filler = HASHO_FILL;

    metap = (HashMetaPage) pg;

    metap->hashm_magic = HASH_MAGIC;
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = BufferGetPageSize(metabuf);
    /* find largest bitmap array size that will fit in page size */
    for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
    {
        if ((1 << i) <= (metap->hashm_bsize -
                         (MAXALIGN(sizeof(PageHeaderData)) +
                          MAXALIGN(sizeof(HashPageOpaqueData)))))
            break;
    }
    Assert(i > 0);
    metap->hashm_bmsize = 1 << i;
    metap->hashm_bmshift = i + BYTE_TO_BIT;
    Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));

    metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

    /*
     * We initialize the index with two buckets, 0 and 1, occupying physical
     * blocks 1 and 2.  The first freespace bitmap page is in block 3.
     */
    metap->hashm_maxbucket = metap->hashm_lowmask = 1;  /* nbuckets - 1 */
    metap->hashm_highmask = 3;  /* (nbuckets << 1) - 1 */

    MemSet((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

    metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
    metap->hashm_ovflpoint = 1;
    metap->hashm_firstfree = 0;

    /*
     * Initialize the first two buckets
     */
    for (i = 0; i <= 1; i++)
    {
        buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
        pg = BufferGetPage(buf);
        _hash_pageinit(pg, BufferGetPageSize(buf));
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
        pageopaque->hasho_prevblkno = InvalidBlockNumber;
        pageopaque->hasho_nextblkno = InvalidBlockNumber;
        pageopaque->hasho_bucket = i;
        pageopaque->hasho_flag = LH_BUCKET_PAGE;
        pageopaque->hasho_filler = HASHO_FILL;
        _hash_wrtbuf(rel, buf);
    }

    /*
     * Initialize first bitmap page.  Can't do this until we
     * create the first two buckets, else smgr will complain.
     */
    _hash_initbitmap(rel, metap, 3);

    /* all done */
    _hash_wrtbuf(rel, metabuf);
}
/* ------------------------------------------------
 * hash_bitmap_info()
 *
 * Get bitmap information for a particular overflow page
 *
 * Usage: SELECT * FROM hash_bitmap_info('con_hash_index'::regclass, 5);
 * ------------------------------------------------
 */
Datum
hash_bitmap_info(PG_FUNCTION_ARGS)
{
    Oid indexRelid = PG_GETARG_OID(0);
    uint64 ovflblkno = PG_GETARG_INT64(1);
    HashMetaPage metap;
    Buffer metabuf,
           mapbuf;
    BlockNumber bitmapblkno;
    Page mappage;
    bool bit = false;
    TupleDesc tupleDesc;
    Relation indexRel;
    uint32 ovflbitno;
    int32 bitmappage,
          bitmapbit;
    HeapTuple tuple;
    int i,
        j;
    Datum values[3];
    bool nulls[3];
    uint32 *freep;

    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 (errmsg("must be superuser to use raw page functions"))));

    indexRel = index_open(indexRelid, AccessShareLock);

    if (!IS_HASH(indexRel))
        elog(ERROR, "relation \"%s\" is not a hash index",
             RelationGetRelationName(indexRel));

    if (RELATION_IS_OTHER_TEMP(indexRel))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot access temporary tables of other sessions")));

    if (ovflblkno >= RelationGetNumberOfBlocks(indexRel))
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("block number " UINT64_FORMAT " is out of range for relation \"%s\"",
                        ovflblkno, RelationGetRelationName(indexRel))));

    /* Read the metapage so we can determine which bitmap page to use */
    metabuf = _hash_getbuf(indexRel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /*
     * Reject attempt to read the bit for a metapage or bitmap page; this is
     * only meaningful for overflow pages.
     */
    if (ovflblkno == 0)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("invalid overflow block number %u",
                        (BlockNumber) ovflblkno)));
    for (i = 0; i < metap->hashm_nmaps; i++)
        if (metap->hashm_mapp[i] == ovflblkno)
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("invalid overflow block number %u",
                            (BlockNumber) ovflblkno)));

    /*
     * Identify overflow bit number.  This will error out for primary bucket
     * pages, and we've already rejected the metapage and bitmap pages above.
     */
    ovflbitno = _hash_ovflblkno_to_bitno(metap, (BlockNumber) ovflblkno);

    bitmappage = ovflbitno >> BMPG_SHIFT(metap);
    bitmapbit = ovflbitno & BMPG_MASK(metap);

    if (bitmappage >= metap->hashm_nmaps)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("invalid overflow block number %u",
                        (BlockNumber) ovflblkno)));

    bitmapblkno = metap->hashm_mapp[bitmappage];

    _hash_relbuf(indexRel, metabuf);

    /* Check the status of bitmap bit for overflow page */
    mapbuf = _hash_getbuf(indexRel, bitmapblkno, HASH_READ, LH_BITMAP_PAGE);
    mappage = BufferGetPage(mapbuf);
    freep = HashPageGetBitmap(mappage);
    bit = ISSET(freep, bitmapbit) != 0;

    _hash_relbuf(indexRel, mapbuf);
    index_close(indexRel, AccessShareLock);

    /* Build a tuple descriptor for our result type */
    if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");
    tupleDesc = BlessTupleDesc(tupleDesc);

    MemSet(nulls, 0, sizeof(nulls));

    j = 0;
    values[j++] = Int64GetDatum((int64) bitmapblkno);
    values[j++] = Int32GetDatum(bitmapbit);
    values[j++] = BoolGetDatum(bit);

    tuple = heap_form_tuple(tupleDesc, values, nulls);

    PG_RETURN_DATUM(HeapTupleGetDatum(tuple));
}
/*
 * _hash_metapinit() -- Initialize the metadata page of a hash index,
 *              the initial buckets, and the initial bitmap page.
 *
 * The initial number of buckets is dependent on num_tuples, an estimate
 * of the number of tuples to be loaded into the index initially.  The
 * chosen number of buckets is returned.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
uint32
_hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
{
    HashMetaPage metap;
    HashPageOpaque pageopaque;
    Buffer metabuf;
    Buffer buf;
    Page pg;
    int32 data_width;
    int32 item_width;
    int32 ffactor;
    double dnumbuckets;
    uint32 num_buckets;
    uint32 log2_num_buckets;
    uint32 i;

    /* safety check */
    if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * Determine the target fill factor (in tuples per bucket) for this index.
     * The idea is to make the fill factor correspond to pages about as full
     * as the user-settable fillfactor parameter says.  We can compute it
     * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
     */
    data_width = sizeof(uint32);
    item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
        sizeof(ItemIdData);     /* include the line pointer */
    ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
    /* keep to a sane range */
    if (ffactor < 10)
        ffactor = 10;

    /*
     * Choose the number of initial bucket pages to match the fill factor
     * given the estimated number of tuples.  We round up the result to the
     * next power of 2, however, and always force at least 2 bucket pages.
     * The upper limit is determined by considerations explained in
     * _hash_expandtable().
     */
    dnumbuckets = num_tuples / ffactor;
    if (dnumbuckets <= 2.0)
        num_buckets = 2;
    else if (dnumbuckets >= (double) 0x40000000)
        num_buckets = 0x40000000;
    else
        num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);

    log2_num_buckets = _hash_log2(num_buckets);
    Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
    Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);

    /*
     * We initialize the metapage, the first N bucket pages, and the first
     * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
     * calls to occur.  This ensures that the smgr level has the right idea of
     * the physical index length.
     */
    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
    pg = BufferGetPage(metabuf);

    pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
    pageopaque->hasho_prevblkno = InvalidBlockNumber;
    pageopaque->hasho_nextblkno = InvalidBlockNumber;
    pageopaque->hasho_bucket = -1;
    pageopaque->hasho_flag = LH_META_PAGE;
    pageopaque->hasho_page_id = HASHO_PAGE_ID;

    metap = HashPageGetMeta(pg);

    metap->hashm_magic = HASH_MAGIC;
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = HashGetMaxBitmapSize(pg);
    /* find largest bitmap array size that will fit in page size */
    for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
    {
        if ((1 << i) <= metap->hashm_bsize)
            break;
    }
    Assert(i > 0);
    metap->hashm_bmsize = 1 << i;
    metap->hashm_bmshift = i + BYTE_TO_BIT;
    Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));

    /*
     * Label the index with its primary hash support function's OID.  This is
     * pretty useless for normal operation (in fact, hashm_procid is not used
     * anywhere), but it might be handy for forensic purposes so we keep it.
     */
    metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

    /*
     * We initialize the index with N buckets, 0 .. N-1, occupying physical
     * blocks 1 to N.  The first freespace bitmap page is in block N+1.
     * Since N is a power of 2, we can set the masks this way:
     */
    metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
    metap->hashm_highmask = (num_buckets << 1) - 1;

    MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

    /* Set up mapping for one spare page after the initial splitpoints */
    metap->hashm_spares[log2_num_buckets] = 1;
    metap->hashm_ovflpoint = log2_num_buckets;
    metap->hashm_firstfree = 0;

    /*
     * Release buffer lock on the metapage while we initialize buckets.
     * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
     * won't accomplish anything.  It's a bad idea to hold buffer locks for
     * long intervals in any case, since that can block the bgwriter.
     */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /*
     * Initialize the first N buckets
     */
    for (i = 0; i < num_buckets; i++)
    {
        /* Allow interrupts, in case N is huge */
        CHECK_FOR_INTERRUPTS();

        buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
        pg = BufferGetPage(buf);
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
        pageopaque->hasho_prevblkno = InvalidBlockNumber;
        pageopaque->hasho_nextblkno = InvalidBlockNumber;
        pageopaque->hasho_bucket = i;
        pageopaque->hasho_flag = LH_BUCKET_PAGE;
        pageopaque->hasho_page_id = HASHO_PAGE_ID;
        _hash_wrtbuf(rel, buf);
    }

    /* Now reacquire buffer lock on metapage */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    /*
     * Initialize first bitmap page
     */
    _hash_initbitmap(rel, metap, num_buckets + 1, forkNum);

    /* all done */
    _hash_wrtbuf(rel, metabuf);

    return num_buckets;
}
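/*
 * Illustrative sketch (assumption): the bucket-count choice above as a
 * stand-alone function.  "sketch_log2" mirrors what _hash_log2 is expected
 * to compute, i.e. ceil(log2(num)); names here are stand-ins, not the
 * server's.
 */
#include <stdint.h>

static uint32_t
sketch_log2(uint32_t num)
{
    uint32_t i,
             limit;

    for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
        ;
    return i;
}

static uint32_t
sketch_choose_nbuckets(double num_tuples, int32_t ffactor)
{
    double dnumbuckets = num_tuples / ffactor;

    if (dnumbuckets <= 2.0)
        return 2;
    if (dnumbuckets >= (double) 0x40000000)
        return 0x40000000;
    /* round up to the next power of 2, as in the code above */
    return ((uint32_t) 1) << sketch_log2((uint32_t) dnumbuckets);
}

/*
 * e.g. sketch_choose_nbuckets(100000, 80): the estimate is 1250 buckets,
 * which rounds up to 2048 initial bucket pages.
 */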
/*
 * _hash_freeovflpage() -
 *
 * Remove this overflow page from its bucket's chain, and mark the page as
 * free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 * Add the tuples (itups) to wbuf in this function.  We could do that in the
 * caller as well, but the advantage of doing it here is we can easily write
 * the WAL for the XLOG_HASH_SQUEEZE_PAGE operation.  Addition of tuples and
 * removal of the overflow page have to be done as an atomic operation,
 * otherwise during replay on standby users might find duplicate records.
 *
 * Since this function is invoked in VACUUM, we provide an access strategy
 * parameter that controls fetches of the bucket pages.
 *
 * Returns the block number of the page that followed the given page
 * in the bucket, or InvalidBlockNumber if no following page.
 *
 * NB: caller must not hold lock on metapage, nor on the page that's next to
 * ovflbuf in the bucket chain.  We don't acquire a lock on the page prior to
 * ovflbuf in the chain if it is the same as wbuf, because the caller already
 * holds a lock on it.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
                   Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
                   Size *tups_size, uint16 nitups,
                   BufferAccessStrategy bstrategy)
{
    HashMetaPage metap;
    Buffer metabuf;
    Buffer mapbuf;
    BlockNumber ovflblkno;
    BlockNumber prevblkno;
    BlockNumber blkno;
    BlockNumber nextblkno;
    BlockNumber writeblkno;
    HashPageOpaque ovflopaque;
    Page ovflpage;
    Page mappage;
    uint32 *freep;
    uint32 ovflbitno;
    int32 bitmappage,
          bitmapbit;
    Bucket bucket PG_USED_FOR_ASSERTS_ONLY;
    Buffer prevbuf = InvalidBuffer;
    Buffer nextbuf = InvalidBuffer;
    bool update_metap = false;

    /* Get information from the doomed page */
    _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
    ovflblkno = BufferGetBlockNumber(ovflbuf);
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
    nextblkno = ovflopaque->hasho_nextblkno;
    prevblkno = ovflopaque->hasho_prevblkno;
    writeblkno = BufferGetBlockNumber(wbuf);
    bucket = ovflopaque->hasho_bucket;

    /*
     * Fix up the bucket chain.  This is a doubly-linked list, so we must fix
     * up the bucket chain members behind and ahead of the overflow page being
     * deleted.  Concurrency issues are avoided by using lock chaining as
     * described atop hashbucketcleanup.
     */
    if (BlockNumberIsValid(prevblkno))
    {
        if (prevblkno == writeblkno)
            prevbuf = wbuf;
        else
            prevbuf = _hash_getbuf_with_strategy(rel,
                                                 prevblkno,
                                                 HASH_WRITE,
                                                 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                                 bstrategy);
    }
    if (BlockNumberIsValid(nextblkno))
        nextbuf = _hash_getbuf_with_strategy(rel,
                                             nextblkno,
                                             HASH_WRITE,
                                             LH_OVERFLOW_PAGE,
                                             bstrategy);

    /* Note: bstrategy is intentionally not used for metapage and bitmap */

    /* Read the metapage so we can determine which bitmap page to use */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* Identify which bit to set */
    ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);

    bitmappage = ovflbitno >> BMPG_SHIFT(metap);
    bitmapbit = ovflbitno & BMPG_MASK(metap);

    if (bitmappage >= metap->hashm_nmaps)
        elog(ERROR, "invalid overflow bit number %u", ovflbitno);
    blkno = metap->hashm_mapp[bitmappage];

    /* Release metapage lock while we access the bitmap page */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    /* read the bitmap page to clear the bitmap bit */
    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
    mappage = BufferGetPage(mapbuf);
    freep = HashPageGetBitmap(mappage);
    Assert(ISSET(freep, bitmapbit));

    /* Get write-lock on metapage to update firstfree */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    /* This operation needs to log multiple tuples, prepare WAL for that */
    if (RelationNeedsWAL(rel))
        XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);

    START_CRIT_SECTION();

    /*
     * we have to insert tuples on the "write" page, being careful to preserve
     * hashkey ordering.  (If we insert many tuples into the same "write" page
     * it would be worth qsort'ing them.)
     */
    if (nitups > 0)
    {
        _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
        MarkBufferDirty(wbuf);
    }

    /*
     * Reinitialize the freed overflow page.  Just zeroing the page won't
     * work, because WAL replay routines expect pages to be initialized.  See
     * explanation of RBM_NORMAL mode atop XLogReadBufferExtended.  We are
     * careful to make the special space valid here so that tools like
     * pageinspect won't get confused.
     */
    _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);

    ovflopaque->hasho_prevblkno = InvalidBlockNumber;
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = -1;
    ovflopaque->hasho_flag = LH_UNUSED_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    if (BufferIsValid(prevbuf))
    {
        Page prevpage = BufferGetPage(prevbuf);
        HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

        Assert(prevopaque->hasho_bucket == bucket);
        prevopaque->hasho_nextblkno = nextblkno;
        MarkBufferDirty(prevbuf);
    }
    if (BufferIsValid(nextbuf))
    {
        Page nextpage = BufferGetPage(nextbuf);
        HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

        Assert(nextopaque->hasho_bucket == bucket);
        nextopaque->hasho_prevblkno = prevblkno;
        MarkBufferDirty(nextbuf);
    }

    /* Clear the bitmap bit to indicate that this overflow page is free */
    CLRBIT(freep, bitmapbit);
    MarkBufferDirty(mapbuf);

    /* if this is now the first free page, update hashm_firstfree */
    if (ovflbitno < metap->hashm_firstfree)
    {
        metap->hashm_firstfree = ovflbitno;
        update_metap = true;
        MarkBufferDirty(metabuf);
    }

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_squeeze_page xlrec;
        XLogRecPtr recptr;
        int i;

        xlrec.prevblkno = prevblkno;
        xlrec.nextblkno = nextblkno;
        xlrec.ntups = nitups;
        xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
        xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);

        /*
         * bucket buffer needs to be registered to ensure that we can acquire
         * a cleanup lock on it during replay.
         */
        if (!xlrec.is_prim_bucket_same_wrt)
            XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

        XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
        if (xlrec.ntups > 0)
        {
            XLogRegisterBufData(1, (char *) itup_offsets,
                                nitups * sizeof(OffsetNumber));
            for (i = 0; i < nitups; i++)
                XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
        }

        XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);

        /*
         * If prevpage and the writepage (block in which we are moving tuples
         * from overflow) are same, then no need to separately register
         * prevpage.  During replay, we can directly update the nextblock in
         * writepage.
         */
        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);

        if (BufferIsValid(nextbuf))
            XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);

        XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
        XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));

        if (update_metap)
        {
            XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
            XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
        }

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);

        PageSetLSN(BufferGetPage(wbuf), recptr);
        PageSetLSN(BufferGetPage(ovflbuf), recptr);

        if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
            PageSetLSN(BufferGetPage(prevbuf), recptr);
        if (BufferIsValid(nextbuf))
            PageSetLSN(BufferGetPage(nextbuf), recptr);

        PageSetLSN(BufferGetPage(mapbuf), recptr);

        if (update_metap)
            PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    /* release previous bucket if it is not same as write bucket */
    if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
        _hash_relbuf(rel, prevbuf);

    if (BufferIsValid(ovflbuf))
        _hash_relbuf(rel, ovflbuf);

    if (BufferIsValid(nextbuf))
        _hash_relbuf(rel, nextbuf);

    _hash_relbuf(rel, mapbuf);
    _hash_relbuf(rel, metabuf);

    return nextblkno;
}
/*
 * _hash_addovflpage
 *
 * Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 * On entry, the caller must hold a pin but no lock on 'buf'.  The pin is
 * dropped before exiting (we assume the caller is not interested in 'buf'
 * anymore) if not asked to retain.  The pin will be retained only for the
 * primary bucket.  The returned overflow page will be pinned and
 * write-locked; it is guaranteed to be empty.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * That buffer is returned in the same state.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.  Additional overflow
 * pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
{
    Buffer ovflbuf;
    Page page;
    Page ovflpage;
    HashPageOpaque pageopaque;
    HashPageOpaque ovflopaque;
    HashMetaPage metap;
    Buffer mapbuf = InvalidBuffer;
    Buffer newmapbuf = InvalidBuffer;
    BlockNumber blkno;
    uint32 orig_firstfree;
    uint32 splitnum;
    uint32 *freep = NULL;
    uint32 max_ovflpg;
    uint32 bit;
    uint32 bitmap_page_bit;
    uint32 first_page;
    uint32 last_bit;
    uint32 last_page;
    uint32 i,
           j;
    bool page_found = false;

    /*
     * Write-lock the tail page.  Here we need to maintain the following
     * locking order: first acquire the lock on the tail page of the bucket,
     * then on the meta page to find and lock the bitmap page; once one is
     * found, the lock on the meta page is released, and finally we acquire
     * the lock on the new overflow buffer.  We need this locking order to
     * avoid deadlock with backends that are doing inserts.
     *
     * Note: We could have avoided locking many buffers here if we made two
     * WAL records for acquiring an overflow page (one to allocate an overflow
     * page and another to add it to overflow bucket chain).  However, doing
     * so can leak an overflow page if the system crashes after allocation.
     * Needless to say, it is better to have a single record from a
     * performance point of view as well.
     */
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    /* probably redundant... */
    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

    /* loop to find current tail page, in case someone else inserted too */
    for (;;)
    {
        BlockNumber nextblkno;

        page = BufferGetPage(buf);
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        nextblkno = pageopaque->hasho_nextblkno;

        if (!BlockNumberIsValid(nextblkno))
            break;

        /* we assume we do not need to write the unmodified page */
        if (retain_pin)
        {
            /* pin will be retained only for the primary bucket page */
            Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        }
        else
            _hash_relbuf(rel, buf);

        retain_pin = false;

        buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
    }

    /* Get exclusive lock on the meta page */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* start search at hashm_firstfree */
    orig_firstfree = metap->hashm_firstfree;
    first_page = orig_firstfree >> BMPG_SHIFT(metap);
    bit = orig_firstfree & BMPG_MASK(metap);
    i = first_page;
    j = bit / BITS_PER_MAP;
    bit &= ~(BITS_PER_MAP - 1);

    /* outer loop iterates once per bitmap page */
    for (;;)
    {
        BlockNumber mapblkno;
        Page mappage;
        uint32 last_inpage;

        /* want to end search with the last existing overflow page */
        splitnum = metap->hashm_ovflpoint;
        max_ovflpg = metap->hashm_spares[splitnum] - 1;
        last_page = max_ovflpg >> BMPG_SHIFT(metap);
        last_bit = max_ovflpg & BMPG_MASK(metap);

        if (i > last_page)
            break;

        Assert(i < metap->hashm_nmaps);
        mapblkno = metap->hashm_mapp[i];

        if (i == last_page)
            last_inpage = last_bit;
        else
            last_inpage = BMPGSZ_BIT(metap) - 1;

        /* Release exclusive lock on metapage while reading bitmap page */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

        mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
        mappage = BufferGetPage(mapbuf);
        freep = HashPageGetBitmap(mappage);

        for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
        {
            if (freep[j] != ALL_SET)
            {
                page_found = true;

                /* Reacquire exclusive lock on the meta page */
                LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);

                /* convert bit to bit number within page */
                bit += _hash_firstfreebit(freep[j]);
                bitmap_page_bit = bit;

                /* convert bit to absolute bit number */
                bit += (i << BMPG_SHIFT(metap));

                /* Calculate address of the recycled overflow page */
                blkno = bitno_to_blkno(metap, bit);

                /* Fetch and init the recycled page */
                ovflbuf = _hash_getinitbuf(rel, blkno);

                goto found;
            }
        }

        /* No free space here, try to advance to next map page */
        _hash_relbuf(rel, mapbuf);
        mapbuf = InvalidBuffer;
        i++;
        j = 0;                  /* scan from start of next map page */
        bit = 0;

        /* Reacquire exclusive lock on the meta page */
        LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * No free pages --- have to extend the relation to add an overflow page.
     * First, check to see if we have to add a new bitmap page too.
     */
    if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
    {
        /*
         * We create the new bitmap page with all pages marked "in use".
         * Actually two pages in the new bitmap's range will exist
         * immediately: the bitmap page itself, and the following page which
         * is the one we return to the caller.  Both of these are correctly
         * marked "in use".  Subsequent pages do not exist yet, but it is
         * convenient to pre-mark them as "in use" too.
         */
        bit = metap->hashm_spares[splitnum];

        /* metapage already has a write lock */
        if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
            ereport(ERROR,
                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                     errmsg("out of overflow pages in hash index \"%s\"",
                            RelationGetRelationName(rel))));

        newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
    }
    else
    {
        /*
         * Nothing to do here; since the page will be past the last used page,
         * we know its bitmap bit was preinitialized to "in use".
         */
    }

    /* Calculate address of the new overflow page */
    bit = BufferIsValid(newmapbuf) ?
        metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
     * relation length stays in sync with ours.  XXX It's annoying to do this
     * with metapage write lock held; would be better to use a lock that
     * doesn't block incoming searches.
     *
     * It is okay to hold two buffer locks here (one on tail page of bucket
     * and other on new overflow page) since there cannot be anyone else
     * contending for access to ovflbuf.
     */
    ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

found:

    /*
     * Do the update.  No ereport(ERROR) until changes are logged.  We want to
     * log the changes for bitmap page and overflow page together to avoid
     * loss of pages in case the new page is added.
     */
    START_CRIT_SECTION();

    if (page_found)
    {
        Assert(BufferIsValid(mapbuf));

        /* mark page "in use" in the bitmap */
        SETBIT(freep, bitmap_page_bit);
        MarkBufferDirty(mapbuf);
    }
    else
    {
        /* update the count to indicate new overflow page is added */
        metap->hashm_spares[splitnum]++;

        if (BufferIsValid(newmapbuf))
        {
            _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
            MarkBufferDirty(newmapbuf);

            /* add the new bitmap page to the metapage's list of bitmaps */
            metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
            metap->hashm_nmaps++;
            metap->hashm_spares[splitnum]++;
        }

        MarkBufferDirty(metabuf);

        /*
         * for new overflow page, we don't need to explicitly set the bit in
         * bitmap page, as by default that will be set to "in use".
         */
    }

    /*
     * Adjust hashm_firstfree to avoid redundant searches.  But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
    {
        metap->hashm_firstfree = bit + 1;
        MarkBufferDirty(metabuf);
    }

    /* initialize new overflow page */
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    /* logically chain overflow page to previous page */
    pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr recptr;
        xl_hash_add_ovfl_page xlrec;

        xlrec.bmpage_found = page_found;
        xlrec.bmsize = metap->hashm_bmsize;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);

        XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
        XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));

        XLogRegisterBuffer(1, buf, REGBUF_STANDARD);

        if (BufferIsValid(mapbuf))
        {
            XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
            XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
        }

        if (BufferIsValid(newmapbuf))
            XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);

        XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
        XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);

        PageSetLSN(BufferGetPage(ovflbuf), recptr);
        PageSetLSN(BufferGetPage(buf), recptr);

        if (BufferIsValid(mapbuf))
            PageSetLSN(BufferGetPage(mapbuf), recptr);

        if (BufferIsValid(newmapbuf))
            PageSetLSN(BufferGetPage(newmapbuf), recptr);

        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    if (retain_pin)
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    else
        _hash_relbuf(rel, buf);

    if (BufferIsValid(mapbuf))
        _hash_relbuf(rel, mapbuf);

    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);

    if (BufferIsValid(newmapbuf))
        _hash_relbuf(rel, newmapbuf);

    return ovflbuf;
}
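/*
 * Illustrative sketch (assumption): what _hash_firstfreebit() is relied on
 * to return in the search above -- the index of the lowest clear bit in a
 * bitmap word that the caller has already checked is not ALL_SET.  The name
 * below is a stand-in, not the server's function.
 */
#include <stdint.h>

static uint32_t
sketch_firstfreebit(uint32_t map)
{
    uint32_t i;

    for (i = 0; i < 32; i++)
    {
        if ((map & ((uint32_t) 1 << i)) == 0)
            return i;
    }
    return 32;                  /* unreachable if map != 0xFFFFFFFF */
}

/* e.g. sketch_firstfreebit(0x0000FFFF) == 16, sketch_firstfreebit(0xFFFFFFFE) == 0 */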
/*
 * _hash_metapinit() -- Initialize the metadata page of a hash index,
 *              the two buckets that we begin with and the initial
 *              bitmap page.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
void
_hash_metapinit(Relation rel)
{
    MIRROREDLOCK_BUFMGR_DECLARE;

    HashMetaPage metap;
    HashPageOpaque pageopaque;
    Buffer metabuf;
    Buffer buf;
    Page pg;
    int32 data_width;
    int32 item_width;
    int32 ffactor;
    uint16 i;

    /* safety check */
    if (RelationGetNumberOfBlocks(rel) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * Determine the target fill factor (in tuples per bucket) for this index.
     * The idea is to make the fill factor correspond to pages about as full
     * as the user-settable fillfactor parameter says.  We can compute it
     * exactly if the index datatype is fixed-width, but for var-width there's
     * some guessing involved.
     */
    data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
                                 RelationGetDescr(rel)->attrs[0]->atttypmod);
    item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
        sizeof(ItemIdData);     /* include the line pointer */
    ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
    /* keep to a sane range */
    if (ffactor < 10)
        ffactor = 10;

    /*
     * We initialize the metapage, the first two bucket pages, and the
     * first bitmap page in sequence, using _hash_getnewbuf to cause
     * smgrextend() calls to occur.  This ensures that the smgr level
     * has the right idea of the physical index length.
     */

    // -------- MirroredLock ----------
    MIRROREDLOCK_BUFMGR_LOCK;

    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE);
    pg = BufferGetPage(metabuf);
    _hash_pageinit(pg, BufferGetPageSize(metabuf));

    pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
    pageopaque->hasho_prevblkno = InvalidBlockNumber;
    pageopaque->hasho_nextblkno = InvalidBlockNumber;
    pageopaque->hasho_bucket = -1;
    pageopaque->hasho_flag = LH_META_PAGE;
    pageopaque->hasho_filler = HASHO_FILL;

    metap = (HashMetaPage) pg;

    metap->hashm_magic = HASH_MAGIC;
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = BufferGetPageSize(metabuf);
    /* find largest bitmap array size that will fit in page size */
    for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
    {
        if ((1 << i) <= (metap->hashm_bsize -
                         (MAXALIGN(sizeof(PageHeaderData)) +
                          MAXALIGN(sizeof(HashPageOpaqueData)))))
            break;
    }
    Assert(i > 0);
    metap->hashm_bmsize = 1 << i;
    metap->hashm_bmshift = i + BYTE_TO_BIT;
    Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));

    metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

    /*
     * We initialize the index with two buckets, 0 and 1, occupying physical
     * blocks 1 and 2.  The first freespace bitmap page is in block 3.
     */
    metap->hashm_maxbucket = metap->hashm_lowmask = 1;  /* nbuckets - 1 */
    metap->hashm_highmask = 3;  /* (nbuckets << 1) - 1 */

    MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

    metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
    metap->hashm_ovflpoint = 1;
    metap->hashm_firstfree = 0;

    /*
     * Initialize the first two buckets
     */
    for (i = 0; i <= 1; i++)
    {
        buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
        pg = BufferGetPage(buf);
        _hash_pageinit(pg, BufferGetPageSize(buf));
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
        pageopaque->hasho_prevblkno = InvalidBlockNumber;
        pageopaque->hasho_nextblkno = InvalidBlockNumber;
        pageopaque->hasho_bucket = i;
        pageopaque->hasho_flag = LH_BUCKET_PAGE;
        pageopaque->hasho_filler = HASHO_FILL;
        _hash_wrtbuf(rel, buf);
    }

    /*
     * Initialize first bitmap page
     */
    _hash_initbitmap(rel, metap, 3);

    /* all done */
    _hash_wrtbuf(rel, metabuf);

    MIRROREDLOCK_BUFMGR_UNLOCK;
    // -------- MirroredLock ----------
}
/*
 * _hash_init_metabuffer() -- Initialize the metadata page of a hash index.
 */
void
_hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid,
                      uint16 ffactor, bool initpage)
{
    HashMetaPage metap;
    HashPageOpaque pageopaque;
    Page page;
    double dnumbuckets;
    uint32 num_buckets;
    uint32 spare_index;
    uint32 i;

    /*
     * Choose the number of initial bucket pages to match the fill factor
     * given the estimated number of tuples.  We round up the result to the
     * total number of buckets which has to be allocated before using its
     * _hashm_spare element.  However always force at least 2 bucket pages.
     * The upper limit is determined by considerations explained in
     * _hash_expandtable().
     */
    dnumbuckets = num_tuples / ffactor;
    if (dnumbuckets <= 2.0)
        num_buckets = 2;
    else if (dnumbuckets >= (double) 0x40000000)
        num_buckets = 0x40000000;
    else
        num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets));

    spare_index = _hash_spareindex(num_buckets);
    Assert(spare_index < HASH_MAX_SPLITPOINTS);

    page = BufferGetPage(buf);
    if (initpage)
        _hash_pageinit(page, BufferGetPageSize(buf));

    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
    pageopaque->hasho_prevblkno = InvalidBlockNumber;
    pageopaque->hasho_nextblkno = InvalidBlockNumber;
    pageopaque->hasho_bucket = -1;
    pageopaque->hasho_flag = LH_META_PAGE;
    pageopaque->hasho_page_id = HASHO_PAGE_ID;

    metap = HashPageGetMeta(page);

    metap->hashm_magic = HASH_MAGIC;
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = HashGetMaxBitmapSize(page);
    /* find largest bitmap array size that will fit in page size */
    for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
    {
        if ((1 << i) <= metap->hashm_bsize)
            break;
    }
    Assert(i > 0);
    metap->hashm_bmsize = 1 << i;
    metap->hashm_bmshift = i + BYTE_TO_BIT;
    Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));

    /*
     * Label the index with its primary hash support function's OID.  This is
     * pretty useless for normal operation (in fact, hashm_procid is not used
     * anywhere), but it might be handy for forensic purposes so we keep it.
     */
    metap->hashm_procid = procid;

    /*
     * We initialize the index with N buckets, 0 .. N-1, occupying physical
     * blocks 1 to N.  The first freespace bitmap page is in block N+1.
     */
    metap->hashm_maxbucket = num_buckets - 1;

    /*
     * Set highmask as next immediate ((2 ^ x) - 1), which should be
     * sufficient to cover num_buckets.
     */
    metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1;
    metap->hashm_lowmask = (metap->hashm_highmask >> 1);

    MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

    /* Set up mapping for one spare page after the initial splitpoints */
    metap->hashm_spares[spare_index] = 1;
    metap->hashm_ovflpoint = spare_index;
    metap->hashm_firstfree = 0;

    /*
     * Set pd_lower just past the end of the metadata.  This is to log full
     * page image of metapage in xloginsert.c.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metap + sizeof(HashMetaPageData)) - (char *) page;
}
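/*
 * Illustrative worked example (assumption): the mask computation above for a
 * bucket count that is not a power of 2.  Assuming num_buckets = 1536, a
 * value the enlarged splitpoint scheme can produce, _hash_log2(1537) is 11,
 * so:
 *
 *      hashm_maxbucket = 1535
 *      hashm_highmask  = (1 << 11) - 1 = 2047
 *      hashm_lowmask   = 2047 >> 1     = 1023
 *
 * i.e. highmask is the smallest (2^x - 1) that covers every existing bucket
 * number, and lowmask is the mask for the previous, fully populated level.
 */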