/* * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket * from which current (new) bucket is being split. */ BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket) { Bucket old_bucket; uint32 mask; Buffer metabuf; HashMetaPage metap; BlockNumber blkno; /* * To get the old bucket from the current bucket, we need a mask to modulo * into lower half of table. This mask is stored in meta page as * hashm_lowmask, but here we can't rely on the same, because we need a * value of lowmask that was prevalent at the time when bucket split was * started. Masking the most significant bit of new bucket would give us * old bucket. */ mask = (((uint32) 1) << (fls(new_bucket) - 1)) - 1; old_bucket = new_bucket & mask; metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); blkno = BUCKET_TO_BLKNO(metap, old_bucket); _hash_relbuf(rel, metabuf); return blkno; }
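/*
 * Illustrative sketch (not part of the sources above): deriving the old
 * bucket number from a new bucket number by masking off the most significant
 * bit, as _hash_get_oldblock_from_newbucket() does.  The helper
 * lower_bits_mask() is a hypothetical stand-in for the fls()-based mask
 * computation.
 */
#include <stdint.h>
#include <stdio.h>

/* Return a mask covering all bits below the most significant set bit. */
static uint32_t
lower_bits_mask(uint32_t new_bucket)
{
	uint32_t	msb = 0;

	while ((new_bucket >> msb) > 1)
		msb++;
	return (((uint32_t) 1) << msb) - 1;
}

int
main(void)
{
	uint32_t	new_bucket = 13;	/* binary 1101 */
	uint32_t	old_bucket = new_bucket & lower_bits_mask(new_bucket);

	/* prints "new bucket 13 was split from old bucket 5" (1101 -> 0101) */
	printf("new bucket %u was split from old bucket %u\n",
		   new_bucket, old_bucket);
	return 0;
}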
/* * replay a hash index insert without split */ static void hash_xlog_insert(XLogReaderState *record) { HashMetaPage metap; XLogRecPtr lsn = record->EndRecPtr; xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record); Buffer buffer; Page page; if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { Size datalen; char *datapos = XLogRecGetBlockData(record, 0, &datalen); page = BufferGetPage(buffer); if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber) elog(PANIC, "hash_xlog_insert: failed to add item"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { /* * Note: in normal operation, we'd update the metapage while still * holding lock on the page we inserted into. But during replay it's * not necessary to hold that lock, since no other index updates can * be happening concurrently. */ page = BufferGetPage(buffer); metap = HashPageGetMeta(page); metap->hashm_ntuples += 1; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * replay a hash index bitmap page */ static void hash_xlog_init_bitmap_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; Buffer bitmapbuf; Buffer metabuf; Page page; HashMetaPage metap; uint32 num_buckets; xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record); /* * Initialize bitmap page */ bitmapbuf = XLogInitBufferForRedo(record, 0); _hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true); PageSetLSN(BufferGetPage(bitmapbuf), lsn); MarkBufferDirty(bitmapbuf); UnlockReleaseBuffer(bitmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) { /* * Note: in normal operation, we'd update the metapage while still * holding lock on the bitmap page. But during replay it's not * necessary to hold that lock, since nobody can see it yet; the * creating transaction hasn't yet committed. */ page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); num_buckets = metap->hashm_maxbucket + 1; metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; metap->hashm_nmaps++; PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
/* * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket * that will be generated after split from old bucket. * * This is used to find the new bucket from old bucket based on current table * half. It is mainly required to finish the incomplete splits where we are * sure that not more than one bucket could have split in progress from old * bucket. */ BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket) { Bucket new_bucket; Buffer metabuf; HashMetaPage metap; BlockNumber blkno; metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket, metap->hashm_lowmask, metap->hashm_maxbucket); blkno = BUCKET_TO_BLKNO(metap, new_bucket); _hash_relbuf(rel, metabuf); return blkno; }
/* * _hash_getcachedmetap() -- Returns cached metapage data. * * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on * the metapage. If not set, we'll set it before returning if we have to * refresh the cache, and return with a pin but no lock on it; caller is * responsible for releasing the pin. * * We refresh the cache if it's not initialized yet or force_refresh is true. */ HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh) { Page page; Assert(metabuf); if (force_refresh || rel->rd_amcache == NULL) { char *cache = NULL; /* * It's important that we don't set rd_amcache to an invalid value. * Either MemoryContextAlloc or _hash_getbuf could fail, so don't * install a pointer to the newly-allocated storage in the actual * relcache entry until both have succeeded. */ if (rel->rd_amcache == NULL) cache = MemoryContextAlloc(rel->rd_indexcxt, sizeof(HashMetaPageData)); /* Read the metapage. */ if (BufferIsValid(*metabuf)) LockBuffer(*metabuf, BUFFER_LOCK_SHARE); else *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); page = BufferGetPage(*metabuf); /* Populate the cache. */ if (rel->rd_amcache == NULL) rel->rd_amcache = cache; memcpy(rel->rd_amcache, HashPageGetMeta(page), sizeof(HashMetaPageData)); /* Release metapage lock, but keep the pin. */ LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK); } return (HashMetaPage) rel->rd_amcache; }
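/*
 * Minimal standalone sketch of the caching discipline used by
 * _hash_getcachedmetap() above: allocate and fully populate new storage
 * before publishing the pointer, so a failure partway through never leaves a
 * dangling or half-initialized cache.  The MetaCache type and
 * cache_install() helper are hypothetical, not PostgreSQL APIs.
 */
#include <stdlib.h>
#include <string.h>

typedef struct MetaCache
{
	unsigned	maxbucket;
	unsigned	highmask;
	unsigned	lowmask;
} MetaCache;

/* Returns 0 on success; on failure returns -1 without touching *cache_slot. */
static int
cache_install(MetaCache **cache_slot, const MetaCache *source)
{
	MetaCache  *fresh;

	if (*cache_slot == NULL)
	{
		/* allocation may fail; the published slot is still untouched */
		fresh = malloc(sizeof(MetaCache));
		if (fresh == NULL)
			return -1;
	}
	else
		fresh = *cache_slot;	/* refresh the existing cache in place */

	memcpy(fresh, source, sizeof(MetaCache));

	/* publish only after every fallible step has succeeded */
	*cache_slot = fresh;
	return 0;
}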
/* * replay for update meta page */ static void hash_xlog_update_meta_page(XLogReaderState *record) { HashMetaPage metap; XLogRecPtr lsn = record->EndRecPtr; xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) XLogRecGetData(record); Buffer metabuf; Page page; if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) { page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_ntuples = xldata->ntuples; PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
/* * replay allocation of page for split operation */ static void hash_xlog_split_allocate_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record); Buffer oldbuf; Buffer newbuf; Buffer metabuf; Size datalen PG_USED_FOR_ASSERTS_ONLY; char *data; XLogRedoAction action; /* * To be consistent with normal operation, here we take cleanup locks on * both the old and new buckets even though there can't be any concurrent * inserts. */ /* replay the record for old bucket */ action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf); /* * Note that we still update the page even if it was restored from a full * page image, because the special space is not included in the image. */ if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) { Page oldpage; HashPageOpaque oldopaque; oldpage = BufferGetPage(oldbuf); oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); oldopaque->hasho_flag = xlrec->old_bucket_flag; oldopaque->hasho_prevblkno = xlrec->new_bucket; PageSetLSN(oldpage, lsn); MarkBufferDirty(oldbuf); } /* replay the record for new bucket */ newbuf = XLogInitBufferForRedo(record, 1); _hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket, xlrec->new_bucket_flag, true); if (!IsBufferCleanupOK(newbuf)) elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock"); MarkBufferDirty(newbuf); PageSetLSN(BufferGetPage(newbuf), lsn); /* * We could release the lock on the old bucket early as well, but we do it * here to be consistent with normal operation. */ if (BufferIsValid(oldbuf)) UnlockReleaseBuffer(oldbuf); if (BufferIsValid(newbuf)) UnlockReleaseBuffer(newbuf); /* * Note: in normal operation, we'd update the meta page while still * holding lock on the old and new bucket pages. But during replay it's * not necessary to hold those locks, since no other bucket splits can be * happening concurrently. */ /* replay the record for metapage changes */ if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) { Page page; HashMetaPage metap; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_maxbucket = xlrec->new_bucket; data = XLogRecGetBlockData(record, 2, &datalen); if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) { uint32 lowmask; uint32 *highmask; /* extract low and high masks. */ memcpy(&lowmask, data, sizeof(uint32)); highmask = (uint32 *) ((char *) data + sizeof(uint32)); /* update metapage */ metap->hashm_lowmask = lowmask; metap->hashm_highmask = *highmask; data += sizeof(uint32) * 2; } if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) { uint32 ovflpoint; uint32 *ovflpages; /* extract information of overflow pages. */ memcpy(&ovflpoint, data, sizeof(uint32)); ovflpages = (uint32 *) ((char *) data + sizeof(uint32)); /* update metapage */ metap->hashm_spares[ovflpoint] = *ovflpages; metap->hashm_ovflpoint = ovflpoint; } MarkBufferDirty(metabuf); PageSetLSN(BufferGetPage(metabuf), lsn); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
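/*
 * Sketch (assumption, not the actual replay code): reading two packed uint32
 * values out of a raw byte buffer, such as the per-block data attached to
 * the XLOG_HASH_SPLIT_ALLOCATE_PAGE record above.  memcpy() is used for both
 * fields so the reads stay alignment-safe on all platforms; the helper name
 * is hypothetical.
 */
#include <stdint.h>
#include <string.h>

static void
decode_two_uint32(const char *data, uint32_t *first, uint32_t *second)
{
	/* fields are laid out back to back, in the order they were registered */
	memcpy(first, data, sizeof(uint32_t));
	memcpy(second, data + sizeof(uint32_t), sizeof(uint32_t));
}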
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * This function also deletes the tuples that are moved by split to other * bucket. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; tuples_removed = 0; num_index_tuples = 0; /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be * good enough. (If not, we'll detect that further down and refresh the * cache as necessary.) */ cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); Assert(cachedmetap != NULL); orig_maxbucket = cachedmetap->hashm_maxbucket; orig_ntuples = cachedmetap->hashm_ntuples; /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; Buffer bucket_buf; Buffer buf; HashPageOpaque bucket_opaque; Page page; bool split_cleanup = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); blkno = bucket_blkno; /* * We need to acquire a cleanup lock on the primary bucket page to out * wait concurrent scans before deleting the dead tuples. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _hash_checkpage(rel, buf, LH_BUCKET_PAGE); page = BufferGetPage(buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* * If the bucket contains tuples that are moved by split, then we need * to delete such tuples. We can't delete such tuples if the split * operation on bucket is not finished as those are needed by scans. */ if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) { split_cleanup = true; /* * This bucket might have been split since we last held a lock on * the metapage. If so, hashm_maxbucket, hashm_highmask and * hashm_lowmask might be old enough to cause us to fail to remove * tuples left behind by the most recent split. To prevent that, * now that the primary page of the target bucket has been locked * (and thus can't be further split), check whether we need to * update our cached metapage data. 
*/ Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber); if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) { cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); Assert(cachedmetap != NULL); } } bucket_buf = buf; hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, cachedmetap->hashm_maxbucket, cachedmetap->hashm_highmask, cachedmetap->hashm_lowmask, &tuples_removed, &num_index_tuples, split_cleanup, callback, callback_state); _hash_dropbuf(rel, bucket_buf); /* Advance to next bucket */ cur_bucket++; } if (BufferIsInvalid(metabuf)) metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); /* Write-lock metapage and check for split since we started */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); Assert(cachedmetap != NULL); cur_maxbucket = cachedmetap->hashm_maxbucket; goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ START_CRIT_SECTION(); if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } MarkBufferDirty(metabuf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_update_meta_page xlrec; XLogRecPtr recptr; xlrec.ntuples = metap->hashm_ntuples; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage); XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); _hash_relbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ return stats; }
/* * Verify that the given bytea contains a HASH page, or die in the attempt. * A pointer to a palloc'd, properly aligned copy of the page is returned. */ static Page verify_hash_page(bytea *raw_page, int flags) { Page page = get_page_from_raw(raw_page); int pagetype = LH_UNUSED_PAGE; /* Treat new pages as unused. */ if (!PageIsNew(page)) { HashPageOpaque pageopaque; if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index table contains corrupted page"))); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); if (pageopaque->hasho_page_id != HASHO_PAGE_ID) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("page is not a hash page"), errdetail("Expected %08x, got %08x.", HASHO_PAGE_ID, pageopaque->hasho_page_id))); pagetype = pageopaque->hasho_flag & LH_PAGE_TYPE; } /* Check that page type is sane. */ if (pagetype != LH_OVERFLOW_PAGE && pagetype != LH_BUCKET_PAGE && pagetype != LH_BITMAP_PAGE && pagetype != LH_META_PAGE && pagetype != LH_UNUSED_PAGE) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hash page type %08x", pagetype))); /* If requested, verify page type. */ if (flags != 0 && (pagetype & flags) == 0) { switch (flags) { case LH_META_PAGE: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("page is not a hash meta page"))); case LH_BUCKET_PAGE | LH_OVERFLOW_PAGE: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("page is not a hash bucket or overflow page"))); case LH_OVERFLOW_PAGE: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("page is not a hash overflow page"))); default: elog(ERROR, "hash page of type %08x not in mask %08x", pagetype, flags); } } /* * If it is the metapage, also verify magic number and version. */ if (pagetype == LH_META_PAGE) { HashMetaPage metap = HashPageGetMeta(page); if (metap->hashm_magic != HASH_MAGIC) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("invalid magic number for metadata"), errdetail("Expected 0x%08x, got 0x%08x.", HASH_MAGIC, metap->hashm_magic))); if (metap->hashm_version != HASH_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("invalid version for metadata"), errdetail("Expected %d, got %d", HASH_VERSION, metap->hashm_version))); } return page; }
/* ------------------------------------------------ * hash_bitmap_info() * * Get bitmap information for a particular overflow page * * Usage: SELECT * FROM hash_bitmap_info('con_hash_index'::regclass, 5); * ------------------------------------------------ */ Datum hash_bitmap_info(PG_FUNCTION_ARGS) { Oid indexRelid = PG_GETARG_OID(0); uint64 ovflblkno = PG_GETARG_INT64(1); HashMetaPage metap; Buffer metabuf, mapbuf; BlockNumber bitmapblkno; Page mappage; bool bit = false; TupleDesc tupleDesc; Relation indexRel; uint32 ovflbitno; int32 bitmappage, bitmapbit; HeapTuple tuple; int i, j; Datum values[3]; bool nulls[3]; uint32 *freep; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); indexRel = index_open(indexRelid, AccessShareLock); if (!IS_HASH(indexRel)) elog(ERROR, "relation \"%s\" is not a hash index", RelationGetRelationName(indexRel)); if (RELATION_IS_OTHER_TEMP(indexRel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary tables of other sessions"))); if (ovflblkno >= RelationGetNumberOfBlocks(indexRel)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("block number " UINT64_FORMAT " is out of range for relation \"%s\"", ovflblkno, RelationGetRelationName(indexRel)))); /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(indexRel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Reject attempt to read the bit for a metapage or bitmap page; this is * only meaningful for overflow pages. */ if (ovflblkno == 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid overflow block number %u", (BlockNumber) ovflblkno))); for (i = 0; i < metap->hashm_nmaps; i++) if (metap->hashm_mapp[i] == ovflblkno) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid overflow block number %u", (BlockNumber) ovflblkno))); /* * Identify overflow bit number. This will error out for primary bucket * pages, and we've already rejected the metapage and bitmap pages above. */ ovflbitno = _hash_ovflblkno_to_bitno(metap, (BlockNumber) ovflblkno); bitmappage = ovflbitno >> BMPG_SHIFT(metap); bitmapbit = ovflbitno & BMPG_MASK(metap); if (bitmappage >= metap->hashm_nmaps) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid overflow block number %u", (BlockNumber) ovflblkno))); bitmapblkno = metap->hashm_mapp[bitmappage]; _hash_relbuf(indexRel, metabuf); /* Check the status of bitmap bit for overflow page */ mapbuf = _hash_getbuf(indexRel, bitmapblkno, HASH_READ, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); bit = ISSET(freep, bitmapbit) != 0; _hash_relbuf(indexRel, mapbuf); index_close(indexRel, AccessShareLock); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); tupleDesc = BlessTupleDesc(tupleDesc); MemSet(nulls, 0, sizeof(nulls)); j = 0; values[j++] = Int64GetDatum((int64) bitmapblkno); values[j++] = Int32GetDatum(bitmapbit); values[j++] = BoolGetDatum(bit); tuple = heap_form_tuple(tupleDesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); }
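/*
 * Sketch of the bitmap addressing arithmetic used by hash_bitmap_info()
 * above, as a hypothetical standalone program.  A global overflow bit number
 * splits into a bitmap-page index (high bits) and a bit position within that
 * page (low bits).  The constants below assume 8192-byte bitmap pages and
 * ignore page header overhead, so they only approximate the real
 * BMPG_SHIFT/BMPG_MASK values.
 */
#include <stdint.h>
#include <stdio.h>

#define BITMAP_BITS_PER_PAGE	(8192 * 8)	/* bits stored per bitmap page */
#define BITMAP_SHIFT			16			/* log2(BITMAP_BITS_PER_PAGE) */
#define BITMAP_MASK				(BITMAP_BITS_PER_PAGE - 1)

int
main(void)
{
	uint32_t	ovflbitno = 70000;
	uint32_t	bitmappage = ovflbitno >> BITMAP_SHIFT;	/* which bitmap page */
	uint32_t	bitmapbit = ovflbitno & BITMAP_MASK;	/* bit within that page */

	/* prints: overflow bit 70000 -> bitmap page 1, bit 4464 */
	printf("overflow bit %u -> bitmap page %u, bit %u\n",
		   ovflbitno, bitmappage, bitmapbit);
	return 0;
}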
/* * _hash_metapinit() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) { HashMetaPage metap; HashPageOpaque pageopaque; Buffer metabuf; Buffer buf; Page pg; int32 data_width; int32 item_width; int32 ffactor; double dnumbuckets; uint32 num_buckets; uint32 log2_num_buckets; uint32 i; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; /* * Choose the number of initial bucket pages to match the fill factor * given the estimated number of tuples. We round up the result to the * next power of 2, however, and always force at least 2 bucket pages. The * upper limit is determined by considerations explained in * _hash_expandtable(). */ dnumbuckets = num_tuples / ffactor; if (dnumbuckets <= 2.0) num_buckets = 2; else if (dnumbuckets >= (double) 0x40000000) num_buckets = 0x40000000; else num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets); log2_num_buckets = _hash_log2(num_buckets); Assert(num_buckets == (((uint32) 1) << log2_num_buckets)); Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); pg = BufferGetPage(metabuf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; metap = HashPageGetMeta(pg); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; metap->hashm_bsize = HashGetMaxBitmapSize(pg); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { if ((1 << i) <= metap->hashm_bsize) break; } Assert(i > 0); metap->hashm_bmsize = 1 << i; metap->hashm_bmshift = i + BYTE_TO_BIT; Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); /* * Label the index with its primary hash support function's OID. 
This is * pretty useless for normal operation (in fact, hashm_procid is not used * anywhere), but it might be handy for forensic purposes so we keep it. */ metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); /* * We initialize the index with N buckets, 0 .. N-1, occupying physical * blocks 1 to N. The first freespace bitmap page is in block N+1. Since * N is a power of 2, we can set the masks this way: */ metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1; metap->hashm_highmask = (num_buckets << 1) - 1; MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); /* Set up mapping for one spare page after the initial splitpoints */ metap->hashm_spares[log2_num_buckets] = 1; metap->hashm_ovflpoint = log2_num_buckets; metap->hashm_firstfree = 0; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* * Initialize the first N buckets */ for (i = 0; i < num_buckets; i++) { /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); pg = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = i; pageopaque->hasho_flag = LH_BUCKET_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; _hash_wrtbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); /* * Initialize first bitmap page */ _hash_initbitmap(rel, metap, num_buckets + 1, forkNum); /* all done */ _hash_wrtbuf(rel, metabuf); return num_buckets; }
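/*
 * Worked sketch of the sizing arithmetic in _hash_metapinit() above, as a
 * standalone program with hypothetical helper names and made-up inputs.
 * Given an estimated tuple count and a fill factor, the initial bucket count
 * is rounded up to a power of two, and the initial masks follow directly
 * from it (maxbucket and lowmask are num_buckets - 1, highmask is twice that
 * range minus one).
 */
#include <stdint.h>
#include <stdio.h>

/* smallest power of two >= n (n >= 1), i.e. 1 << ceil(log2(n)) */
static uint32_t
ceil_pow2(uint32_t n)
{
	uint32_t	p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int
main(void)
{
	double		num_tuples = 100000.0;
	uint32_t	ffactor = 75;	/* tuples per bucket, example value */
	uint32_t	num_buckets = ceil_pow2((uint32_t) (num_tuples / ffactor));
	uint32_t	maxbucket = num_buckets - 1;	/* also the initial lowmask */
	uint32_t	highmask = (num_buckets << 1) - 1;

	/* 100000 / 75 = 1333 -> 2048 buckets, lowmask 2047, highmask 4095 */
	printf("buckets=%u maxbucket=%u lowmask=%u highmask=%u\n",
		   num_buckets, maxbucket, maxbucket, highmask);
	return 0;
}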
/* * _hash_freeovflpage() - * * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. * * Add the tuples (itups) to wbuf in this function. We could do that in the * caller as well, but the advantage of doing it here is we can easily write * the WAL for the XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and * removal of the overflow page have to be done as an atomic operation, otherwise * during replay on a standby users might find duplicate records. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. * * Returns the block number of the page that followed the given page * in the bucket, or InvalidBlockNumber if no following page. * * NB: caller must not hold lock on the metapage, nor on the page that's next to * ovflbuf in the bucket chain. We don't acquire the lock on the page that's * prior to ovflbuf in the chain if it is the same as wbuf, because the caller already * holds a lock on it. */ BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy) { HashMetaPage metap; Buffer metabuf; Buffer mapbuf; BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; BlockNumber writeblkno; HashPageOpaque ovflopaque; Page ovflpage; Page mappage; uint32 *freep; uint32 ovflbitno; int32 bitmappage, bitmapbit; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; Buffer prevbuf = InvalidBuffer; Buffer nextbuf = InvalidBuffer; bool update_metap = false; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); ovflblkno = BufferGetBlockNumber(ovflbuf); ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); nextblkno = ovflopaque->hasho_nextblkno; prevblkno = ovflopaque->hasho_prevblkno; writeblkno = BufferGetBlockNumber(wbuf); bucket = ovflopaque->hasho_bucket; /* * Fix up the bucket chain. This is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being * deleted. Concurrency issues are avoided by using lock chaining as * described atop hashbucketcleanup.
*/ if (BlockNumberIsValid(prevblkno)) { if (prevblkno == writeblkno) prevbuf = wbuf; else prevbuf = _hash_getbuf_with_strategy(rel, prevblkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, bstrategy); } if (BlockNumberIsValid(nextblkno)) nextbuf = _hash_getbuf_with_strategy(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); /* Note: bstrategy is intentionally not used for metapage and bitmap */ /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* Identify which bit to set */ ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno); bitmappage = ovflbitno >> BMPG_SHIFT(metap); bitmapbit = ovflbitno & BMPG_MASK(metap); if (bitmappage >= metap->hashm_nmaps) elog(ERROR, "invalid overflow bit number %u", ovflbitno); blkno = metap->hashm_mapp[bitmappage]; /* Release metapage lock while we access the bitmap page */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* read the bitmap page to clear the bitmap bit */ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); Assert(ISSET(freep, bitmapbit)); /* Get write-lock on metapage to update firstfree */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* This operation needs to log multiple tuples, prepare WAL for that */ if (RelationNeedsWAL(rel)) XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups); START_CRIT_SECTION(); /* * we have to insert tuples on the "write" page, being careful to preserve * hashkey ordering. (If we insert many tuples into the same "write" page * it would be worth qsort'ing them). */ if (nitups > 0) { _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); MarkBufferDirty(wbuf); } /* * Reinitialize the freed overflow page. Just zeroing the page won't * work, because WAL replay routines expect pages to be initialized. See * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are * careful to make the special space valid here so that tools like * pageinspect won't get confused. 
*/ _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = InvalidBlockNumber; ovflopaque->hasho_nextblkno = InvalidBlockNumber; ovflopaque->hasho_bucket = -1; ovflopaque->hasho_flag = LH_UNUSED_PAGE; ovflopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(ovflbuf); if (BufferIsValid(prevbuf)) { Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); Assert(prevopaque->hasho_bucket == bucket); prevopaque->hasho_nextblkno = nextblkno; MarkBufferDirty(prevbuf); } if (BufferIsValid(nextbuf)) { Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); Assert(nextopaque->hasho_bucket == bucket); nextopaque->hasho_prevblkno = prevblkno; MarkBufferDirty(nextbuf); } /* Clear the bitmap bit to indicate that this overflow page is free */ CLRBIT(freep, bitmapbit); MarkBufferDirty(mapbuf); /* if this is now the first free page, update hashm_firstfree */ if (ovflbitno < metap->hashm_firstfree) { metap->hashm_firstfree = ovflbitno; update_metap = true; MarkBufferDirty(metabuf); } /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_squeeze_page xlrec; XLogRecPtr recptr; int i; xlrec.prevblkno = prevblkno; xlrec.nextblkno = nextblkno; xlrec.ntups = nitups; xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf); xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage); /* * bucket buffer needs to be registered to ensure that we can acquire * a cleanup lock on it during replay. */ if (!xlrec.is_prim_bucket_same_wrt) XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE); XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); if (xlrec.ntups > 0) { XLogRegisterBufData(1, (char *) itup_offsets, nitups * sizeof(OffsetNumber)); for (i = 0; i < nitups; i++) XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); } XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD); /* * If prevpage and the writepage (block in which we are moving tuples * from overflow) are same, then no need to separately register * prevpage. During replay, we can directly update the nextblock in * writepage. */ if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD); if (BufferIsValid(nextbuf)) XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD); XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD); XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32)); if (update_metap) { XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD); XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32)); } recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE); PageSetLSN(BufferGetPage(wbuf), recptr); PageSetLSN(BufferGetPage(ovflbuf), recptr); if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) PageSetLSN(BufferGetPage(prevbuf), recptr); if (BufferIsValid(nextbuf)) PageSetLSN(BufferGetPage(nextbuf), recptr); PageSetLSN(BufferGetPage(mapbuf), recptr); if (update_metap) PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); /* release previous bucket if it is not same as write bucket */ if (BufferIsValid(prevbuf) && prevblkno != writeblkno) _hash_relbuf(rel, prevbuf); if (BufferIsValid(ovflbuf)) _hash_relbuf(rel, ovflbuf); if (BufferIsValid(nextbuf)) _hash_relbuf(rel, nextbuf); _hash_relbuf(rel, mapbuf); _hash_relbuf(rel, metabuf); return nextblkno; }
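/*
 * Minimal sketch of the chain fix-up performed by _hash_freeovflpage()
 * above: in a doubly-linked list, unlinking a node means pointing its
 * predecessor's "next" past it and its successor's "prev" back at the
 * predecessor.  This standalone list type is illustrative only.
 */
#include <stddef.h>

typedef struct ChainPage
{
	struct ChainPage *prev;
	struct ChainPage *next;
} ChainPage;

static void
unlink_page(ChainPage *doomed)
{
	if (doomed->prev != NULL)
		doomed->prev->next = doomed->next;	/* predecessor skips over us */
	if (doomed->next != NULL)
		doomed->next->prev = doomed->prev;	/* successor points back past us */
	doomed->prev = doomed->next = NULL;		/* detach the freed page */
}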
/* * _hash_addovflpage * * Add an overflow page to the bucket whose last page is pointed to by 'buf'. * * On entry, the caller must hold a pin but no lock on 'buf'. The pin is * dropped before exiting (we assume the caller is not interested in 'buf' * anymore) if not asked to retain. The pin will be retained only for the * primary bucket. The returned overflow page will be pinned and * write-locked; it is guaranteed to be empty. * * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is returned in the same state. * * NB: since this could be executed concurrently by multiple processes, * one should not assume that the returned overflow page will be the * immediate successor of the originally passed 'buf'. Additional overflow * pages might have been added to the bucket chain in between. */ Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) { Buffer ovflbuf; Page page; Page ovflpage; HashPageOpaque pageopaque; HashPageOpaque ovflopaque; HashMetaPage metap; Buffer mapbuf = InvalidBuffer; Buffer newmapbuf = InvalidBuffer; BlockNumber blkno; uint32 orig_firstfree; uint32 splitnum; uint32 *freep = NULL; uint32 max_ovflpg; uint32 bit; uint32 bitmap_page_bit; uint32 first_page; uint32 last_bit; uint32 last_page; uint32 i, j; bool page_found = false; /* * Write-lock the tail page. Here, we need to maintain locking order such * that, first acquire the lock on tail page of bucket, then on meta page * to find and lock the bitmap page and if it is found, then lock on meta * page is released, then finally acquire the lock on new overflow buffer. * We need this locking order to avoid deadlock with backends that are * doing inserts. * * Note: We could have avoided locking many buffers here if we made two * WAL records for acquiring an overflow page (one to allocate an overflow * page and another to add it to overflow bucket chain). However, doing * so can leak an overflow page, if the system crashes after allocation. * Needless to say, it is better to have a single record from a * performance point of view as well. */ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); /* probably redundant... 
*/ _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); /* loop to find current tail page, in case someone else inserted too */ for (;;) { BlockNumber nextblkno; page = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); nextblkno = pageopaque->hasho_nextblkno; if (!BlockNumberIsValid(nextblkno)) break; /* we assume we do not need to write the unmodified page */ if (retain_pin) { /* pin will be retained only for the primary bucket page */ Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else _hash_relbuf(rel, buf); retain_pin = false; buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); } /* Get exclusive lock on the meta page */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* start search at hashm_firstfree */ orig_firstfree = metap->hashm_firstfree; first_page = orig_firstfree >> BMPG_SHIFT(metap); bit = orig_firstfree & BMPG_MASK(metap); i = first_page; j = bit / BITS_PER_MAP; bit &= ~(BITS_PER_MAP - 1); /* outer loop iterates once per bitmap page */ for (;;) { BlockNumber mapblkno; Page mappage; uint32 last_inpage; /* want to end search with the last existing overflow page */ splitnum = metap->hashm_ovflpoint; max_ovflpg = metap->hashm_spares[splitnum] - 1; last_page = max_ovflpg >> BMPG_SHIFT(metap); last_bit = max_ovflpg & BMPG_MASK(metap); if (i > last_page) break; Assert(i < metap->hashm_nmaps); mapblkno = metap->hashm_mapp[i]; if (i == last_page) last_inpage = last_bit; else last_inpage = BMPGSZ_BIT(metap) - 1; /* Release exclusive lock on metapage while reading bitmap page */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) { if (freep[j] != ALL_SET) { page_found = true; /* Reacquire exclusive lock on the meta page */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* convert bit to bit number within page */ bit += _hash_firstfreebit(freep[j]); bitmap_page_bit = bit; /* convert bit to absolute bit number */ bit += (i << BMPG_SHIFT(metap)); /* Calculate address of the recycled overflow page */ blkno = bitno_to_blkno(metap, bit); /* Fetch and init the recycled page */ ovflbuf = _hash_getinitbuf(rel, blkno); goto found; } } /* No free space here, try to advance to next map page */ _hash_relbuf(rel, mapbuf); mapbuf = InvalidBuffer; i++; j = 0; /* scan from start of next map page */ bit = 0; /* Reacquire exclusive lock on the meta page */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); } /* * No free pages --- have to extend the relation to add an overflow page. * First, check to see if we have to add a new bitmap page too. */ if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1)) { /* * We create the new bitmap page with all pages marked "in use". * Actually two pages in the new bitmap's range will exist * immediately: the bitmap page itself, and the following page which * is the one we return to the caller. Both of these are correctly * marked "in use". Subsequent pages do not exist yet, but it is * convenient to pre-mark them as "in use" too. 
*/ bit = metap->hashm_spares[splitnum]; /* metapage already has a write lock */ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("out of overflow pages in hash index \"%s\"", RelationGetRelationName(rel)))); newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM); } else { /* * Nothing to do here; since the page will be past the last used page, * we know its bitmap bit was preinitialized to "in use". */ } /* Calculate address of the new overflow page */ bit = BufferIsValid(newmapbuf) ? metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum]; blkno = bitno_to_blkno(metap, bit); /* * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the * relation length stays in sync with ours. XXX It's annoying to do this * with metapage write lock held; would be better to use a lock that * doesn't block incoming searches. * * It is okay to hold two buffer locks here (one on tail page of bucket * and other on new overflow page) since there cannot be anyone else * contending for access to ovflbuf. */ ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); found: /* * Do the update. No ereport(ERROR) until changes are logged. We want to * log the changes for bitmap page and overflow page together to avoid * loss of pages in case the new page is added. */ START_CRIT_SECTION(); if (page_found) { Assert(BufferIsValid(mapbuf)); /* mark page "in use" in the bitmap */ SETBIT(freep, bitmap_page_bit); MarkBufferDirty(mapbuf); } else { /* update the count to indicate new overflow page is added */ metap->hashm_spares[splitnum]++; if (BufferIsValid(newmapbuf)) { _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false); MarkBufferDirty(newmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf); metap->hashm_nmaps++; metap->hashm_spares[splitnum]++; } MarkBufferDirty(metabuf); /* * for new overflow page, we don't need to explicitly set the bit in * bitmap page, as by default that will be set to "in use". */ } /* * Adjust hashm_firstfree to avoid redundant searches. But don't risk * changing it if someone moved it while we were searching bitmap pages. 
*/ if (metap->hashm_firstfree == orig_firstfree) { metap->hashm_firstfree = bit + 1; MarkBufferDirty(metabuf); } /* initialize new overflow page */ ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); ovflopaque->hasho_nextblkno = InvalidBlockNumber; ovflopaque->hasho_bucket = pageopaque->hasho_bucket; ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; ovflopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(ovflbuf); /* logically chain overflow page to previous page */ pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); MarkBufferDirty(buf); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; xl_hash_add_ovfl_page xlrec; xlrec.bmpage_found = page_found; xlrec.bmsize = metap->hashm_bmsize; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage); XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT); XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket)); XLogRegisterBuffer(1, buf, REGBUF_STANDARD); if (BufferIsValid(mapbuf)) { XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD); XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32)); } if (BufferIsValid(newmapbuf)) XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT); XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD); XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32)); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE); PageSetLSN(BufferGetPage(ovflbuf), recptr); PageSetLSN(BufferGetPage(buf), recptr); if (BufferIsValid(mapbuf)) PageSetLSN(BufferGetPage(mapbuf), recptr); if (BufferIsValid(newmapbuf)) PageSetLSN(BufferGetPage(newmapbuf), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); if (retain_pin) LockBuffer(buf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, buf); if (BufferIsValid(mapbuf)) _hash_relbuf(rel, mapbuf); LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newmapbuf)) _hash_relbuf(rel, newmapbuf); return ovflbuf; }
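/*
 * Sketch of the free-slot search that _hash_addovflpage() performs above:
 * scan an array of 32-bit bitmap words for one that is not completely set,
 * then locate the first clear (free) bit within it.  ALL_SET_WORD and the
 * helper name are hypothetical stand-ins for the hash AM's macros.
 */
#include <stdint.h>

#define ALL_SET_WORD	0xFFFFFFFFu
#define BITS_PER_WORD	32

/* Return the global index of the first clear bit, or -1 if every bit is set. */
static int
first_free_bit(const uint32_t *words, int nwords)
{
	for (int i = 0; i < nwords; i++)
	{
		if (words[i] != ALL_SET_WORD)
		{
			for (int b = 0; b < BITS_PER_WORD; b++)
			{
				if ((words[i] & ((uint32_t) 1 << b)) == 0)
					return i * BITS_PER_WORD + b;
			}
		}
	}
	return -1;					/* every page tracked here is in use */
}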
/* * _hash_doinsert() -- Handle insertion of a single index tuple. * * This routine is called by the public interface routines, hashbuild * and hashinsert. By here, itup is completely filled in. */ void _hash_doinsert(Relation rel, IndexTuple itup) { Buffer buf; Buffer metabuf; HashMetaPage metap; BlockNumber blkno; Page page; HashPageOpaque pageopaque; Size itemsz; bool do_expand; uint32 hashkey; Bucket bucket; /* * Get the hash key for the item (it's stored in the index tuple itself). */ hashkey = _hash_get_indextuple_hashkey(itup); /* compute item size too */ itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we * need to be consistent */ /* * Acquire shared split lock so we can compute the target bucket safely * (see README). */ _hash_getlock(rel, 0, HASH_SHARE); /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check whether the item can fit on a hash page at all. (Eventually, we * ought to try to apply TOAST methods if not.) Note that at this point, * itemsz doesn't include the ItemId. * * XXX this is useless code if we are only storing hash keys. */ if (itemsz > HashMaxItemSize((Page) metap)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds hash maximum %lu", (unsigned long) itemsz, (unsigned long) HashMaxItemSize((Page) metap)), errhint("Values larger than a buffer page cannot be indexed."))); /* * Compute the target bucket number, and convert to block number. */ bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); blkno = BUCKET_TO_BLKNO(metap, bucket); /* release lock on metapage, but keep pin since we'll need it again */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); /* * Acquire share lock on target bucket; then we can release split lock. */ _hash_getlock(rel, blkno, HASH_SHARE); _hash_droplock(rel, 0, HASH_SHARE); /* Fetch the primary bucket page for the bucket */ buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE); page = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(pageopaque->hasho_bucket == bucket); /* Do the insertion */ while (PageGetFreeSpace(page) < itemsz) { /* * no space on this page; check for an overflow page */ BlockNumber nextblkno = pageopaque->hasho_nextblkno; if (BlockNumberIsValid(nextblkno)) { /* * ovfl page exists; go get it. if it doesn't have room, we'll * find out next pass through the loop test above. */ _hash_relbuf(rel, buf); buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); page = BufferGetPage(buf); } else { /* * we're at the end of the bucket chain and we haven't found a * page with enough room. allocate a new overflow page. 
*/ /* release our write lock without modifying buffer */ _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK); /* chain to a new overflow page */ buf = _hash_addovflpage(rel, metabuf, buf); page = BufferGetPage(buf); /* should fit now, given test above */ Assert(PageGetFreeSpace(page) >= itemsz); } pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE); Assert(pageopaque->hasho_bucket == bucket); } /* found page with enough space, so add the item here */ (void) _hash_pgaddtup(rel, buf, itemsz, itup); /* write and release the modified page */ _hash_wrtbuf(rel, buf); /* We can drop the bucket lock now */ _hash_droplock(rel, blkno, HASH_SHARE); /* * Write-lock the metapage so we can increment the tuple count. After * incrementing it, check to see if it's time for a split. */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ do_expand = metap->hashm_ntuples > (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* Attempt to split if a split is needed */ if (do_expand) _hash_expandtable(rel, metabuf); /* Finally drop our pin on the metapage */ _hash_dropbuf(rel, metabuf); }
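/*
 * Sketch of the split trigger checked at the end of _hash_doinsert() above
 * (and re-checked in _hash_expandtable()): a split is attempted once the
 * live tuple count exceeds ffactor tuples per existing bucket.  Standalone
 * example with made-up numbers.
 */
#include <stdbool.h>
#include <stdio.h>

static bool
needs_split(double ntuples, unsigned ffactor, unsigned maxbucket)
{
	/* maxbucket is the highest bucket number, so the bucket count is maxbucket + 1 */
	return ntuples > (double) ffactor * (maxbucket + 1);
}

int
main(void)
{
	/* 4 buckets, fill factor 75: split once more than 300 tuples are stored */
	printf("%d\n", needs_split(301.0, 75, 3));	/* prints 1 */
	printf("%d\n", needs_split(300.0, 75, 3));	/* prints 0 */
	return 0;
}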
/* * Attempt to expand the hash table by creating one new bucket. * * This will silently do nothing if we don't get cleanup lock on old or * new bucket. * * Complete the pending splits and remove the tuples from old bucket, * if there are any left over from the previous split. * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. */ void _hash_expandtable(Relation rel, Buffer metabuf) { HashMetaPage metap; Bucket old_bucket; Bucket new_bucket; uint32 spare_ndx; BlockNumber start_oblkno; BlockNumber start_nblkno; Buffer buf_nblkno; Buffer buf_oblkno; Page opage; Page npage; HashPageOpaque oopaque; HashPageOpaque nopaque; uint32 maxbucket; uint32 highmask; uint32 lowmask; bool metap_update_masks = false; bool metap_update_splitpoint = false; restart_expand: /* * Write-lock the meta page. It used to be necessary to acquire a * heavyweight lock to begin a split, but that is no longer required. */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check to see if split is still needed; someone else might have already * done one while we waited for the lock. * * Make sure this stays in sync with _hash_doinsert() */ if (metap->hashm_ntuples <= (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) goto fail; /* * Can't split anymore if maxbucket has reached its maximum possible * value. * * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because * the calculation maxbucket+1 mustn't overflow). Currently we restrict * to half that because of overflow looping in _hash_log2() and * insufficient space in hashm_spares[]. It's moot anyway because an * index with 2^32 buckets would certainly overflow BlockNumber and hence * _hash_alloc_buckets() would fail, but if we supported buckets smaller * than a disk block then this would be an independent constraint. * * If you change this, see also the maximum initial number of buckets in * _hash_init(). */ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) goto fail; /* * Determine which bucket is to be split, and attempt to take cleanup lock * on the old bucket. If we can't get the lock, give up. * * The cleanup lock protects us not only against other backends, but * against our own backend as well. * * The cleanup lock is mainly to protect the split from concurrent * inserts. See src/backend/access/hash/README, Lock Definitions for * further details. Due to this locking restriction, if there is any * pending scan, the split will give up which is not good, but harmless. */ new_bucket = metap->hashm_maxbucket + 1; old_bucket = (new_bucket & metap->hashm_lowmask); start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE); if (!buf_oblkno) goto fail; opage = BufferGetPage(buf_oblkno); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* * We want to finish the split from a bucket as there is no apparent * benefit by not doing so and it will make the code complicated to finish * the split that involves multiple buckets considering the case where new * split also fails. We don't need to consider the new bucket for * completing the split here as it is not possible that a re-split of new * bucket starts when there is still a pending split from old bucket. 
*/ if (H_BUCKET_BEING_SPLIT(oopaque)) { /* * Copy bucket mapping info now; refer to the comment in code below where * we copy this information before calling _hash_splitbucket to see * why this is okay. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; /* * Release the lock on metapage and old_bucket, before completing the * split. */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK); _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket, highmask, lowmask); /* release the pin on old buffer and retry for expand. */ _hash_dropbuf(rel, buf_oblkno); goto restart_expand; } /* * Clean up the tuples remaining from the previous split. This operation * requires a cleanup lock and we already have one on the old bucket, so * let's do it. We also don't want to allow further splits from the bucket * till the garbage of the previous split is cleaned. This has two * advantages: first, it helps avoid bloat due to garbage; and * second, during cleanup of a bucket, we are always sure that the * garbage tuples belong to the most recently split bucket. By contrast, * if we allowed cleanup of a bucket after the meta page is updated to indicate * the new split but before the actual split, the cleanup operation wouldn't * be able to decide whether a tuple has been moved to the newly created * bucket, and could end up deleting such tuples. */ if (H_NEEDS_SPLIT_CLEANUP(oopaque)) { /* * Copy bucket mapping info now; refer to the comment in code below * where we copy this information before calling _hash_splitbucket to * see why this is okay. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; /* Release the metapage lock. */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL, maxbucket, highmask, lowmask, NULL, NULL, true, NULL, NULL); _hash_dropbuf(rel, buf_oblkno); goto restart_expand; } /* * There shouldn't be any active scan on new bucket. * * Note: it is safe to compute the new bucket's blkno here, even though we * may still need to update the BUCKET_TO_BLKNO mapping. This is because * the current value of hashm_spares[hashm_ovflpoint] correctly shows * where we are going to put a new splitpoint's worth of buckets. */ start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); /* * If the split point is increasing we need to allocate a new batch of * bucket pages. */ spare_ndx = _hash_spareindex(new_bucket + 1); if (spare_ndx > metap->hashm_ovflpoint) { uint32 buckets_to_add; Assert(spare_ndx == metap->hashm_ovflpoint + 1); /* * We treat allocation of buckets as a separate WAL-logged action. * Even if we fail after this operation, we won't leak bucket pages; * rather, the next split will consume this space. In any case, even * without failure we don't use all the space in one split operation. */ buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket; if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add)) { /* can't split due to BlockNumber overflow */ _hash_relbuf(rel, buf_oblkno); goto fail; } } /* * Physically allocate the new bucket's primary page. We want to do this * before changing the metapage's mapping info, in case we can't get the * disk space. Ideally, we don't need to check for cleanup lock on new * bucket as no other backend could find this bucket unless meta page is * updated. However, it is good to be consistent with old bucket locking.
*/ buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM); if (!IsBufferCleanupOK(buf_nblkno)) { _hash_relbuf(rel, buf_oblkno); _hash_relbuf(rel, buf_nblkno); goto fail; } /* * Since we are scribbling on the pages in the shared buffers, establish a * critical section. Any failure in this next code leaves us with a big * problem: the metapage is effectively corrupt but could get written back * to disk. */ START_CRIT_SECTION(); /* * Okay to proceed with split. Update the metapage bucket mapping info. */ metap->hashm_maxbucket = new_bucket; if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ metap->hashm_lowmask = metap->hashm_highmask; metap->hashm_highmask = new_bucket | metap->hashm_lowmask; metap_update_masks = true; } /* * If the split point is increasing we need to adjust the hashm_spares[] * array and hashm_ovflpoint so that future overflow pages will be created * beyond this new batch of bucket pages. */ if (spare_ndx > metap->hashm_ovflpoint) { metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; metap_update_splitpoint = true; } MarkBufferDirty(metabuf); /* * Copy bucket mapping info now; this saves re-accessing the meta page * inside _hash_splitbucket's inner loop. Note that once we drop the * split lock, other splits could begin, so these values might be out of * date before _hash_splitbucket finishes. That's okay, since all it * needs is to tell which of these two buckets to map hashkeys into. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; opage = BufferGetPage(buf_oblkno); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); /* * Mark the old bucket to indicate that split is in progress. (At * operation end, we will clear the split-in-progress flag.) Also, for a * primary bucket page, hasho_prevblkno stores the number of buckets that * existed as of the last split, so we must update that value here. */ oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; oopaque->hasho_prevblkno = maxbucket; MarkBufferDirty(buf_oblkno); npage = BufferGetPage(buf_nblkno); /* * initialize the new bucket's primary page and mark it to indicate that * split is in progress. 
*/ nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nopaque->hasho_prevblkno = maxbucket; nopaque->hasho_nextblkno = InvalidBlockNumber; nopaque->hasho_bucket = new_bucket; nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; nopaque->hasho_page_id = HASHO_PAGE_ID; MarkBufferDirty(buf_nblkno); /* XLOG stuff */ if (RelationNeedsWAL(rel)) { xl_hash_split_allocate_page xlrec; XLogRecPtr recptr; xlrec.new_bucket = maxbucket; xlrec.old_bucket_flag = oopaque->hasho_flag; xlrec.new_bucket_flag = nopaque->hasho_flag; xlrec.flags = 0; XLogBeginInsert(); XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD); XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT); XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); if (metap_update_masks) { xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS; XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32)); XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32)); } if (metap_update_splitpoint) { xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT; XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint, sizeof(uint32)); XLogRegisterBufData(2, (char *) &metap->hashm_spares[metap->hashm_ovflpoint], sizeof(uint32)); } XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE); PageSetLSN(BufferGetPage(buf_oblkno), recptr); PageSetLSN(BufferGetPage(buf_nblkno), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } END_CRIT_SECTION(); /* drop lock, but keep pin */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, buf_oblkno, buf_nblkno, NULL, maxbucket, highmask, lowmask); /* all done, now release the pins on primary buckets. */ _hash_dropbuf(rel, buf_oblkno); _hash_dropbuf(rel, buf_nblkno); return; /* Here if decide not to split or fail to acquire old bucket lock */ fail: /* We didn't write the metapage, so just drop lock */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); }
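/*
 * Worked sketch of the bucket-mapping updates made in _hash_expandtable()
 * above, with illustrative starting values: the bucket to be split is the
 * new bucket number with its high bit masked off, and when the new bucket
 * number passes the current highmask a new "doubling" starts, so both masks
 * are shifted up.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t	maxbucket = 7;	/* buckets 0..7 exist; index started with 4 */
	uint32_t	lowmask = 3;
	uint32_t	highmask = 7;

	uint32_t	new_bucket = maxbucket + 1;			/* 8 */
	uint32_t	old_bucket = new_bucket & lowmask;	/* 0: bucket 0 is split */

	maxbucket = new_bucket;
	if (new_bucket > highmask)
	{
		/* starting a new doubling: the table now grows toward 16 buckets */
		lowmask = highmask;
		highmask = new_bucket | lowmask;
	}

	/* prints: split bucket 0 into bucket 8; masks now low=7 high=15 */
	printf("split bucket %u into bucket %u; masks now low=%u high=%u\n",
		   old_bucket, new_bucket, lowmask, highmask);
	return 0;
}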
/* * _hash_init_metabuffer() -- Initialize the metadata page of a hash index. */ void _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, uint16 ffactor, bool initpage) { HashMetaPage metap; HashPageOpaque pageopaque; Page page; double dnumbuckets; uint32 num_buckets; uint32 spare_index; uint32 i; /* * Choose the number of initial bucket pages to match the fill factor * given the estimated number of tuples. We round up the result to the * total number of buckets which has to be allocated before using its * _hashm_spare element. However always force at least 2 bucket pages. The * upper limit is determined by considerations explained in * _hash_expandtable(). */ dnumbuckets = num_tuples / ffactor; if (dnumbuckets <= 2.0) num_buckets = 2; else if (dnumbuckets >= (double) 0x40000000) num_buckets = 0x40000000; else num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets)); spare_index = _hash_spareindex(num_buckets); Assert(spare_index < HASH_MAX_SPLITPOINTS); page = BufferGetPage(buf); if (initpage) _hash_pageinit(page, BufferGetPageSize(buf)); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; metap = HashPageGetMeta(page); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; metap->hashm_bsize = HashGetMaxBitmapSize(page); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { if ((1 << i) <= metap->hashm_bsize) break; } Assert(i > 0); metap->hashm_bmsize = 1 << i; metap->hashm_bmshift = i + BYTE_TO_BIT; Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); /* * Label the index with its primary hash support function's OID. This is * pretty useless for normal operation (in fact, hashm_procid is not used * anywhere), but it might be handy for forensic purposes so we keep it. */ metap->hashm_procid = procid; /* * We initialize the index with N buckets, 0 .. N-1, occupying physical * blocks 1 to N. The first freespace bitmap page is in block N+1. */ metap->hashm_maxbucket = num_buckets - 1; /* * Set highmask as next immediate ((2 ^ x) - 1), which should be * sufficient to cover num_buckets. */ metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1; metap->hashm_lowmask = (metap->hashm_highmask >> 1); MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); /* Set up mapping for one spare page after the initial splitpoints */ metap->hashm_spares[spare_index] = 1; metap->hashm_ovflpoint = spare_index; metap->hashm_firstfree = 0; /* * Set pd_lower just past the end of the metadata. This is to log full * page image of metapage in xloginsert.c. */ ((PageHeader) page)->pd_lower = ((char *) metap + sizeof(HashMetaPageData)) - (char *) page; }
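/*
 * Illustrative sketch (standalone, hypothetical helper names): the
 * highmask/lowmask initialization above in isolation.  example_ceil_log2()
 * stands in for _hash_log2(), i.e. the smallest i such that (1 << i) >= num.
 * For num_buckets = 2 this gives highmask = 3 and lowmask = 1.
 */
static unsigned int
example_ceil_log2(unsigned int num)
{
	unsigned int i = 0;
	unsigned int limit = 1;

	while (limit < num)
	{
		limit <<= 1;
		i++;
	}
	return i;
}

static void
example_init_masks(unsigned int num_buckets,
				   unsigned int *highmask, unsigned int *lowmask)
{
	/* next ((2 ^ x) - 1) large enough to cover num_buckets */
	*highmask = (1U << example_ceil_log2(num_buckets + 1)) - 1;
	*lowmask = *highmask >> 1;
}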
/* * _hash_init() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) { Buffer metabuf; Buffer buf; Buffer bitmapbuf; Page pg; HashMetaPage metap; RegProcedure procid; int32 data_width; int32 item_width; int32 ffactor; uint32 num_buckets; uint32 i; bool use_wal; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * WAL log creation of pages if the relation is persistent, or this is the * init fork. Init forks for unlogged relations always need to be WAL * logged. */ use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM; /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. * * Critical section not required, because on error the creation of the * whole relation will be rolled back. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); MarkBufferDirty(metabuf); pg = BufferGetPage(metabuf); metap = HashPageGetMeta(pg); /* XLOG stuff */ if (use_wal) { xl_hash_init_meta_page xlrec; XLogRecPtr recptr; xlrec.num_tuples = num_tuples; xlrec.procid = metap->hashm_procid; xlrec.ffactor = metap->hashm_ffactor; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); PageSetLSN(BufferGetPage(metabuf), recptr); } num_buckets = metap->hashm_maxbucket + 1; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. 
*/ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); /* * Initialize and WAL Log the first N buckets */ for (i = 0; i < num_buckets; i++) { BlockNumber blkno; /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); blkno = BUCKET_TO_BLKNO(metap, i); buf = _hash_getnewbuf(rel, blkno, forkNum); _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); MarkBufferDirty(buf); if (use_wal) log_newpage(&rel->rd_node, forkNum, blkno, BufferGetPage(buf), true); _hash_relbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); /* * Initialize bitmap page */ bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); MarkBufferDirty(bitmapbuf); /* add the new bitmap page to the metapage's list of bitmaps */ /* metapage already has a write lock */ if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("out of overflow pages in hash index \"%s\"", RelationGetRelationName(rel)))); metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; metap->hashm_nmaps++; MarkBufferDirty(metabuf); /* XLOG stuff */ if (use_wal) { xl_hash_init_bitmap_page xlrec; XLogRecPtr recptr; xlrec.bmsize = metap->hashm_bmsize; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); /* * This is safe only because nobody else can be modifying the index at * this stage; it's only visible to the transaction that is creating * it. */ XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); PageSetLSN(BufferGetPage(bitmapbuf), recptr); PageSetLSN(BufferGetPage(metabuf), recptr); } /* all done */ _hash_relbuf(rel, bitmapbuf); _hash_relbuf(rel, metabuf); return num_buckets; }
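/*
 * Illustrative sketch (standalone): the physical layout _hash_init()
 * produces before any split or overflow page exists -- block 0 is the
 * metapage, blocks 1..N are the primary bucket pages, and block N+1 is the
 * first bitmap page.  The helper name is made up; the real code always goes
 * through BUCKET_TO_BLKNO(), which also accounts for later split points.
 */
static unsigned int
example_initial_block_for_bucket(unsigned int bucket)
{
	return bucket + 1;			/* block 0 is the metapage */
}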
/* * replay squeeze page operation of hash index */ static void hash_xlog_squeeze_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); Buffer bucketbuf = InvalidBuffer; Buffer writebuf; Buffer ovflbuf; Buffer prevbuf = InvalidBuffer; Buffer mapbuf; XLogRedoAction action; /* * Ensure we have a cleanup lock on primary bucket page before we start * with the actual replay operation. This is to ensure that neither a * scan can start nor a scan can be already-in-progress during the replay * of this operation. If we allow scans during this operation, then they * can miss some records or show the same record multiple times. */ if (xldata->is_prim_bucket_same_wrt) action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); else { /* * we don't care for return value as the purpose of reading bucketbuf * is to ensure a cleanup lock on primary bucket page. */ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); action = XLogReadBufferForRedo(record, 1, &writebuf); } /* replay the record for adding entries in overflow buffer */ if (action == BLK_NEEDS_REDO) { Page writepage; char *begin; char *data; Size datalen; uint16 ninserted = 0; data = begin = XLogRecGetBlockData(record, 1, &datalen); writepage = (Page) BufferGetPage(writebuf); if (xldata->ntups > 0) { OffsetNumber *towrite = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntups; while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size itemsz; OffsetNumber l; itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); data += itemsz; l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); if (l == InvalidOffsetNumber) elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", (int) itemsz); ninserted++; } } /* * number of tuples inserted must be same as requested in REDO record. */ Assert(ninserted == xldata->ntups); /* * if the page on which are adding tuples is a page previous to freed * overflow page, then update its nextblno. 
*/ if (xldata->is_prev_bucket_same_wrt) { HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage); writeopaque->hasho_nextblkno = xldata->nextblkno; } PageSetLSN(writepage, lsn); MarkBufferDirty(writebuf); } /* replay the record for initializing overflow buffer */ if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) { Page ovflpage; ovflpage = BufferGetPage(ovflbuf); _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); PageSetLSN(ovflpage, lsn); MarkBufferDirty(ovflbuf); } if (BufferIsValid(ovflbuf)) UnlockReleaseBuffer(ovflbuf); /* replay the record for page previous to the freed overflow page */ if (!xldata->is_prev_bucket_same_wrt && XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) { Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); prevopaque->hasho_nextblkno = xldata->nextblkno; PageSetLSN(prevpage, lsn); MarkBufferDirty(prevbuf); } if (BufferIsValid(prevbuf)) UnlockReleaseBuffer(prevbuf); /* replay the record for page next to the freed overflow page */ if (XLogRecHasBlockRef(record, 4)) { Buffer nextbuf; if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) { Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); nextopaque->hasho_prevblkno = xldata->prevblkno; PageSetLSN(nextpage, lsn); MarkBufferDirty(nextbuf); } if (BufferIsValid(nextbuf)) UnlockReleaseBuffer(nextbuf); } if (BufferIsValid(writebuf)) UnlockReleaseBuffer(writebuf); if (BufferIsValid(bucketbuf)) UnlockReleaseBuffer(bucketbuf); /* * Note: in normal operation, we'd update the bitmap and meta page while * still holding lock on the primary bucket page and overflow pages. But * during replay it's not necessary to hold those locks, since no other * index updates can be happening concurrently. */ /* replay the record for bitmap page */ if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) { Page mappage = (Page) BufferGetPage(mapbuf); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; Size datalen; freep = HashPageGetBitmap(mappage); data = XLogRecGetBlockData(record, 5, &datalen); bitmap_page_bit = (uint32 *) data; CLRBIT(freep, *bitmap_page_bit); PageSetLSN(mappage, lsn); MarkBufferDirty(mapbuf); } if (BufferIsValid(mapbuf)) UnlockReleaseBuffer(mapbuf); /* replay the record for meta page */ if (XLogRecHasBlockRef(record, 6)) { Buffer metabuf; if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) { HashMetaPage metap; Page page; char *data; uint32 *firstfree_ovflpage; Size datalen; data = XLogRecGetBlockData(record, 6, &datalen); firstfree_ovflpage = (uint32 *) data; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_firstfree = *firstfree_ovflpage; PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); } }
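/*
 * Illustrative sketch (standalone, simplified types): the layout of the
 * block-1 payload walked by the redo loop above -- an array of ntups
 * target offset numbers followed by the tuples themselves, each occupying
 * its maximum-aligned size.  EXAMPLE_ALIGN8 is a stand-in for MAXALIGN
 * (assuming 8-byte maximum alignment) and the helper is hypothetical; only
 * the walking arithmetic matters.
 */
#define EXAMPLE_ALIGN8(len)  (((len) + 7) & ~((unsigned long) 7))

static unsigned long
example_squeeze_payload_size(unsigned short ntups,
							 const unsigned long *tuple_sizes)
{
	unsigned long size = sizeof(unsigned short) * ntups;	/* offset array */
	unsigned short i;

	for (i = 0; i < ntups; i++)
		size += EXAMPLE_ALIGN8(tuple_sizes[i]);	/* aligned tuple bodies */

	return size;
}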
/* * Attempt to expand the hash table by creating one new bucket. * * This will silently do nothing if it cannot get the needed locks. * * The caller should hold no locks on the hash index. * * The caller must hold a pin, but no lock, on the metapage buffer. * The buffer is returned in the same state. */ void _hash_expandtable(Relation rel, Buffer metabuf) { HashMetaPage metap; Bucket old_bucket; Bucket new_bucket; uint32 spare_ndx; BlockNumber start_oblkno; BlockNumber start_nblkno; uint32 maxbucket; uint32 highmask; uint32 lowmask; /* * Obtain the page-zero lock to assert the right to begin a split (see * README). * * Note: deadlock should be impossible here. Our own backend could only be * holding bucket sharelocks due to stopped indexscans; those will not * block other holders of the page-zero lock, who are only interested in * acquiring bucket sharelocks themselves. Exclusive bucket locks are * only taken here and in hashbulkdelete, and neither of these operations * needs any additional locks to complete. (If, due to some flaw in this * reasoning, we manage to deadlock anyway, it's okay to error out; the * index will be left in a consistent state.) */ _hash_getlock(rel, 0, HASH_EXCLUSIVE); /* Write-lock the meta page */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Check to see if split is still needed; someone else might have already * done one while we waited for the lock. * * Make sure this stays in sync with _hash_doinsert() */ if (metap->hashm_ntuples <= (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) goto fail; /* * Can't split anymore if maxbucket has reached its maximum possible * value. * * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because * the calculation maxbucket+1 mustn't overflow). Currently we restrict * to half that because of overflow looping in _hash_log2() and * insufficient space in hashm_spares[]. It's moot anyway because an * index with 2^32 buckets would certainly overflow BlockNumber and hence * _hash_alloc_buckets() would fail, but if we supported buckets smaller * than a disk block then this would be an independent constraint. * * If you change this, see also the maximum initial number of buckets in * _hash_metapinit(). */ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) goto fail; /* * Determine which bucket is to be split, and attempt to lock the old * bucket. If we can't get the lock, give up. * * The lock protects us against other backends, but not against our own * backend. Must check for active scans separately. */ new_bucket = metap->hashm_maxbucket + 1; old_bucket = (new_bucket & metap->hashm_lowmask); start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); if (_hash_has_active_scan(rel, old_bucket)) goto fail; if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE)) goto fail; /* * Likewise lock the new bucket (should never fail). * * Note: it is safe to compute the new bucket's blkno here, even though we * may still need to update the BUCKET_TO_BLKNO mapping. This is because * the current value of hashm_spares[hashm_ovflpoint] correctly shows * where we are going to put a new splitpoint's worth of buckets. 
*/ start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); if (_hash_has_active_scan(rel, new_bucket)) elog(ERROR, "scan in progress on supposedly new bucket"); if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE)) elog(ERROR, "could not get lock on supposedly new bucket"); /* * If the split point is increasing (hashm_maxbucket's log base 2 * increases), we need to allocate a new batch of bucket pages. */ spare_ndx = _hash_log2(new_bucket + 1); if (spare_ndx > metap->hashm_ovflpoint) { Assert(spare_ndx == metap->hashm_ovflpoint + 1); /* * The number of buckets in the new splitpoint is equal to the total * number already in existence, i.e. new_bucket. Currently this maps * one-to-one to blocks required, but someday we may need a more * complicated calculation here. */ if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) { /* can't split due to BlockNumber overflow */ _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); goto fail; } } /* * Okay to proceed with split. Update the metapage bucket mapping info. * * Since we are scribbling on the metapage data right in the shared * buffer, any failure in this next little bit leaves us with a big * problem: the metapage is effectively corrupt but could get written back * to disk. We don't really expect any failure, but just to be sure, * establish a critical section. */ START_CRIT_SECTION(); metap->hashm_maxbucket = new_bucket; if (new_bucket > metap->hashm_highmask) { /* Starting a new doubling */ metap->hashm_lowmask = metap->hashm_highmask; metap->hashm_highmask = new_bucket | metap->hashm_lowmask; } /* * If the split point is increasing (hashm_maxbucket's log base 2 * increases), we need to adjust the hashm_spares[] array and * hashm_ovflpoint so that future overflow pages will be created beyond * this new batch of bucket pages. */ if (spare_ndx > metap->hashm_ovflpoint) { metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; } /* Done mucking with metapage */ END_CRIT_SECTION(); /* * Copy bucket mapping info now; this saves re-accessing the meta page * inside _hash_splitbucket's inner loop. Note that once we drop the * split lock, other splits could begin, so these values might be out of * date before _hash_splitbucket finishes. That's okay, since all it * needs is to tell which of these two buckets to map hashkeys into. */ maxbucket = metap->hashm_maxbucket; highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; /* Write out the metapage and drop lock, but keep pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* Release split lock; okay for other splits to occur now */ _hash_droplock(rel, 0, HASH_EXCLUSIVE); /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, start_oblkno, start_nblkno, maxbucket, highmask, lowmask); /* Release bucket locks, allowing others to access them */ _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE); _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE); return; /* Here if decide not to split or fail to acquire old bucket lock */ fail: /* We didn't write the metapage, so just drop lock */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); /* Release split lock */ _hash_droplock(rel, 0, HASH_EXCLUSIVE); }
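/*
 * Illustrative sketch (standalone): the mask update performed when a split
 * starts a new doubling, as in the critical section above.  For example,
 * with highmask = 3 and lowmask = 1, creating bucket 4 yields lowmask = 3
 * and highmask = 7.  The helper name is made up for the example.
 */
static void
example_start_new_doubling(unsigned int new_bucket,
						   unsigned int *highmask, unsigned int *lowmask)
{
	if (new_bucket > *highmask)
	{
		*lowmask = *highmask;
		*highmask = new_bucket | *lowmask;
	}
}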
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * This function also deletes the tuples that are moved by split to other * bucket. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf; HashMetaPage metap; HashMetaPageData local_metapage; tuples_removed = 0; num_index_tuples = 0; /* * Read the metapage to fetch original bucket and tuple counts. Also, we * keep a copy of the last-seen metapage so that we can use its * hashm_spares[] values to compute bucket page addresses. This is a bit * hokey but perfectly safe, since the interesting entries in the spares * array cannot change under us; and it beats rereading the metapage for * each bucket. */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); orig_maxbucket = metap->hashm_maxbucket; orig_ntuples = metap->hashm_ntuples; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; Buffer bucket_buf; Buffer buf; HashPageOpaque bucket_opaque; Page page; bool split_cleanup = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket); blkno = bucket_blkno; /* * We need to acquire a cleanup lock on the primary bucket page to out * wait concurrent scans before deleting the dead tuples. */ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); LockBufferForCleanup(buf); _hash_checkpage(rel, buf, LH_BUCKET_PAGE); page = BufferGetPage(buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* * If the bucket contains tuples that are moved by split, then we need * to delete such tuples. We can't delete such tuples if the split * operation on bucket is not finished as those are needed by scans. */ if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) split_cleanup = true; bucket_buf = buf; hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, local_metapage.hashm_maxbucket, local_metapage.hashm_highmask, local_metapage.hashm_lowmask, &tuples_removed, &num_index_tuples, split_cleanup, callback, callback_state); _hash_dropbuf(rel, bucket_buf); /* Advance to next bucket */ cur_bucket++; } /* Write-lock metapage and check for split since we started */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ cur_maxbucket = metap->hashm_maxbucket; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. 
*/ if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } _hash_wrtbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ return stats; }
/* ------------------------------------------------ * hash_metapage_info() * * Get the meta-page information for a hash index * * Usage: SELECT * FROM hash_metapage_info(get_raw_page('con_hash_index', 0)) * ------------------------------------------------ */ Datum hash_metapage_info(PG_FUNCTION_ARGS) { bytea *raw_page = PG_GETARG_BYTEA_P(0); Page page; HashMetaPageData *metad; TupleDesc tupleDesc; HeapTuple tuple; int i, j; Datum values[16]; bool nulls[16]; Datum spares[HASH_MAX_SPLITPOINTS]; Datum mapp[HASH_MAX_BITMAPS]; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); page = verify_hash_page(raw_page, LH_META_PAGE); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); tupleDesc = BlessTupleDesc(tupleDesc); metad = HashPageGetMeta(page); MemSet(nulls, 0, sizeof(nulls)); j = 0; values[j++] = Int64GetDatum((int64) metad->hashm_magic); values[j++] = Int64GetDatum((int64) metad->hashm_version); values[j++] = Float8GetDatum(metad->hashm_ntuples); values[j++] = Int32GetDatum((int32) metad->hashm_ffactor); values[j++] = Int32GetDatum((int32) metad->hashm_bsize); values[j++] = Int32GetDatum((int32) metad->hashm_bmsize); values[j++] = Int32GetDatum((int32) metad->hashm_bmshift); values[j++] = Int64GetDatum((int64) metad->hashm_maxbucket); values[j++] = Int64GetDatum((int64) metad->hashm_highmask); values[j++] = Int64GetDatum((int64) metad->hashm_lowmask); values[j++] = Int64GetDatum((int64) metad->hashm_ovflpoint); values[j++] = Int64GetDatum((int64) metad->hashm_firstfree); values[j++] = Int64GetDatum((int64) metad->hashm_nmaps); values[j++] = ObjectIdGetDatum((Oid) metad->hashm_procid); for (i = 0; i < HASH_MAX_SPLITPOINTS; i++) spares[i] = Int64GetDatum((int64) metad->hashm_spares[i]); values[j++] = PointerGetDatum(construct_array(spares, HASH_MAX_SPLITPOINTS, INT8OID, 8, FLOAT8PASSBYVAL, 'd')); for (i = 0; i < HASH_MAX_BITMAPS; i++) mapp[i] = Int64GetDatum((int64) metad->hashm_mapp[i]); values[j++] = PointerGetDatum(construct_array(mapp, HASH_MAX_BITMAPS, INT8OID, 8, FLOAT8PASSBYVAL, 'd')); tuple = heap_form_tuple(tupleDesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); }
/* * _hash_checkpage -- sanity checks on the format of all hash pages * * If flags is not zero, it is a bitwise OR of the acceptable values of * hasho_flag. */ void _hash_checkpage(Relation rel, Buffer buf, int flags) { Page page = BufferGetPage(buf); /* * ReadBuffer verifies that every newly-read page passes * PageHeaderIsValid, which means it either contains a reasonably sane * page header or is all-zero. We have to defend against the all-zero * case, however. */ if (PageIsNew(page)) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains unexpected zero page at block %u", RelationGetRelationName(rel), BufferGetBlockNumber(buf)), errhint("Please REINDEX it."))); /* * Additionally check that the special area looks sane. */ if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains corrupted page at block %u", RelationGetRelationName(rel), BufferGetBlockNumber(buf)), errhint("Please REINDEX it."))); if (flags) { HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page); if ((opaque->hasho_flag & flags) == 0) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains corrupted page at block %u", RelationGetRelationName(rel), BufferGetBlockNumber(buf)), errhint("Please REINDEX it."))); } /* * When checking the metapage, also verify magic number and version. */ if (flags == LH_META_PAGE) { HashMetaPage metap = HashPageGetMeta(page); if (metap->hashm_magic != HASH_MAGIC) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" is not a hash index", RelationGetRelationName(rel)))); if (metap->hashm_version != HASH_VERSION) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" has wrong hash version", RelationGetRelationName(rel)), errhint("Please REINDEX it."))); } }
/* * _hash_first() -- Find the first item in a scan. * * Find the first item in the index that * satisfies the qualification associated with the scan descriptor. On * success, the page containing the current index tuple is read locked * and pinned, and the scan's opaque data entry is updated to * include the buffer. */ bool _hash_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; ScanKey cur; uint32 hashkey; Bucket bucket; BlockNumber blkno; Buffer buf; Buffer metabuf; Page page; HashPageOpaque opaque; HashMetaPage metap; IndexTuple itup; ItemPointer current; OffsetNumber offnum; pgstat_count_index_scan(rel); current = &(so->hashso_curpos); ItemPointerSetInvalid(current); /* * We do not support hash scans with no index qualification, because we * would have to read the whole index rather than just one bucket. That * creates a whole raft of problems, since we haven't got a practical way * to lock all the buckets against splits or compactions. */ if (scan->numberOfKeys < 1) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("hash indexes do not support whole-index scans"))); /* There may be more than one index qual, but we hash only the first */ cur = &scan->keyData[0]; /* We support only single-column hash indexes */ Assert(cur->sk_attno == 1); /* And there's only one operator strategy, too */ Assert(cur->sk_strategy == HTEqualStrategyNumber); /* * If the constant in the index qual is NULL, assume it cannot match any * items in the index. */ if (cur->sk_flags & SK_ISNULL) return false; /* * Okay to compute the hash key. We want to do this before acquiring any * locks, in case a user-defined hash function happens to be slow. * * If scankey operator is not a cross-type comparison, we can use the * cached hash function; otherwise gotta look it up in the catalogs. * * We support the convention that sk_subtype == InvalidOid means the * opclass input type; this is a hack to simplify life for ScanKeyInit(). */ if (cur->sk_subtype == rel->rd_opcintype[0] || cur->sk_subtype == InvalidOid) hashkey = _hash_datum2hashkey(rel, cur->sk_argument); else hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument, cur->sk_subtype); so->hashso_sk_hash = hashkey; /* * Acquire shared split lock so we can compute the target bucket safely * (see README). */ _hash_getlock(rel, 0, HASH_SHARE); /* Read the metapage */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* * Compute the target bucket number, and convert to block number. */ bucket = _hash_hashkey2bucket(hashkey, metap->hashm_maxbucket, metap->hashm_highmask, metap->hashm_lowmask); blkno = BUCKET_TO_BLKNO(metap, bucket); /* done with the metapage */ _hash_relbuf(rel, metabuf); /* * Acquire share lock on target bucket; then we can release split lock. 
*/ _hash_getlock(rel, blkno, HASH_SHARE); _hash_droplock(rel, 0, HASH_SHARE); /* Update scan opaque state to show we have lock on the bucket */ so->hashso_bucket = bucket; so->hashso_bucket_valid = true; so->hashso_bucket_blkno = blkno; /* Fetch the primary bucket page for the bucket */ buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == bucket); /* If a backwards scan is requested, move to the end of the chain */ if (ScanDirectionIsBackward(dir)) { while (BlockNumberIsValid(opaque->hasho_nextblkno)) _hash_readnext(rel, &buf, &page, &opaque); } /* Now find the first tuple satisfying the qualification */ if (!_hash_step(scan, &buf, dir)) return false; /* if we're here, _hash_step found a valid tuple */ offnum = ItemPointerGetOffsetNumber(current); _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); so->hashso_heappos = itup->t_tid; return true; }
/* * _hash_freeovflpage() - * * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. * * Returns the block number of the page that followed the given page * in the bucket, or InvalidBlockNumber if no following page. * * NB: caller must not hold lock on metapage, nor on either page that's * adjacent in the bucket chain. The caller had better hold exclusive lock * on the bucket, too. */ BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, BufferAccessStrategy bstrategy) { HashMetaPage metap; Buffer metabuf; Buffer mapbuf; BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; BlockNumber nextblkno; HashPageOpaque ovflopaque; Page ovflpage; Page mappage; uint32 *freep; uint32 ovflbitno; int32 bitmappage, bitmapbit; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); ovflblkno = BufferGetBlockNumber(ovflbuf); ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); nextblkno = ovflopaque->hasho_nextblkno; prevblkno = ovflopaque->hasho_prevblkno; bucket = ovflopaque->hasho_bucket; /* * Zero the page for debugging's sake; then write and release it. (Note: * if we failed to zero the page here, we'd have problems with the Assert * in _hash_pageinit() when the page is reused.) */ MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf)); _hash_wrtbuf(rel, ovflbuf); /* * Fix up the bucket chain. this is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being * deleted. No concurrency issues since we hold exclusive lock on the * entire bucket. 
*/ if (BlockNumberIsValid(prevblkno)) { Buffer prevbuf = _hash_getbuf_with_strategy(rel, prevblkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, bstrategy); Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); Assert(prevopaque->hasho_bucket == bucket); prevopaque->hasho_nextblkno = nextblkno; _hash_wrtbuf(rel, prevbuf); } if (BlockNumberIsValid(nextblkno)) { Buffer nextbuf = _hash_getbuf_with_strategy(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); Assert(nextopaque->hasho_bucket == bucket); nextopaque->hasho_prevblkno = prevblkno; _hash_wrtbuf(rel, nextbuf); } /* Note: bstrategy is intentionally not used for metapage and bitmap */ /* Read the metapage so we can determine which bitmap page to use */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* Identify which bit to set */ ovflbitno = blkno_to_bitno(metap, ovflblkno); bitmappage = ovflbitno >> BMPG_SHIFT(metap); bitmapbit = ovflbitno & BMPG_MASK(metap); if (bitmappage >= metap->hashm_nmaps) elog(ERROR, "invalid overflow bit number %u", ovflbitno); blkno = metap->hashm_mapp[bitmappage]; /* Release metapage lock while we access the bitmap page */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); /* Clear the bitmap bit to indicate that this overflow page is free */ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); Assert(ISSET(freep, bitmapbit)); CLRBIT(freep, bitmapbit); _hash_wrtbuf(rel, mapbuf); /* Get write-lock on metapage to update firstfree */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); /* if this is now the first free page, update hashm_firstfree */ if (ovflbitno < metap->hashm_firstfree) { metap->hashm_firstfree = ovflbitno; _hash_wrtbuf(rel, metabuf); } else { /* no need to change metapage */ _hash_relbuf(rel, metabuf); } return nextblkno; }
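/*
 * Illustrative sketch (standalone): how an overflow page's bit number is
 * split into a bitmap page index and a bit position within that page,
 * matching the BMPG_SHIFT/BMPG_MASK arithmetic above.  bits_per_mappage is
 * assumed to be a power of two (BMPGSZ_BIT(metap) in the real code), so the
 * division and modulo reduce to the shift and mask actually used.
 */
static void
example_locate_ovfl_bit(unsigned int ovflbitno, unsigned int bits_per_mappage,
						unsigned int *bitmappage, unsigned int *bitmapbit)
{
	*bitmappage = ovflbitno / bits_per_mappage;	/* which bitmap page */
	*bitmapbit = ovflbitno % bits_per_mappage;	/* bit within that page */
}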
/* ------------------------------------------------------ * pgstathashindex() * * Usage: SELECT * FROM pgstathashindex('hashindex'); * ------------------------------------------------------ */ Datum pgstathashindex(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); BlockNumber nblocks; BlockNumber blkno; Relation rel; HashIndexStat stats; BufferAccessStrategy bstrategy; HeapTuple tuple; TupleDesc tupleDesc; Datum values[8]; bool nulls[8]; Buffer metabuf; HashMetaPage metap; float8 free_percent; uint64 total_space; rel = index_open(relid, AccessShareLock); /* index_open() checks that it's an index */ if (!IS_HASH(rel)) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("relation \"%s\" is not a HASH index", RelationGetRelationName(rel)))); /* * Reject attempts to read non-local temporary relations; we would be * likely to get wrong data since we have no visibility into the owning * session's local buffers. */ if (RELATION_IS_OTHER_TEMP(rel)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary indexes of other sessions"))); /* Get the information we need from the metapage. */ memset(&stats, 0, sizeof(stats)); metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); stats.version = metap->hashm_version; stats.space_per_page = metap->hashm_bsize; _hash_relbuf(rel, metabuf); /* Get the current relation length */ nblocks = RelationGetNumberOfBlocks(rel); /* prepare access strategy for this index */ bstrategy = GetAccessStrategy(BAS_BULKREAD); /* Start from blkno 1 as 0th block is metapage */ for (blkno = 1; blkno < nblocks; blkno++) { Buffer buf; Page page; CHECK_FOR_INTERRUPTS(); buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, BUFFER_LOCK_SHARE); page = (Page) BufferGetPage(buf); if (PageIsNew(page)) stats.unused_pages++; else if (PageGetSpecialSize(page) != MAXALIGN(sizeof(HashPageOpaqueData))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" contains corrupted page at block %u", RelationGetRelationName(rel), BufferGetBlockNumber(buf)))); else { HashPageOpaque opaque; int pagetype; opaque = (HashPageOpaque) PageGetSpecialPointer(page); pagetype = opaque->hasho_flag & LH_PAGE_TYPE; if (pagetype == LH_BUCKET_PAGE) { stats.bucket_pages++; GetHashPageStats(page, &stats); } else if (pagetype == LH_OVERFLOW_PAGE) { stats.overflow_pages++; GetHashPageStats(page, &stats); } else if (pagetype == LH_BITMAP_PAGE) stats.bitmap_pages++; else if (pagetype == LH_UNUSED_PAGE) stats.unused_pages++; else ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("unexpected page type 0x%04X in HASH index \"%s\" block %u", opaque->hasho_flag, RelationGetRelationName(rel), BufferGetBlockNumber(buf)))); } UnlockReleaseBuffer(buf); } /* Done accessing the index */ index_close(rel, AccessShareLock); /* Count unused pages as free space. */ stats.free_space += stats.unused_pages * stats.space_per_page; /* * Total space available for tuples excludes the metapage and the bitmap * pages. 
*/ total_space = (nblocks - (stats.bitmap_pages + 1)) * stats.space_per_page; if (total_space == 0) free_percent = 0.0; else free_percent = 100.0 * stats.free_space / total_space; /* * Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); tupleDesc = BlessTupleDesc(tupleDesc); /* * Build and return the tuple */ MemSet(nulls, 0, sizeof(nulls)); values[0] = Int32GetDatum(stats.version); values[1] = Int64GetDatum((int64) stats.bucket_pages); values[2] = Int64GetDatum((int64) stats.overflow_pages); values[3] = Int64GetDatum((int64) stats.bitmap_pages); values[4] = Int64GetDatum((int64) stats.unused_pages); values[5] = Int64GetDatum(stats.live_items); values[6] = Int64GetDatum(stats.dead_items); values[7] = Float8GetDatum(free_percent); tuple = heap_form_tuple(tupleDesc, values, nulls); PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); }
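/*
 * Illustrative sketch (standalone): the free-space percentage computed
 * above.  Space available for tuples excludes the metapage and the bitmap
 * pages, hence the "(bitmap_pages + 1)" term.  The helper name is made up.
 */
static double
example_hash_free_percent(unsigned long nblocks, unsigned long bitmap_pages,
						  unsigned long space_per_page, double free_space)
{
	double total_space = (double) (nblocks - (bitmap_pages + 1)) * space_per_page;

	return (total_space == 0.0) ? 0.0 : 100.0 * free_space / total_space;
}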
/*
 * _hash_addovflpage
 *
 * Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 * On entry, the caller must hold a pin but no lock on 'buf'.  The pin is
 * dropped before exiting (we assume the caller is not interested in 'buf'
 * anymore).  The returned overflow page will be pinned and write-locked;
 * it is guaranteed to be empty.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * That buffer is returned in the same state.
 *
 * The caller must hold at least share lock on the bucket, to ensure that
 * no one else tries to compact the bucket meanwhile.  This guarantees that
 * 'buf' won't stop being part of the bucket while it's unlocked.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.  Additional overflow
 * pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
	Buffer		ovflbuf;
	Page		page;
	Page		ovflpage;
	HashPageOpaque pageopaque;
	HashPageOpaque ovflopaque;

	/* CS3223: variables for the per-bucket overflow bookkeeping below */
	HashMetaPage metap;
	Bucket		bucket;
	int			i;
	int			index;
	int			bitIndexInElement;
	uint32		ovflElement;
	uint32	   *tempPointer;

	/* allocate and lock an empty overflow page */
	ovflbuf = _hash_getovflpage(rel, metabuf);

	/* CS3223 */
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/* find bucket number of primary page */
	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = pageopaque->hasho_bucket;

	/*
	 * Write-lock the tail page.  It is okay to hold two buffer locks here
	 * since there cannot be anyone else contending for access to ovflbuf.
	 */
	_hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE);

	/* probably redundant... */
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

	/*
	 * Loop to find the current tail page, in case someone else inserted too.
	 * CS3223: 'i' counts the pages already in the bucket chain, starting
	 * with the primary page.
	 */
	i = 1;
	for (;;)
	{
		BlockNumber nextblkno;

		page = BufferGetPage(buf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno))
			break;

		/* we assume we do not need to write the unmodified page */
		_hash_relbuf(rel, buf);

		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);

		/* CS3223 */
		i++;
	}

	/* now that we have correct backlink, initialize new overflow page */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* logically chain overflow page to previous page */
	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
	_hash_wrtbuf(rel, buf);

	/*
	 * CS3223: if the chain consisted of the primary page alone, the page we
	 * just added is this bucket's first overflow page, so record that fact
	 * by setting the bucket's bit in the metapage's ovflBkts bitmap.
	 */
	if (i == 1)
	{
		index = bucket / 32;
		bitIndexInElement = bucket % 32;
		ovflElement = (uint32) metap->ovflBkts[index];
		ovflElement = ovflElement | (((uint32) 1) << bitIndexInElement);
		tempPointer = &(metap->ovflBkts[index]);
		*tempPointer = ovflElement;
	}

	return ovflbuf;
}
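/*
 * Illustrative sketch (standalone): the per-bucket bit bookkeeping used by
 * the CS3223 addition above.  Bucket b lives in word b / 32, bit b % 32 of
 * the ovflBkts array; the helpers below show setting and testing that bit.
 * Names are made up for the example.
 */
static void
example_mark_bucket_has_ovfl(unsigned int *ovfl_words, unsigned int bucket)
{
	ovfl_words[bucket / 32] |= 1U << (bucket % 32);
}

static int
example_bucket_has_ovfl(const unsigned int *ovfl_words, unsigned int bucket)
{
	return (ovfl_words[bucket / 32] >> (bucket % 32)) & 1U;
}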
/* * _hash_getovflpage() * * Find an available overflow page and return it. The returned buffer * is pinned and write-locked, and has had _hash_pageinit() applied, * but it is caller's responsibility to fill the special space. * * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is left in the same state at exit. */ static Buffer _hash_getovflpage(Relation rel, Buffer metabuf) { HashMetaPage metap; Buffer mapbuf = 0; Buffer newbuf; BlockNumber blkno; uint32 orig_firstfree; uint32 splitnum; uint32 *freep = NULL; uint32 max_ovflpg; uint32 bit; uint32 first_page; uint32 last_bit; uint32 last_page; uint32 i, j; /* Get exclusive lock on the meta page */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); _hash_checkpage(rel, metabuf, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); /* start search at hashm_firstfree */ orig_firstfree = metap->hashm_firstfree; first_page = orig_firstfree >> BMPG_SHIFT(metap); bit = orig_firstfree & BMPG_MASK(metap); i = first_page; j = bit / BITS_PER_MAP; bit &= ~(BITS_PER_MAP - 1); /* outer loop iterates once per bitmap page */ for (;;) { BlockNumber mapblkno; Page mappage; uint32 last_inpage; /* want to end search with the last existing overflow page */ splitnum = metap->hashm_ovflpoint; max_ovflpg = metap->hashm_spares[splitnum] - 1; last_page = max_ovflpg >> BMPG_SHIFT(metap); last_bit = max_ovflpg & BMPG_MASK(metap); if (i > last_page) break; Assert(i < metap->hashm_nmaps); mapblkno = metap->hashm_mapp[i]; if (i == last_page) last_inpage = last_bit; else last_inpage = BMPGSZ_BIT(metap) - 1; /* Release exclusive lock on metapage while reading bitmap page */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) { if (freep[j] != ALL_SET) goto found; } /* No free space here, try to advance to next map page */ _hash_relbuf(rel, mapbuf); i++; j = 0; /* scan from start of next map page */ bit = 0; /* Reacquire exclusive lock on the meta page */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); } /* * No free pages --- have to extend the relation to add an overflow page. * First, check to see if we have to add a new bitmap page too. */ if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1)) { /* * We create the new bitmap page with all pages marked "in use". * Actually two pages in the new bitmap's range will exist * immediately: the bitmap page itself, and the following page which * is the one we return to the caller. Both of these are correctly * marked "in use". Subsequent pages do not exist yet, but it is * convenient to pre-mark them as "in use" too. */ bit = metap->hashm_spares[splitnum]; _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM); metap->hashm_spares[splitnum]++; } else { /* * Nothing to do here; since the page will be past the last used page, * we know its bitmap bit was preinitialized to "in use". */ } /* Calculate address of the new overflow page */ bit = metap->hashm_spares[splitnum]; blkno = bitno_to_blkno(metap, bit); /* * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the * relation length stays in sync with ours. XXX It's annoying to do this * with metapage write lock held; would be better to use a lock that * doesn't block incoming searches. 
*/ newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); metap->hashm_spares[splitnum]++; /* * Adjust hashm_firstfree to avoid redundant searches. But don't risk * changing it if someone moved it while we were searching bitmap pages. */ if (metap->hashm_firstfree == orig_firstfree) metap->hashm_firstfree = bit + 1; /* Write updated metapage and release lock, but not pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); return newbuf; found: /* convert bit to bit number within page */ bit += _hash_firstfreebit(freep[j]); /* mark page "in use" in the bitmap */ SETBIT(freep, bit); _hash_wrtbuf(rel, mapbuf); /* Reacquire exclusive lock on the meta page */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); /* convert bit to absolute bit number */ bit += (i << BMPG_SHIFT(metap)); /* Calculate address of the recycled overflow page */ blkno = bitno_to_blkno(metap, bit); /* * Adjust hashm_firstfree to avoid redundant searches. But don't risk * changing it if someone moved it while we were searching bitmap pages. */ if (metap->hashm_firstfree == orig_firstfree) { metap->hashm_firstfree = bit + 1; /* Write updated metapage and release lock, but not pin */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); } else { /* We didn't change the metapage, so no need to write */ _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK); } /* Fetch, init, and return the recycled page */ return _hash_getinitbuf(rel, blkno); }
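/*
 * Illustrative sketch (standalone): finding the first clear bit in a bitmap
 * word, as _hash_firstfreebit() does for the "freep[j] != ALL_SET" word
 * located above.  Returns 32 if every bit is set; the real caller never
 * passes an all-set word.
 */
static unsigned int
example_first_free_bit(unsigned int map)
{
	unsigned int i;

	for (i = 0; i < 32; i++)
	{
		if ((map & (1U << i)) == 0)
			return i;
	}
	return 32;
}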
/* * replay delete operation in hash index to remove * tuples marked as DEAD during index tuple insertion. */ static void hash_xlog_vacuum_one_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_vacuum_one_page *xldata; Buffer buffer; Buffer metabuf; Page page; XLogRedoAction action; HashPageOpaque pageopaque; xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record); /* * If we have any conflict processing to do, it must happen before we * update the page. * * Hash index records that are marked as LP_DEAD and being removed during * hash index tuple insertion can conflict with standby queries. You might * think that vacuum records would conflict as well, but we've handled * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid * cleaned by the vacuum of the heap and so we can resolve any conflicts * just once when that arrives. After that we know that no conflicts * exist from individual hash index vacuum records on that index. */ if (InHotStandby) { TransactionId latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record); RelFileNode rnode; XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); } action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer); if (action == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 0, &len); page = (Page) BufferGetPage(buffer); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items. See comments * in _hash_vacuum_one_page() for details. */ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) { Page metapage; HashMetaPage metap; metapage = BufferGetPage(metabuf); metap = HashPageGetMeta(metapage); metap->hashm_ntuples -= xldata->ntuples; PageSetLSN(metapage, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ IndexBulkDeleteResult * hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation rel = info->index; double tuples_removed; double num_index_tuples; double orig_ntuples; Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; Buffer metabuf; HashMetaPage metap; HashMetaPageData local_metapage; tuples_removed = 0; num_index_tuples = 0; /* * Read the metapage to fetch original bucket and tuple counts. Also, we * keep a copy of the last-seen metapage so that we can use its * hashm_spares[] values to compute bucket page addresses. This is a bit * hokey but perfectly safe, since the interesting entries in the spares * array cannot change under us; and it beats rereading the metapage for * each bucket. */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); orig_maxbucket = metap->hashm_maxbucket; orig_ntuples = metap->hashm_ntuples; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); /* Scan the buckets that we know exist */ cur_bucket = 0; cur_maxbucket = orig_maxbucket; loop_top: while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; BlockNumber blkno; bool bucket_dirty = false; /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket); /* Exclusive-lock the bucket so we can shrink it */ _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Shouldn't have any active scans locally, either */ if (_hash_has_active_scan(rel, cur_bucket)) elog(ERROR, "hash index has active scan during VACUUM"); /* Scan each page in bucket */ blkno = bucket_blkno; while (BlockNumberIsValid(blkno)) { Buffer buf; Page page; HashPageOpaque opaque; OffsetNumber offno; OffsetNumber maxoffno; OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; vacuum_delay_point(); buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, info->strategy); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == cur_bucket); /* Scan each tuple in page */ maxoffno = PageGetMaxOffsetNumber(page); for (offno = FirstOffsetNumber; offno <= maxoffno; offno = OffsetNumberNext(offno)) { IndexTuple itup; ItemPointer htup; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); htup = &(itup->t_tid); if (callback(htup, callback_state)) { /* mark the item for deletion */ deletable[ndeletable++] = offno; tuples_removed += 1; } else num_index_tuples += 1; } /* * Apply deletions and write page if needed, advance to next page. 
*/ blkno = opaque->hasho_nextblkno; if (ndeletable > 0) { PageIndexMultiDelete(page, deletable, ndeletable); _hash_wrtbuf(rel, buf); bucket_dirty = true; } else _hash_relbuf(rel, buf); } /* If we deleted anything, try to compact free space */ if (bucket_dirty) _hash_squeezebucket(rel, cur_bucket, bucket_blkno, info->strategy); /* Release bucket lock */ _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE); /* Advance to next bucket */ cur_bucket++; } /* Write-lock metapage and check for split since we started */ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE); metap = HashPageGetMeta(BufferGetPage(metabuf)); if (cur_maxbucket != metap->hashm_maxbucket) { /* There's been a split, so process the additional bucket(s) */ cur_maxbucket = metap->hashm_maxbucket; memcpy(&local_metapage, metap, sizeof(local_metapage)); _hash_relbuf(rel, metabuf); goto loop_top; } /* Okay, we're really done. Update tuple count in metapage. */ if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) { /* * No one has split or inserted anything since start of scan, so * believe our count as gospel. */ metap->hashm_ntuples = num_index_tuples; } else { /* * Otherwise, our count is untrustworthy since we may have * double-scanned tuples in split buckets. Proceed by dead-reckoning. * (Note: we still return estimated_count = false, because using this * count is better than not updating reltuples at all.) */ if (metap->hashm_ntuples > tuples_removed) metap->hashm_ntuples -= tuples_removed; else metap->hashm_ntuples = 0; num_index_tuples = metap->hashm_ntuples; } _hash_wrtbuf(rel, metabuf); /* return statistics */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); stats->estimated_count = false; stats->num_index_tuples = num_index_tuples; stats->tuples_removed += tuples_removed; /* hashvacuumcleanup will fill in num_pages */ return stats; }
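/*
 * Illustrative sketch (standalone, simplified types): the collect-then-
 * delete pattern used in the page scan above -- offsets that the callback
 * condemns are gathered into deletable[] and removed with a single
 * PageIndexMultiDelete() call per page.  The helper and its callback
 * signature are made up for the example.
 */
static int
example_collect_deletable(int maxoffno,
						  int (*is_dead) (int offno, void *state),
						  void *state, int *deletable)
{
	int			ndeletable = 0;
	int			offno;

	for (offno = 1; offno <= maxoffno; offno++)
	{
		if (is_dead(offno, state))
			deletable[ndeletable++] = offno;
	}
	return ndeletable;
}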
/* * replay addition of overflow page for hash index */ static void hash_xlog_add_ovfl_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record); Buffer leftbuf; Buffer ovflbuf; Buffer metabuf; BlockNumber leftblk; BlockNumber rightblk; BlockNumber newmapblk = InvalidBlockNumber; Page ovflpage; HashPageOpaque ovflopaque; uint32 *num_bucket; char *data; Size datalen PG_USED_FOR_ASSERTS_ONLY; bool new_bmpage = false; XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); ovflbuf = XLogInitBufferForRedo(record, 0); Assert(BufferIsValid(ovflbuf)); data = XLogRecGetBlockData(record, 0, &datalen); num_bucket = (uint32 *) data; Assert(datalen == sizeof(uint32)); _hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE, true); /* update backlink */ ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = leftblk; PageSetLSN(ovflpage, lsn); MarkBufferDirty(ovflbuf); if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) { Page leftpage; HashPageOpaque leftopaque; leftpage = BufferGetPage(leftbuf); leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); leftopaque->hasho_nextblkno = rightblk; PageSetLSN(leftpage, lsn); MarkBufferDirty(leftbuf); } if (BufferIsValid(leftbuf)) UnlockReleaseBuffer(leftbuf); UnlockReleaseBuffer(ovflbuf); /* * Note: in normal operation, we'd update the bitmap and meta page while * still holding lock on the overflow pages. But during replay it's not * necessary to hold those locks, since no other index updates can be * happening concurrently. */ if (XLogRecHasBlockRef(record, 2)) { Buffer mapbuffer; if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) { Page mappage = (Page) BufferGetPage(mapbuffer); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; freep = HashPageGetBitmap(mappage); data = XLogRecGetBlockData(record, 2, &datalen); bitmap_page_bit = (uint32 *) data; SETBIT(freep, *bitmap_page_bit); PageSetLSN(mappage, lsn); MarkBufferDirty(mapbuffer); } if (BufferIsValid(mapbuffer)) UnlockReleaseBuffer(mapbuffer); } if (XLogRecHasBlockRef(record, 3)) { Buffer newmapbuf; newmapbuf = XLogInitBufferForRedo(record, 3); _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true); new_bmpage = true; newmapblk = BufferGetBlockNumber(newmapbuf); MarkBufferDirty(newmapbuf); PageSetLSN(BufferGetPage(newmapbuf), lsn); UnlockReleaseBuffer(newmapbuf); } if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) { HashMetaPage metap; Page page; uint32 *firstfree_ovflpage; data = XLogRecGetBlockData(record, 4, &datalen); firstfree_ovflpage = (uint32 *) data; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_firstfree = *firstfree_ovflpage; if (!xlrec->bmpage_found) { metap->hashm_spares[metap->hashm_ovflpoint]++; if (new_bmpage) { Assert(BlockNumberIsValid(newmapblk)); metap->hashm_mapp[metap->hashm_nmaps] = newmapblk; metap->hashm_nmaps++; metap->hashm_spares[metap->hashm_ovflpoint]++; } } PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }