/*
 *	_hash_addovflpage
 *
 *	Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *	On entry, the caller must hold a pin but no lock on 'buf'.	The pin is
 *	dropped before exiting (we assume the caller is not interested in 'buf'
 *	anymore).  The returned overflow page will be pinned and write-locked;
 *	it is guaranteed to be empty.
 *
 *	The caller must hold a pin, but no lock, on the metapage buffer.
 *	That buffer is returned in the same state.
 *
 *	The caller must hold at least share lock on the bucket, to ensure that
 *	no one else tries to compact the bucket meanwhile.	This guarantees that
 *	'buf' won't stop being part of the bucket while it's unlocked.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.	Additional overflow
 * pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
    Buffer		ovflbuf;
    Page		page;
    Page		ovflpage;
    HashPageOpaque pageopaque;
    HashPageOpaque ovflopaque;

    /* allocate and lock an empty overflow page */
    ovflbuf = _hash_getovflpage(rel, metabuf);

    /*
     * Write-lock the tail page.  It is okay to hold two buffer locks here
     * since there cannot be anyone else contending for access to ovflbuf.
     */
    _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE);

    /* probably redundant... */
    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

    /* loop to find current tail page, in case someone else inserted too */
    for (;;)
    {
        BlockNumber nextblkno;

        page = BufferGetPage(buf);
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
        nextblkno = pageopaque->hasho_nextblkno;

        if (!BlockNumberIsValid(nextblkno))
            break;

        /* we assume we do not need to write the unmodified page */
        _hash_relbuf(rel, buf);

        buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
    }

    /* now that we have correct backlink, initialize new overflow page */
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
    ovflopaque->hasho_page_id = HASHO_PAGE_ID;

    MarkBufferDirty(ovflbuf);

    /* logically chain overflow page to previous page */
    pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
    _hash_wrtbuf(rel, buf);

    return ovflbuf;
}
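/*
 * Illustrative sketch (not from the original source): how a caller such as
 * _hash_doinsert is expected to hand its bucket-chain page to
 * _hash_addovflpage.  Per the comments above, 'buf' must be pinned but
 * unlocked on entry, and the returned overflow page comes back pinned,
 * write-locked, and empty.  The function name is hypothetical; it assumes
 * the usual hash AM declarations from access/hash.h.
 */
static Buffer
example_extend_bucket_chain(Relation rel, Buffer metabuf, Buffer buf)
{
    Buffer      ovflbuf;

    /* give up our lock on 'buf' without writing it, but keep the pin */
    _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

    /* chain a fresh, empty overflow page onto the end of the bucket */
    ovflbuf = _hash_addovflpage(rel, metabuf, buf);

    /* 'buf' has been unpinned for us; ovflbuf is pinned and write-locked */
    return ovflbuf;
}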
/*
 *	_hash_freeovflpage() -
 *
 *	Remove this overflow page from its bucket's chain, and mark the page as
 *	free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 *
 *	Returns the block number of the page that followed the given page
 *	in the bucket, or InvalidBlockNumber if no following page.
 *
 *	NB: caller must not hold lock on metapage, nor on either page that's
 *	adjacent in the bucket chain.  The caller had better hold exclusive lock
 *	on the bucket, too.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
                   BufferAccessStrategy bstrategy)
{
    HashMetaPage metap;
    Buffer		metabuf;
    Buffer		mapbuf;
    BlockNumber ovflblkno;
    BlockNumber prevblkno;
    BlockNumber blkno;
    BlockNumber nextblkno;
    HashPageOpaque ovflopaque;
    Page		ovflpage;
    Page		mappage;
    uint32	   *freep;
    uint32		ovflbitno;
    int32		bitmappage,
                bitmapbit;
    Bucket bucket PG_USED_FOR_ASSERTS_ONLY;

    /* Get information from the doomed page */
    _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
    ovflblkno = BufferGetBlockNumber(ovflbuf);
    ovflpage = BufferGetPage(ovflbuf);
    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
    nextblkno = ovflopaque->hasho_nextblkno;
    prevblkno = ovflopaque->hasho_prevblkno;
    bucket = ovflopaque->hasho_bucket;

    /*
     * Zero the page for debugging's sake; then write and release it. (Note:
     * if we failed to zero the page here, we'd have problems with the Assert
     * in _hash_pageinit() when the page is reused.)
     */
    MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
    _hash_wrtbuf(rel, ovflbuf);

    /*
     * Fix up the bucket chain.  this is a doubly-linked list, so we must fix
     * up the bucket chain members behind and ahead of the overflow page being
     * deleted.  No concurrency issues since we hold exclusive lock on the
     * entire bucket.
     */
    if (BlockNumberIsValid(prevblkno))
    {
        Buffer		prevbuf = _hash_getbuf_with_strategy(rel,
                              prevblkno,
                              HASH_WRITE,
                              LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                              bstrategy);
        Page		prevpage = BufferGetPage(prevbuf);
        HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

        Assert(prevopaque->hasho_bucket == bucket);
        prevopaque->hasho_nextblkno = nextblkno;
        _hash_wrtbuf(rel, prevbuf);
    }
    if (BlockNumberIsValid(nextblkno))
    {
        Buffer		nextbuf = _hash_getbuf_with_strategy(rel,
                              nextblkno,
                              HASH_WRITE,
                              LH_OVERFLOW_PAGE,
                              bstrategy);
        Page		nextpage = BufferGetPage(nextbuf);
        HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

        Assert(nextopaque->hasho_bucket == bucket);
        nextopaque->hasho_prevblkno = prevblkno;
        _hash_wrtbuf(rel, nextbuf);
    }

    /* Note: bstrategy is intentionally not used for metapage and bitmap */

    /* Read the metapage so we can determine which bitmap page to use */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* Identify which bit to set */
    ovflbitno = blkno_to_bitno(metap, ovflblkno);

    bitmappage = ovflbitno >> BMPG_SHIFT(metap);
    bitmapbit = ovflbitno & BMPG_MASK(metap);
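    /*
     * For example, with the default 8K block size each bitmap page covers
     * 4096 * 8 = 32768 bits, so overflow bit number 40000 maps to bitmap
     * page 40000 >> 15 = 1, bit 40000 & 32767 = 7232 within that page.
     */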

    if (bitmappage >= metap->hashm_nmaps)
        elog(ERROR, "invalid overflow bit number %u", ovflbitno);
    blkno = metap->hashm_mapp[bitmappage];

    /* Release metapage lock while we access the bitmap page */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Clear the bitmap bit to indicate that this overflow page is free */
    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
    mappage = BufferGetPage(mapbuf);
    freep = HashPageGetBitmap(mappage);
    Assert(ISSET(freep, bitmapbit));
    CLRBIT(freep, bitmapbit);
    _hash_wrtbuf(rel, mapbuf);

    /* Get write-lock on metapage to update firstfree */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    /* if this is now the first free page, update hashm_firstfree */
    if (ovflbitno < metap->hashm_firstfree)
    {
        metap->hashm_firstfree = ovflbitno;
        _hash_wrtbuf(rel, metabuf);
    }
    else
    {
        /* no need to change metapage */
        _hash_relbuf(rel, metabuf);
    }

    return nextblkno;
}
/*
 *	_hash_getovflpage()
 *
 *	Find an available overflow page and return it.	The returned buffer
 *	is pinned and write-locked, and has had _hash_pageinit() applied,
 *	but it is caller's responsibility to fill the special space.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * That buffer is left in the same state at exit.
 */
static Buffer
_hash_getovflpage(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    Buffer		mapbuf = 0;
    Buffer		newbuf;
    BlockNumber blkno;
    uint32		orig_firstfree;
    uint32		splitnum;
    uint32	   *freep = NULL;
    uint32		max_ovflpg;
    uint32		bit;
    uint32		first_page;
    uint32		last_bit;
    uint32		last_page;
    uint32		i,
                j;

    /* Get exclusive lock on the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* start search at hashm_firstfree */
    orig_firstfree = metap->hashm_firstfree;
    first_page = orig_firstfree >> BMPG_SHIFT(metap);
    bit = orig_firstfree & BMPG_MASK(metap);
    i = first_page;
    j = bit / BITS_PER_MAP;
    bit &= ~(BITS_PER_MAP - 1);
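    /*
     * For example, if hashm_firstfree is 100 and BITS_PER_MAP is 32 (one
     * uint32 bitmap word), then first_page = 0, j = 3 (the word holding
     * bit 100), and bit is rounded down to 96 so the scan below can advance
     * a whole word at a time.
     */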

    /* outer loop iterates once per bitmap page */
    for (;;)
    {
        BlockNumber mapblkno;
        Page		mappage;
        uint32		last_inpage;

        /* want to end search with the last existing overflow page */
        splitnum = metap->hashm_ovflpoint;
        max_ovflpg = metap->hashm_spares[splitnum] - 1;
        last_page = max_ovflpg >> BMPG_SHIFT(metap);
        last_bit = max_ovflpg & BMPG_MASK(metap);

        if (i > last_page)
            break;

        Assert(i < metap->hashm_nmaps);
        mapblkno = metap->hashm_mapp[i];

        if (i == last_page)
            last_inpage = last_bit;
        else
            last_inpage = BMPGSZ_BIT(metap) - 1;

        /* Release exclusive lock on metapage while reading bitmap page */
        _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

        mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
        mappage = BufferGetPage(mapbuf);
        freep = HashPageGetBitmap(mappage);

        for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
        {
            if (freep[j] != ALL_SET)
                goto found;
        }

        /* No free space here, try to advance to next map page */
        _hash_relbuf(rel, mapbuf);
        i++;
        j = 0;					/* scan from start of next map page */
        bit = 0;

        /* Reacquire exclusive lock on the meta page */
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
    }

    /*
     * No free pages --- have to extend the relation to add an overflow page.
     * First, check to see if we have to add a new bitmap page too.
     */
    if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
    {
        /*
         * We create the new bitmap page with all pages marked "in use".
         * Actually two pages in the new bitmap's range will exist
         * immediately: the bitmap page itself, and the following page which
         * is the one we return to the caller.	Both of these are correctly
         * marked "in use".  Subsequent pages do not exist yet, but it is
         * convenient to pre-mark them as "in use" too.
         */
        bit = metap->hashm_spares[splitnum];
        _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
        metap->hashm_spares[splitnum]++;
    }
    else
    {
        /*
         * Nothing to do here; since the page will be past the last used page,
         * we know its bitmap bit was preinitialized to "in use".
         */
    }

    /* Calculate address of the new overflow page */
    bit = metap->hashm_spares[splitnum];
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
     * relation length stays in sync with ours.  XXX It's annoying to do this
     * with metapage write lock held; would be better to use a lock that
     * doesn't block incoming searches.
     */
    newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

    metap->hashm_spares[splitnum]++;

    /*
     * Adjust hashm_firstfree to avoid redundant searches.	But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
        metap->hashm_firstfree = bit + 1;

    /* Write updated metapage and release lock, but not pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    return newbuf;

found:
    /* convert bit to bit number within page */
    bit += _hash_firstfreebit(freep[j]);

    /* mark page "in use" in the bitmap */
    SETBIT(freep, bit);
    _hash_wrtbuf(rel, mapbuf);

    /* Reacquire exclusive lock on the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    /* convert bit to absolute bit number */
    bit += (i << BMPG_SHIFT(metap));

    /* Calculate address of the recycled overflow page */
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Adjust hashm_firstfree to avoid redundant searches.	But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
    {
        metap->hashm_firstfree = bit + 1;

        /* Write updated metapage and release lock, but not pin */
        _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
    }
    else
    {
        /* We didn't change the metapage, so no need to write */
        _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
    }

    /* Fetch, init, and return the recycled page */
    return _hash_getinitbuf(rel, blkno);
}
Example #4
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
				  Buffer metabuf,
				  Bucket obucket,
				  Bucket nbucket,
				  BlockNumber start_oblkno,
				  BlockNumber start_nblkno,
				  uint32 maxbucket,
				  uint32 highmask,
				  uint32 lowmask)
{
	Bucket		bucket;
	Buffer		obuf;
	Buffer		nbuf;
	BlockNumber oblkno;
	BlockNumber nblkno;
	bool		null;
	Datum		datum;
	HashItem	hitem;
	HashPageOpaque oopaque;
	HashPageOpaque nopaque;
	IndexTuple	itup;
	Size		itemsz;
	OffsetNumber ooffnum;
	OffsetNumber noffnum;
	OffsetNumber omaxoffnum;
	Page		opage;
	Page		npage;
	TupleDesc	itupdesc = RelationGetDescr(rel);

	/*
	 * It should be okay to simultaneously write-lock pages from each
	 * bucket, since no one else can be trying to acquire buffer lock
	 * on pages of either bucket.
	 */
	oblkno = start_oblkno;
	nblkno = start_nblkno;
	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
	nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
	opage = BufferGetPage(obuf);
	npage = BufferGetPage(nbuf);

	_hash_checkpage(rel, opage, LH_BUCKET_PAGE);
	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

	/* initialize the new bucket's primary page */
	_hash_pageinit(npage, BufferGetPageSize(nbuf));
	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
	nopaque->hasho_prevblkno = InvalidBlockNumber;
	nopaque->hasho_nextblkno = InvalidBlockNumber;
	nopaque->hasho_bucket = nbucket;
	nopaque->hasho_flag = LH_BUCKET_PAGE;
	nopaque->hasho_filler = HASHO_FILL;

	/*
	 * Partition the tuples in the old bucket between the old bucket and the
	 * new bucket, advancing along the old bucket's overflow bucket chain
	 * and adding overflow pages to the new bucket as needed.
	 */
	ooffnum = FirstOffsetNumber;
	omaxoffnum = PageGetMaxOffsetNumber(opage);
	for (;;)
	{
		/*
		 * at each iteration through this loop, each of these variables
		 * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
		 */

		/* check if we're at the end of the page */
		if (ooffnum > omaxoffnum)
		{
			/* at end of page, but check for an(other) overflow page */
			oblkno = oopaque->hasho_nextblkno;
			if (!BlockNumberIsValid(oblkno))
				break;
			/*
			 * we ran out of tuples on this particular page, but we
			 * have more overflow pages; advance to next page.
			 */
			_hash_wrtbuf(rel, obuf);

			obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
			opage = BufferGetPage(obuf);
			_hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
			oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
			ooffnum = FirstOffsetNumber;
			omaxoffnum = PageGetMaxOffsetNumber(opage);
			continue;
		}

		/*
		 * Re-hash the tuple to determine which bucket it now belongs in.
		 *
		 * It is annoying to call the hash function while holding locks,
		 * but releasing and relocking the page for each tuple is unappealing
		 * too.
		 */
		hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
		itup = &(hitem->hash_itup);
		datum = index_getattr(itup, 1, itupdesc, &null);
		Assert(!null);

		bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
									  maxbucket, highmask, lowmask);

		if (bucket == nbucket)
		{
			/*
			 * insert the tuple into the new bucket.  if it doesn't fit on
			 * the current page in the new bucket, we must allocate a new
			 * overflow page and place the tuple on that page instead.
			 */
			itemsz = IndexTupleDSize(hitem->hash_itup)
				+ (sizeof(HashItemData) - sizeof(IndexTupleData));

			itemsz = MAXALIGN(itemsz);

			if (PageGetFreeSpace(npage) < itemsz)
			{
				/* write out nbuf and drop lock, but keep pin */
				_hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
				/* chain to a new overflow page */
				nbuf = _hash_addovflpage(rel, metabuf, nbuf);
				npage = BufferGetPage(nbuf);
				_hash_checkpage(rel, npage, LH_OVERFLOW_PAGE);
				/* we don't need nopaque within the loop */
			}

			noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
			if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));

			/*
			 * now delete the tuple from the old bucket.  after this
			 * section of code, 'ooffnum' will actually point to the
			 * ItemId to which we would point if we had advanced it before
			 * the deletion (PageIndexTupleDelete repacks the ItemId
			 * array).	this also means that 'omaxoffnum' is exactly one
			 * less than it used to be, so we really can just decrement it
			 * instead of calling PageGetMaxOffsetNumber.
			 */
			PageIndexTupleDelete(opage, ooffnum);
			omaxoffnum = OffsetNumberPrev(omaxoffnum);
		}
		else
		{
			/*
			 * the tuple stays on this page.  we didn't move anything, so
			 * we didn't delete anything and therefore we don't have to
			 * change 'omaxoffnum'.
			 */
			Assert(bucket == obucket);
			ooffnum = OffsetNumberNext(ooffnum);
		}
	}

	/*
	 * We're at the end of the old bucket chain, so we're done partitioning
	 * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
	 * tuples remaining in the old bucket (including the overflow pages) are
	 * packed as tightly as possible.  The new bucket is already tight.
	 */
	_hash_wrtbuf(rel, obuf);
	_hash_wrtbuf(rel, nbuf);

	_hash_squeezebucket(rel, obucket, start_oblkno);
}
Example #5
/*
 * Attempt to expand the hash table by creating one new bucket.
 *
 * This will silently do nothing if it cannot get the needed locks.
 *
 * The caller should hold no locks on the hash index.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
{
	HashMetaPage metap;
	Bucket		old_bucket;
	Bucket		new_bucket;
	uint32		spare_ndx;
	BlockNumber start_oblkno;
	BlockNumber start_nblkno;
	uint32		maxbucket;
	uint32		highmask;
	uint32		lowmask;

	/*
	 * Obtain the page-zero lock to assert the right to begin a split
	 * (see README).
	 *
	 * Note: deadlock should be impossible here. Our own backend could only
	 * be holding bucket sharelocks due to stopped indexscans; those will not
	 * block other holders of the page-zero lock, who are only interested in
	 * acquiring bucket sharelocks themselves.  Exclusive bucket locks are
	 * only taken here and in hashbulkdelete, and neither of these operations
	 * needs any additional locks to complete.  (If, due to some flaw in this
	 * reasoning, we manage to deadlock anyway, it's okay to error out; the
	 * index will be left in a consistent state.)
	 */
	_hash_getlock(rel, 0, HASH_EXCLUSIVE);

	/* Write-lock the meta page */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);

	/*
	 * Check to see if split is still needed; someone else might have already
	 * done one while we waited for the lock.
	 *
	 * Make sure this stays in sync with _hash_doinsert()
	 */
	if (metap->hashm_ntuples <=
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
		goto fail;

	/*
	 * Determine which bucket is to be split, and attempt to lock the old
	 * bucket.  If we can't get the lock, give up.
	 *
	 * The lock protects us against other backends, but not against our own
	 * backend.  Must check for active scans separately.
	 *
	 * Ideally we would lock the new bucket too before proceeding, but if
	 * we are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping
	 * isn't correct yet.  For simplicity we update the metapage first and
	 * then lock.  This should be okay because no one else should be trying
	 * to lock the new bucket yet...
	 */
	new_bucket = metap->hashm_maxbucket + 1;
	old_bucket = (new_bucket & metap->hashm_lowmask);
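	/*
	 * For example, with lowmask = 3 and new_bucket = 5, old_bucket is
	 * 5 & 3 = 1: bucket 5 takes over the tuples of bucket 1 whose hash
	 * values map to 5 under the larger modulus.
	 */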

	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);

	if (_hash_has_active_scan(rel, old_bucket))
		goto fail;

	if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
		goto fail;

	/*
	 * Okay to proceed with split.  Update the metapage bucket mapping info.
	 */
	metap->hashm_maxbucket = new_bucket;

	if (new_bucket > metap->hashm_highmask)
	{
		/* Starting a new doubling */
		metap->hashm_lowmask = metap->hashm_highmask;
		metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
	}
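	/*
	 * For example, when new_bucket reaches 8 while highmask is 7, a new
	 * doubling starts: lowmask becomes 7 and highmask becomes 8 | 7 = 15.
	 */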

	/*
	 * If the split point is increasing (hashm_maxbucket's log base 2
	 * increases), we need to adjust the hashm_spares[] array and
	 * hashm_ovflpoint so that future overflow pages will be created beyond
	 * this new batch of bucket pages.
	 *
	 * XXX should initialize new bucket pages to prevent out-of-order
	 * page creation?  Don't wanna do it right here though.
	 */
	spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
	if (spare_ndx > metap->hashm_ovflpoint)
	{
		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
		metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
		metap->hashm_ovflpoint = spare_ndx;
	}

	/* now we can compute the new bucket's primary block number */
	start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

	Assert(!_hash_has_active_scan(rel, new_bucket));

	if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
		elog(PANIC, "could not get lock on supposedly new bucket");

	/*
	 * Copy bucket mapping info now; this saves re-accessing the meta page
	 * inside _hash_splitbucket's inner loop.  Note that once we drop the
	 * split lock, other splits could begin, so these values might be out of
	 * date before _hash_splitbucket finishes.  That's okay, since all it
	 * needs is to tell which of these two buckets to map hashkeys into.
	 */
	maxbucket = metap->hashm_maxbucket;
	highmask = metap->hashm_highmask;
	lowmask = metap->hashm_lowmask;

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Release split lock; okay for other splits to occur now */
	_hash_droplock(rel, 0, HASH_EXCLUSIVE);

	/* Relocate records to the new bucket */
	_hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
					  start_oblkno, start_nblkno,
					  maxbucket, highmask, lowmask);

	/* Release bucket locks, allowing others to access them */
	_hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
	_hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);

	return;

	/* Here if decide not to split or fail to acquire old bucket lock */
fail:

	/* We didn't write the metapage, so just drop lock */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/* Release split lock */
	_hash_droplock(rel, 0, HASH_EXCLUSIVE);
}
Example #6
/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * It would be very bad if this function cleaned a page while some other
 * backend was in the midst of scanning it, because hashgettuple assumes
 * that the next valid TID will be greater than or equal to the current
 * valid TID.  There can't be any concurrent scans in progress when we first
 * enter this function because of the cleanup lock we hold on the primary
 * bucket page, but as soon as we release that lock, there might be.  We
 * handle that by conspiring to prevent those scans from passing our cleanup
 * scan.  To do that, we lock the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
{
    BlockNumber blkno;
    Buffer		buf;
    Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool		bucket_dirty = false;

    blkno = bucket_blkno;
    buf = bucket_buf;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                     lowmask, maxbucket);

    /* Scan each page in bucket */
    for (;;)
    {
        HashPageOpaque opaque;
        OffsetNumber offno;
        OffsetNumber maxoffno;
        Buffer		next_buf;
        Page		page;
        OffsetNumber deletable[MaxOffsetNumber];
        int			ndeletable = 0;
        bool		retain_pin = false;
        bool		curr_page_dirty = false;

        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
                offno <= maxoffno;
                offno = OffsetNumberNext(offno))
        {
            ItemPointer htup;
            IndexTuple	itup;
            Bucket		bucket;
            bool		kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To remove dead tuples, we rely strictly on the results of the
             * callback function; refer to btvacuumpage for the detailed
             * reason.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that were moved by the split */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask,
                                              lowmask);
                /* mark the item for deletion */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to belong either to the current bucket
                     * or to new_bucket.  This is ensured because we don't
                     * allow further splits from a bucket that contains
                     * garbage.  See comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;
        else
            retain_pin = false;

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;
            curr_page_dirty = true;
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);

        /*
         * release the lock on previous page after acquiring the lock on next
         * page
         */
        if (curr_page_dirty)
        {
            if (retain_pin)
                _hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK);
            else
                _hash_wrtbuf(rel, buf);
            curr_page_dirty = false;
        }
        else if (retain_pin)
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;
    }

    /*
     * Lock the bucket page so we can clear the garbage flag and squeeze the
     * bucket.  If the current buffer is the same as the bucket buffer, we
     * already hold a lock on the bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        _hash_chgbufaccess(rel, bucket_buf, HASH_NOLOCK, HASH_WRITE);
    }

    /*
     * Clear the garbage flag from the bucket after deleting the tuples that
     * were moved by the split.  We deliberately clear the flag before
     * squeezing the bucket, so that after a restart, vacuum won't again try
     * to delete the moved-by-split tuples.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;
        Page		page;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    }

    /*
     * If we have deleted anything, try to compact free space.  To squeeze
     * the bucket we must hold a cleanup lock; otherwise the squeeze could
     * disturb the tuple ordering seen by a scan that started before it.
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        _hash_chgbufaccess(rel, bucket_buf, HASH_WRITE, HASH_NOLOCK);
}
Example #7
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    BlockNumber oblkno;
    BlockNumber nblkno;
    Buffer		obuf;
    Buffer		nbuf;
    Page		opage;
    Page		npage;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;

    /*
     * It should be okay to simultaneously write-lock pages from each bucket,
     * since no one else can be trying to acquire buffer lock on pages of
     * either bucket.
     */
    oblkno = start_oblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE);
    opage = BufferGetPage(obuf);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    nblkno = start_nblkno;
    nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM);
    npage = BufferGetPage(nbuf);

    /* initialize the new bucket's primary page */
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_page_id = HASHO_PAGE_ID;

    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain and
     * adding overflow pages to the new bucket as needed.  Outer loop iterates
     * once per page in old bucket.
     */
    for (;;)
    {
        OffsetNumber ooffnum;
        OffsetNumber omaxoffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        int			ndeletable = 0;

        /* Scan each tuple in old page */
        omaxoffnum = PageGetMaxOffsetNumber(opage);
        for (ooffnum = FirstOffsetNumber;
                ooffnum <= omaxoffnum;
                ooffnum = OffsetNumberNext(ooffnum))
        {
            IndexTuple	itup;
            Size		itemsz;
            Bucket		bucket;

            /*
             * Fetch the item's hash key (conveniently stored in the item) and
             * determine which bucket it now belongs in.
             */
            itup = (IndexTuple) PageGetItem(opage,
                                            PageGetItemId(opage, ooffnum));
            bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                          maxbucket, highmask, lowmask);
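            /*
             * (_hash_hashkey2bucket masks the hash code with highmask and,
             * if the result exceeds maxbucket, falls back to lowmask.  For
             * example, with maxbucket = 5, highmask = 7, lowmask = 3, a
             * hash code whose low bits are 110 gives 6 & 7 = 6 > 5, so it
             * is reduced to 6 & 3 = 2.)
             */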

            if (bucket == nbucket)
            {
                /*
                 * insert the tuple into the new bucket.  if it doesn't fit on
                 * the current page in the new bucket, we must allocate a new
                 * overflow page and place the tuple on that page instead.
                 */
                itemsz = IndexTupleDSize(*itup);
                itemsz = MAXALIGN(itemsz);

                if (PageGetFreeSpace(npage) < itemsz)
                {
                    /* write out nbuf and drop lock, but keep pin */
                    _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                    /* chain to a new overflow page */
                    nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                    npage = BufferGetPage(nbuf);
                    /* we don't need nblkno or nopaque within the loop */
                }

                /*
                 * Insert tuple on new page, using _hash_pgaddtup to ensure
                 * correct ordering by hashkey.  This is a tad inefficient
                 * since we may have to shuffle itempointers repeatedly.
                 * Possible future improvement: accumulate all the items for
                 * the new page and qsort them before insertion.
                 */
                (void) _hash_pgaddtup(rel, nbuf, itemsz, itup);

                /*
                 * Mark tuple for deletion from old page.
                 */
                deletable[ndeletable++] = ooffnum;
            }
            else
            {
                /*
                 * the tuple stays on this page, so nothing to do.
                 */
                Assert(bucket == obucket);
            }
        }

        oblkno = oopaque->hasho_nextblkno;

        /*
         * Done scanning this old page.  If we moved any tuples, delete them
         * from the old page.
         */
        if (ndeletable > 0)
        {
            PageIndexMultiDelete(opage, deletable, ndeletable);
            _hash_wrtbuf(rel, obuf);
        }
        else
            _hash_relbuf(rel, obuf);

        /* Exit loop if no more overflow pages in old bucket */
        if (!BlockNumberIsValid(oblkno))
            break;

        /* Else, advance to next old page */
        obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
        opage = BufferGetPage(obuf);
        oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.	Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}
Example #8
/*
 *	_hash_doinsert() -- Handle insertion of a single HashItem in the table.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, hashitem is completely filled in.
 *		The datum to be used as a "key" is in the hashitem.
 */
InsertIndexResult
_hash_doinsert(Relation rel, HashItem hitem)
{
	Buffer		buf;
	Buffer		metabuf;
	HashMetaPage metap;
	IndexTuple	itup;
	BlockNumber itup_blkno;
	OffsetNumber itup_off;
	InsertIndexResult res;
	BlockNumber blkno;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;
	Datum		datum;
	bool		isnull;

	/*
	 * Compute the hash key for the item.  We do this first so as not to
	 * need to hold any locks while running the hash function.
	 */
	itup = &(hitem->hash_itup);
	if (rel->rd_rel->relnatts != 1)
		elog(ERROR, "hash indexes support only one index key");
	datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
	Assert(!isnull);
	hashkey = _hash_datum2hashkey(rel, datum);

	/* compute item size too */
	itemsz = IndexTupleDSize(hitem->hash_itup)
		+ (sizeof(HashItemData) - sizeof(IndexTupleData));

	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but
								 * we need to be consistent */

	/*
	 * Acquire shared split lock so we can compute the target bucket
	 * safely (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually,
	 * we ought to try to apply TOAST methods if not.)  Note that at this
	 * point, itemsz doesn't include the ItemId.
	 */
	if (itemsz > HashMaxItemSize((Page) metap))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %lu exceeds hash maximum %lu",
						(unsigned long) itemsz,
						(unsigned long) HashMaxItemSize((Page) metap))));

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);
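	/*
	 * (BUCKET_TO_BLKNO adds one for the metapage plus, via hashm_spares,
	 * the bitmap and overflow pages allocated ahead of this bucket's
	 * splitpoint; for the initially created buckets this reduces to
	 * block = bucket + 1.)
	 */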

	/* release lock on metapage, but keep pin since we'll need it again */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);

	_hash_droplock(rel, 0, HASH_SHARE);

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_WRITE);
	page = BufferGetPage(buf);
	_hash_checkpage(rel, page, LH_BUCKET_PAGE);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(pageopaque->hasho_bucket == bucket);

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		/*
		 * no space on this page; check for an overflow page
		 */
		BlockNumber	nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room,
			 * we'll find out next pass through the loop test above.
			 */
			_hash_relbuf(rel, buf);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		_hash_checkpage(rel, page, LH_OVERFLOW_PAGE);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/* found page with enough space, so add the item here */
	itup_off = _hash_pgaddtup(rel, buf, itemsz, hitem);
	itup_blkno = BufferGetBlockNumber(buf);

	/* write and release the modified page */
	_hash_wrtbuf(rel, buf);

	/* We can drop the bucket lock now */
	_hash_droplock(rel, blkno, HASH_SHARE);

	/*
	 * Write-lock the metapage so we can increment the tuple count.
	 * After incrementing it, check to see if it's time for a split.
	 */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);

	/* Create the return data structure */
	res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));

	ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);

	return res;
}
Example #9
/*
 *	_hash_metapinit() -- Initialize the metadata page of a hash index,
 *				the initial buckets, and the initial bitmap page.
 *
 * The initial number of buckets is dependent on num_tuples, an estimate
 * of the number of tuples to be loaded into the index initially.  The
 * chosen number of buckets is returned.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
uint32
_hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
{
    HashMetaPage metap;
    HashPageOpaque pageopaque;
    Buffer		metabuf;
    Buffer		buf;
    Page		pg;
    int32		data_width;
    int32		item_width;
    int32		ffactor;
    double		dnumbuckets;
    uint32		num_buckets;
    uint32		log2_num_buckets;
    uint32		i;

    /* safety check */
    if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * Determine the target fill factor (in tuples per bucket) for this index.
     * The idea is to make the fill factor correspond to pages about as full
     * as the user-settable fillfactor parameter says.	We can compute it
     * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
     */
    data_width = sizeof(uint32);
    item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
                 sizeof(ItemIdData);		/* include the line pointer */
    ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
    /* keep to a sane range */
    if (ffactor < 10)
        ffactor = 10;
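    /*
     * For example, on a typical 64-bit build with 8K blocks and the default
     * fillfactor of 75, item_width is 8 + 8 + 4 = 20 bytes and ffactor
     * works out to roughly 6144 / 20 = 307 tuples per bucket.
     */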

    /*
     * Choose the number of initial bucket pages to match the fill factor
     * given the estimated number of tuples.  We round up the result to the
     * next power of 2, however, and always force at least 2 bucket pages. The
     * upper limit is determined by considerations explained in
     * _hash_expandtable().
     */
    dnumbuckets = num_tuples / ffactor;
    if (dnumbuckets <= 2.0)
        num_buckets = 2;
    else if (dnumbuckets >= (double) 0x40000000)
        num_buckets = 0x40000000;
    else
        num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);

    log2_num_buckets = _hash_log2(num_buckets);
    Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
    Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);

    /*
     * We initialize the metapage, the first N bucket pages, and the first
     * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
     * calls to occur.	This ensures that the smgr level has the right idea of
     * the physical index length.
     */
    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
    pg = BufferGetPage(metabuf);

    pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
    pageopaque->hasho_prevblkno = InvalidBlockNumber;
    pageopaque->hasho_nextblkno = InvalidBlockNumber;
    pageopaque->hasho_bucket = -1;
    pageopaque->hasho_flag = LH_META_PAGE;
    pageopaque->hasho_page_id = HASHO_PAGE_ID;

    metap = HashPageGetMeta(pg);

    metap->hashm_magic = HASH_MAGIC;
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = HashGetMaxBitmapSize(pg);
    /* find largest bitmap array size that will fit in page size */
    for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
    {
        if ((1 << i) <= metap->hashm_bsize)
            break;
    }
    Assert(i > 0);
    metap->hashm_bmsize = 1 << i;
    metap->hashm_bmshift = i + BYTE_TO_BIT;
    Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
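    /*
     * With the default 8K block size this yields hashm_bmsize = 4096 bytes,
     * hence hashm_bmshift = 12 + 3 = 15: each bitmap page tracks 2^15 =
     * 32768 overflow pages.
     */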

    /*
     * Label the index with its primary hash support function's OID.  This is
     * pretty useless for normal operation (in fact, hashm_procid is not used
     * anywhere), but it might be handy for forensic purposes so we keep it.
     */
    metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

    /*
     * We initialize the index with N buckets, 0 .. N-1, occupying physical
     * blocks 1 to N.  The first freespace bitmap page is in block N+1. Since
     * N is a power of 2, we can set the masks this way:
     */
    metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
    metap->hashm_highmask = (num_buckets << 1) - 1;
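    /*
     * For example, with num_buckets = 4 the bucket pages occupy blocks 1..4,
     * the first bitmap page goes in block 5, and we set maxbucket = lowmask
     * = 3 and highmask = 7.
     */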

    MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

    /* Set up mapping for one spare page after the initial splitpoints */
    metap->hashm_spares[log2_num_buckets] = 1;
    metap->hashm_ovflpoint = log2_num_buckets;
    metap->hashm_firstfree = 0;

    /*
     * Release buffer lock on the metapage while we initialize buckets.
     * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
     * won't accomplish anything.  It's a bad idea to hold buffer locks for
     * long intervals in any case, since that can block the bgwriter.
     */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /*
     * Initialize the first N buckets
     */
    for (i = 0; i < num_buckets; i++)
    {
        /* Allow interrupts, in case N is huge */
        CHECK_FOR_INTERRUPTS();

        buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
        pg = BufferGetPage(buf);
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
        pageopaque->hasho_prevblkno = InvalidBlockNumber;
        pageopaque->hasho_nextblkno = InvalidBlockNumber;
        pageopaque->hasho_bucket = i;
        pageopaque->hasho_flag = LH_BUCKET_PAGE;
        pageopaque->hasho_page_id = HASHO_PAGE_ID;
        _hash_wrtbuf(rel, buf);
    }

    /* Now reacquire buffer lock on metapage */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    /*
     * Initialize first bitmap page
     */
    _hash_initbitmap(rel, metap, num_buckets + 1, forkNum);

    /* all done */
    _hash_wrtbuf(rel, metabuf);

    return num_buckets;
}
Example #10
/*
 *	hashgetmulti() -- get multiple tuples at once
 *
 * This is a somewhat generic implementation: it avoids lock reacquisition
 * overhead, but there's no smarts about picking especially good stopping
 * points such as index page boundaries.
 */
Datum
hashgetmulti(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
	int32		max_tids = PG_GETARG_INT32(2);
	int32	   *returned_tids = (int32 *) PG_GETARG_POINTER(3);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	bool		res = true;
	int32		ntids = 0;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

	while (ntids < max_tids)
	{
		/*
		 * Start scan, or advance to next tuple.
		 */
		if (ItemPointerIsValid(&(scan->currentItemData)))
			res = _hash_next(scan, ForwardScanDirection);
		else
			res = _hash_first(scan, ForwardScanDirection);

		/*
		 * Skip killed tuples if asked to.
		 */
		if (scan->ignore_killed_tuples)
		{
			while (res)
			{
				Page		page;
				OffsetNumber offnum;

				offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
				page = BufferGetPage(so->hashso_curbuf);
				if (!ItemIdDeleted(PageGetItemId(page, offnum)))
					break;
				res = _hash_next(scan, ForwardScanDirection);
			}
		}

		if (!res)
			break;
		/* Save tuple ID, and continue scanning */
		tids[ntids] = scan->xs_ctup.t_self;
		ntids++;
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

	*returned_tids = ntids;
	PG_RETURN_BOOL(res);
}
Example #11
/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */
Datum
hashgettuple(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	Page		page;
	OffsetNumber offnum;
	bool		res;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine to
	 * get the first item in the scan.
	 */
	if (ItemPointerIsValid(&(scan->currentItemData)))
	{
		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so mark it by setting the LP_DELETE bit in the item flags.
			 */
			offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
			page = BufferGetPage(so->hashso_curbuf);
			PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;

			/*
			 * Since this can be redone later if needed, it's treated the same
			 * as a commit-hint-bit status update for heap tuples: we mark the
			 * buffer dirty but don't make a WAL log entry.
			 */
			SetBufferCommitInfoNeedsSave(so->hashso_curbuf);
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}
	else
		res = _hash_first(scan, dir);

	/*
	 * Skip killed tuples if asked to.
	 */
	if (scan->ignore_killed_tuples)
	{
		while (res)
		{
			offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
			page = BufferGetPage(so->hashso_curbuf);
			if (!ItemIdDeleted(PageGetItemId(page, offnum)))
				break;
			res = _hash_next(scan, dir);
		}
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

	PG_RETURN_BOOL(res);
}
Example #12
/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		Find the first item in the index that
 *		satisfies the qualification associated with the scan descriptor. On
 *		success, the page containing the current index tuple is read locked
 *		and pinned, and the scan's opaque data entry is updated to
 *		include the buffer.
 */
bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ScanKey		cur;
	uint32		hashkey;
	Bucket		bucket;
	BlockNumber blkno;
	BlockNumber oldblkno = InvalidBlockNumber;
	bool		retry = false;
	Buffer		buf;
	Buffer		metabuf;
	Page		page;
	HashPageOpaque opaque;
	HashMetaPage metap;
	IndexTuple	itup;
	ItemPointer current;
	OffsetNumber offnum;

	pgstat_count_index_scan(rel);

	current = &(so->hashso_curpos);
	ItemPointerSetInvalid(current);

	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket. That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
	if (scan->numberOfKeys < 1)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("hash indexes do not support whole-index scans")));

	/* There may be more than one index qual, but we hash only the first */
	cur = &scan->keyData[0];

	/* We support only single-column hash indexes */
	Assert(cur->sk_attno == 1);
	/* And there's only one operator strategy, too */
	Assert(cur->sk_strategy == HTEqualStrategyNumber);

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
	if (cur->sk_flags & SK_ISNULL)
		return false;

	/*
	 * Okay to compute the hash key.  We want to do this before acquiring any
	 * locks, in case a user-defined hash function happens to be slow.
	 *
	 * If scankey operator is not a cross-type comparison, we can use the
	 * cached hash function; otherwise gotta look it up in the catalogs.
	 *
	 * We support the convention that sk_subtype == InvalidOid means the
	 * opclass input type; this is a hack to simplify life for ScanKeyInit().
	 */
	if (cur->sk_subtype == rel->rd_opcintype[0] ||
		cur->sk_subtype == InvalidOid)
		hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
	else
		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
										   cur->sk_subtype);

	so->hashso_sk_hash = hashkey;

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/*
	 * Loop until we get a lock on the correct target bucket.
	 */
	for (;;)
	{
		/*
		 * Compute the target bucket number, and convert to block number.
		 */
		bucket = _hash_hashkey2bucket(hashkey,
									  metap->hashm_maxbucket,
									  metap->hashm_highmask,
									  metap->hashm_lowmask);

		blkno = BUCKET_TO_BLKNO(metap, bucket);

		/* Release metapage lock, but keep pin. */
		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

		/*
		 * If the previous iteration of this loop locked what is still the
		 * correct target bucket, we are done.  Otherwise, drop any old lock
		 * and lock what now appears to be the correct bucket.
		 */
		if (retry)
		{
			if (oldblkno == blkno)
				break;
			_hash_droplock(rel, oldblkno, HASH_SHARE);
		}
		_hash_getlock(rel, blkno, HASH_SHARE);

		/*
		 * Reacquire metapage lock and check that no bucket split has taken
		 * place while we were awaiting the bucket lock.
		 */
		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
		oldblkno = blkno;
		retry = true;
	}

	/* done with the metapage */
	_hash_dropbuf(rel, metabuf);

	/* Update scan opaque state to show we have lock on the bucket */
	so->hashso_bucket = bucket;
	so->hashso_bucket_valid = true;
	so->hashso_bucket_blkno = blkno;

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->hasho_bucket == bucket);

	/* If a backwards scan is requested, move to the end of the chain */
	if (ScanDirectionIsBackward(dir))
	{
		while (BlockNumberIsValid(opaque->hasho_nextblkno))
			_hash_readnext(rel, &buf, &page, &opaque);
	}

	/* Now find the first tuple satisfying the qualification */
	if (!_hash_step(scan, &buf, dir))
		return false;

	/* if we're here, _hash_step found a valid tuple */
	offnum = ItemPointerGetOffsetNumber(current);
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	so->hashso_heappos = itup->t_tid;

	return true;
}
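
/*
 * Note on the bucket computation used in _hash_first() above: the mapping
 * from hash key to bucket is done by _hash_hashkey2bucket() using the two
 * masks kept in the metapage.  The following is only a standalone sketch of
 * that mask-based linear-hashing scheme (plain C99 types rather than the
 * hash AM's typedefs), included here for illustration:
 */
#include <stdint.h>

static uint32_t
sketch_hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
					  uint32_t highmask, uint32_t lowmask)
{
	uint32_t	bucket;

	/* tentatively map with the larger (post-doubling) mask ... */
	bucket = hashkey & highmask;

	/* ... and fall back to the smaller mask for buckets not yet created */
	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}

/*
 * Example: with maxbucket = 5, lowmask = 3, highmask = 7, a hash key of 14
 * first maps to bucket 6, which does not exist yet, so it is remapped to
 * 14 & 3 = 2.
 */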
Example #13
/*
 *	_hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup)
{
	Buffer		buf;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleDSize(*itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */

	/*
	 * Acquire shared split lock so we can compute the target bucket safely
	 * (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize((Page) metap))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %lu exceeds hash maximum %lu",
						(unsigned long) itemsz,
						(unsigned long) HashMaxItemSize((Page) metap)),
			errhint("Values larger than a buffer page cannot be indexed.")));

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* release lock on metapage, but keep pin since we'll need it again */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);

	_hash_droplock(rel, 0, HASH_SHARE);

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(pageopaque->hasho_bucket == bucket);

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		/*
		 * no space on this page; check for an overflow page
		 */
		BlockNumber nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.
			 */
			_hash_relbuf(rel, buf);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/* found page with enough space, so add the item here */
	(void) _hash_pgaddtup(rel, buf, itemsz, itup);

	/* write and release the modified page */
	_hash_wrtbuf(rel, buf);

	/* We can drop the bucket lock now */
	_hash_droplock(rel, blkno, HASH_SHARE);

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}
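
/*
 * Note on the do_expand test in _hash_doinsert() above: a split is attempted
 * once the tuple count exceeds the fill factor times the current number of
 * buckets (hashm_maxbucket + 1).  A standalone restatement of that check, as
 * a hypothetical helper rather than actual hash AM code:
 */
#include <stdbool.h>
#include <stdint.h>

static bool
sketch_needs_split(double ntuples, uint16_t ffactor, uint32_t maxbucket)
{
	/* keep in sync with _hash_doinsert() and _hash_expandtable() */
	return ntuples > (double) ffactor * (maxbucket + 1);
}

/*
 * Example: with ffactor = 75 and four buckets (maxbucket = 3), the check
 * first succeeds when ntuples reaches 301, since 301 > 75 * 4 = 300.
 */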
Example #14
/*
 *	_hash_addovflpage
 *
 *	Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *	On entry, the caller must hold a pin but no lock on 'buf'.	The pin is
 *	dropped before exiting (we assume the caller is not interested in 'buf'
 *	anymore).  The returned overflow page will be pinned and write-locked;
 *	it is guaranteed to be empty.
 *
 *	The caller must hold a pin, but no lock, on the metapage buffer.
 *	That buffer is returned in the same state.
 *
 *	The caller must hold at least share lock on the bucket, to ensure that
 *	no one else tries to compact the bucket meanwhile.	This guarantees that
 *	'buf' won't stop being part of the bucket while it's unlocked.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.	Additional overflow
 * pages might have been added to the bucket chain in between.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
	Buffer			ovflbuf;
	Page			page;
	Page			ovflpage;
	HashPageOpaque 	pageopaque;
	HashPageOpaque 	ovflopaque;
	/*CS3223*/ /*declare variables*/
	HashMetaPage	metap;
	Bucket			bucket;
	bool			first_ovflpage;
	int				index;
	int				bitIndexInElement;
	uint32			ovflElement;
	uint32		   *tempPointer;
	
	/* allocate and lock an empty overflow page */
	ovflbuf = _hash_getovflpage(rel, metabuf);
	
	/*CS3223*/
	metap = HashPageGetMeta(BufferGetPage(metabuf));
	/* find bucket number of primary page */
	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	bucket = pageopaque->hasho_bucket;
	
	/*
	 * Write-lock the tail page.  It is okay to hold two buffer locks here
	 * since there cannot be anyone else contending for access to ovflbuf.
	 */
	_hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE);

	/* probably redundant... */
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

	/* loop to find current tail page, in case someone else inserted too */
	for (;;)
	{
		BlockNumber nextblkno;

		page = BufferGetPage(buf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno))
			break;

		/* we assume we do not need to write the unmodified page */
		_hash_relbuf(rel, buf);

		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
	}

	/*CS3223*/
	/*
	 * If the chain's tail is still the primary bucket page, the page we are
	 * about to link in will be this bucket's first overflow page.  Remember
	 * that now, while 'buf' is still write-locked.
	 */
	first_ovflpage = (pageopaque->hasho_flag == LH_BUCKET_PAGE);

	/* now that we have correct backlink, initialize new overflow page */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;
	
	MarkBufferDirty(ovflbuf);

	/* logically chain overflow page to previous page */
	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
	_hash_wrtbuf(rel, buf);
	
	/*CS3223*/
	/*
	 * If the chain was just the primary page, this is the bucket's first
	 * overflow page; record that in the metapage overflow bitmap (see the
	 * standalone sketch after this function).  Only a pin is held on the
	 * metapage, so write-lock it; releasing the lock also marks it dirty.
	 */
	if (first_ovflpage)
	{
		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

		index			  = bucket / 32;
		bitIndexInElement = bucket % 32;
		ovflElement		  = (uint32) metap->ovflBkts[index];
		ovflElement		  = ovflElement | ((uint32) 1 << bitIndexInElement);
		tempPointer		  = &(metap->ovflBkts[index]);
		*tempPointer	  = ovflElement;

		_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
	}

	return ovflbuf;
}
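
/*
 * CS3223: the bitmap update above packs one bit per bucket into an array of
 * 32-bit words: bucket b lives in word b / 32, bit b % 32.  A standalone
 * sketch of the corresponding set/test operations (hypothetical helpers,
 * assuming ovflBkts is laid out as an array of uint32 words):
 */
#include <stdbool.h>
#include <stdint.h>

static void
sketch_set_ovfl_bit(uint32_t *ovflBkts, uint32_t bucket)
{
	ovflBkts[bucket / 32] |= ((uint32_t) 1 << (bucket % 32));
}

static bool
sketch_test_ovfl_bit(const uint32_t *ovflBkts, uint32_t bucket)
{
	return (ovflBkts[bucket / 32] & ((uint32_t) 1 << (bucket % 32))) != 0;
}

/*
 * Example: bucket 37 maps to word 1, bit 5, so setting it ORs 0x20 into
 * ovflBkts[1].
 */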
Example #15
/*
 * Attempt to expand the hash table by creating one new bucket.
 *
 * This will silently do nothing if it cannot get the needed locks.
 *
 * The caller should hold no locks on the hash index.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    Bucket		old_bucket;
    Bucket		new_bucket;
    uint32		spare_ndx;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    uint32		maxbucket;
    uint32		highmask;
    uint32		lowmask;

    /*
     * Obtain the page-zero lock to assert the right to begin a split (see
     * README).
     *
     * Note: deadlock should be impossible here. Our own backend could only be
     * holding bucket sharelocks due to stopped indexscans; those will not
     * block other holders of the page-zero lock, who are only interested in
     * acquiring bucket sharelocks themselves.	Exclusive bucket locks are
     * only taken here and in hashbulkdelete, and neither of these operations
     * needs any additional locks to complete.	(If, due to some flaw in this
     * reasoning, we manage to deadlock anyway, it's okay to error out; the
     * index will be left in a consistent state.)
     */
    _hash_getlock(rel, 0, HASH_EXCLUSIVE);

    /* Write-lock the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /*
     * Check to see if split is still needed; someone else might have already
     * done one while we waited for the lock.
     *
     * Make sure this stays in sync with _hash_doinsert()
     */
    if (metap->hashm_ntuples <=
            (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
        goto fail;

    /*
     * Can't split anymore if maxbucket has reached its maximum possible
     * value.
     *
     * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
     * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     * to half that because of overflow looping in _hash_log2() and
     * insufficient space in hashm_spares[].  It's moot anyway because an
     * index with 2^32 buckets would certainly overflow BlockNumber and hence
     * _hash_alloc_buckets() would fail, but if we supported buckets smaller
     * than a disk block then this would be an independent constraint.
     *
     * If you change this, see also the maximum initial number of buckets in
     * _hash_metapinit().
     */
    if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
        goto fail;

    /*
     * Determine which bucket is to be split, and attempt to lock the old
     * bucket.	If we can't get the lock, give up.
     *
     * The lock protects us against other backends, but not against our own
     * backend.  Must check for active scans separately.
     */
    new_bucket = metap->hashm_maxbucket + 1;

    old_bucket = (new_bucket & metap->hashm_lowmask);

    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);

    if (_hash_has_active_scan(rel, old_bucket))
        goto fail;

    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
        goto fail;

    /*
     * Likewise lock the new bucket (should never fail).
     *
     * Note: it is safe to compute the new bucket's blkno here, even though we
     * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
     * the current value of hashm_spares[hashm_ovflpoint] correctly shows
     * where we are going to put a new splitpoint's worth of buckets.
     */
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

    if (_hash_has_active_scan(rel, new_bucket))
        elog(ERROR, "scan in progress on supposedly new bucket");

    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
        elog(ERROR, "could not get lock on supposedly new bucket");

    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to allocate a new batch of bucket pages.
     */
    spare_ndx = _hash_log2(new_bucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);

        /*
         * The number of buckets in the new splitpoint is equal to the total
         * number already in existence, i.e. new_bucket.  Currently this maps
         * one-to-one to blocks required, but someday we may need a more
         * complicated calculation here.
         */
        if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
        {
            /* can't split due to BlockNumber overflow */
            _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
            _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
            goto fail;
        }
    }

    /*
     * Okay to proceed with split.	Update the metapage bucket mapping info.
     *
     * Since we are scribbling on the metapage data right in the shared
     * buffer, any failure in this next little bit leaves us with a big
     * problem: the metapage is effectively corrupt but could get written back
     * to disk.  We don't really expect any failure, but just to be sure,
     * establish a critical section.
     */
    START_CRIT_SECTION();

    metap->hashm_maxbucket = new_bucket;

    if (new_bucket > metap->hashm_highmask)
    {
        /* Starting a new doubling */
        metap->hashm_lowmask = metap->hashm_highmask;
        metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
    }

    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to adjust the hashm_spares[] array and
     * hashm_ovflpoint so that future overflow pages will be created beyond
     * this new batch of bucket pages.
     */
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
        metap->hashm_ovflpoint = spare_ndx;
    }

    /* Done mucking with metapage */
    END_CRIT_SECTION();

    /*
     * Copy bucket mapping info now; this saves re-accessing the meta page
     * inside _hash_splitbucket's inner loop.  Note that once we drop the
     * split lock, other splits could begin, so these values might be out of
     * date before _hash_splitbucket finishes.	That's okay, since all it
     * needs is to tell which of these two buckets to map hashkeys into.
     */
    maxbucket = metap->hashm_maxbucket;
    highmask = metap->hashm_highmask;
    lowmask = metap->hashm_lowmask;

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Release split lock; okay for other splits to occur now */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);

    /* Relocate records to the new bucket */
    _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
                      start_oblkno, start_nblkno,
                      maxbucket, highmask, lowmask);

    /* Release bucket locks, allowing others to access them */
    _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
    _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);

    return;

    /* Here if decide not to split or fail to acquire old bucket lock */
fail:

    /* We didn't write the metapage, so just drop lock */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Release split lock */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);
}
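
/*
 * Note on the metapage bookkeeping in _hash_expandtable() above: each split
 * bumps hashm_maxbucket, and when the new bucket number exceeds the current
 * highmask a new doubling starts, at which point lowmask takes the old
 * highmask's value and highmask is extended to cover the new range.  A
 * standalone sketch of that state transition (hypothetical struct and
 * helper, not the actual metapage layout):
 */
#include <stdint.h>

struct sketch_masks
{
    uint32_t	maxbucket;
    uint32_t	highmask;
    uint32_t	lowmask;
};

static void
sketch_split_one_bucket(struct sketch_masks *m)
{
    uint32_t	new_bucket = m->maxbucket + 1;

    m->maxbucket = new_bucket;
    if (new_bucket > m->highmask)
    {
        /* starting a new doubling, as in _hash_expandtable() above */
        m->lowmask = m->highmask;
        m->highmask = new_bucket | m->lowmask;
    }
}

/*
 * Example: starting from maxbucket = 3, highmask = 3, lowmask = 1 (four
 * buckets), the next split creates bucket 4; since 4 > 3, lowmask becomes 3
 * and highmask becomes 4 | 3 = 7, covering buckets 4..7 of the new doubling.
 */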
Example #16
/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	Buffer		buf;
	Page		page;
	OffsetNumber offnum;
	ItemPointer current;
	bool		res;

	/* Hash indexes are always lossy since we store only the hash code */
	scan->xs_recheck = true;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine to
	 * get the first item in the scan.
	 */
	current = &(so->hashso_curpos);
	if (ItemPointerIsValid(current))
	{
		/*
		 * An insertion into the current index page could have happened while
		 * we didn't have read lock on it.  Re-find our position by looking
		 * for the TID we previously returned.  (Because we hold share lock on
		 * the bucket, no deletions or splits could have occurred; therefore
		 * we can expect that the TID still exists in the current index page,
		 * at an offset >= where we were.)
		 */
		OffsetNumber maxoffnum;

		buf = so->hashso_curbuf;
		Assert(BufferIsValid(buf));
		page = BufferGetPage(buf);
		TestForOldSnapshot(scan->xs_snapshot, rel, page);
		maxoffnum = PageGetMaxOffsetNumber(page);
		for (offnum = ItemPointerGetOffsetNumber(current);
			 offnum <= maxoffnum;
			 offnum = OffsetNumberNext(offnum))
		{
			IndexTuple	itup;

			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
			if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
				break;
		}
		if (offnum > maxoffnum)
			elog(ERROR, "failed to re-find scan position within index \"%s\"",
				 RelationGetRelationName(rel));
		ItemPointerSetOffsetNumber(current, offnum);

		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so mark it by setting the LP_DEAD state in the item flags.
			 */
			ItemIdMarkDead(PageGetItemId(page, offnum));

			/*
			 * Since this can be redone later if needed, mark as a hint.
			 */
			MarkBufferDirtyHint(buf, true);
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}
	else
		res = _hash_first(scan, dir);

	/*
	 * Skip killed tuples if asked to.
	 */
	if (scan->ignore_killed_tuples)
	{
		while (res)
		{
			offnum = ItemPointerGetOffsetNumber(current);
			page = BufferGetPage(so->hashso_curbuf);
			if (!ItemIdIsDead(PageGetItemId(page, offnum)))
				break;
			res = _hash_next(scan, dir);
		}
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

	/* Return current heap TID on success */
	scan->xs_ctup.t_self = so->hashso_heappos;

	return res;
}
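
/*
 * Note on the re-find loop in hashgettuple() above: because a share lock is
 * held on the bucket, the previously returned TID can only have moved to a
 * higher offset on the same page, so a forward scan from the old offset is
 * sufficient.  A hypothetical refactoring of that loop into a helper (sketch
 * only; it uses the same page and item-pointer APIs as the code above and
 * assumes the usual hash AM includes):
 */
static OffsetNumber
sketch_refind_heaptid(Page page, OffsetNumber start, ItemPointer heaptid)
{
	OffsetNumber maxoffnum = PageGetMaxOffsetNumber(page);
	OffsetNumber offnum;

	for (offnum = start; offnum <= maxoffnum; offnum = OffsetNumberNext(offnum))
	{
		IndexTuple	itup;

		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
		if (ItemPointerEquals(heaptid, &itup->t_tid))
			return offnum;
	}

	return InvalidOffsetNumber; /* caller should treat this as an error */
}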
Example #17
/*
 *	_hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup)
{
	Buffer		buf = InvalidBuffer;
	Buffer		bucket_buf;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;
	BlockNumber oldblkno;
	bool		retry;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;
	uint32		maxbucket;
	uint32		highmask;
	uint32		lowmask;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleDSize(*itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */

restart_insert:
	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize((Page) metap))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds hash maximum %zu",
						itemsz, HashMaxItemSize((Page) metap)),
			errhint("Values larger than a buffer page cannot be indexed.")));

	oldblkno = InvalidBlockNumber;
	retry = false;

	/*
	 * Loop until we get a lock on the correct target bucket.
	 */
	for (;;)
	{
		/*
		 * Compute the target bucket number, and convert to block number.
		 */
		bucket = _hash_hashkey2bucket(hashkey,
									  metap->hashm_maxbucket,
									  metap->hashm_highmask,
									  metap->hashm_lowmask);

		blkno = BUCKET_TO_BLKNO(metap, bucket);

		/*
		 * Copy bucket mapping info now; refer the comment in
		 * _hash_expandtable where we copy this information before calling
		 * _hash_splitbucket to see why this is okay.
		 */
		maxbucket = metap->hashm_maxbucket;
		highmask = metap->hashm_highmask;
		lowmask = metap->hashm_lowmask;

		/* Release metapage lock, but keep pin. */
		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

		/*
		 * If the previous iteration of this loop locked the primary page of
		 * what is still the correct target bucket, we are done.  Otherwise,
		 * drop any old lock before acquiring the new one.
		 */
		if (retry)
		{
			if (oldblkno == blkno)
				break;
			_hash_relbuf(rel, buf);
		}

		/* Fetch and lock the primary bucket page for the target bucket */
		buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);

		/*
		 * Reacquire metapage lock and check that no bucket split has taken
		 * place while we were awaiting the bucket lock.
		 */
		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
		oldblkno = blkno;
		retry = true;
	}

	/* remember the primary bucket buffer to release the pin on it at end. */
	bucket_buf = buf;

	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(pageopaque->hasho_bucket == bucket);

	/*
	 * If this bucket is in the process of being split, try to finish the
	 * split before inserting, because that might create room for the
	 * insertion to proceed without allocating an additional overflow page.
	 * It's only interesting to finish the split if we're trying to insert
	 * into the bucket from which we're removing tuples (the "old" bucket),
	 * not if we're trying to insert into the bucket into which tuples are
	 * being moved (the "new" bucket).
	 */
	if (H_BUCKET_BEING_SPLIT(pageopaque) && IsBufferCleanupOK(buf))
	{
		/* release the lock on bucket buffer, before completing the split. */
		_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

		_hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
						   maxbucket, highmask, lowmask);

		/* release the pin on old and meta buffer.  retry for insert. */
		_hash_dropbuf(rel, buf);
		_hash_dropbuf(rel, metabuf);
		goto restart_insert;
	}

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		/*
		 * no space on this page; check for an overflow page
		 */
		BlockNumber nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.  we always
			 * release both the lock and pin if this is an overflow page, but
			 * only the lock if this is the primary bucket page, since the pin
			 * on the primary bucket must be retained throughout the scan.
			 */
			if (buf != bucket_buf)
				_hash_relbuf(rel, buf);
			else
				_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf, (buf == bucket_buf) ? true : false);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/* found page with enough space, so add the item here */
	(void) _hash_pgaddtup(rel, buf, itemsz, itup);

	/*
	 * dirty and release the modified page.  if the page we modified was an
	 * overflow page, we also need to separately drop the pin we retained on
	 * the primary bucket page.
	 */
	MarkBufferDirty(buf);
	_hash_relbuf(rel, buf);
	if (buf != bucket_buf)
		_hash_dropbuf(rel, bucket_buf);

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}
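
/*
 * Note on the MAXALIGN() step near the top of _hash_doinsert() above: the
 * tuple size is rounded up to the platform's maximum alignment so that the
 * free-space tests in the insertion loop agree with what PageAddItem() will
 * actually consume.  A standalone sketch of that rounding, assuming the
 * common 8-byte MAXIMUM_ALIGNOF:
 */
#include <stdint.h>

#define SKETCH_MAXALIGN(len)	(((uintptr_t) (len) + 7) & ~((uintptr_t) 7))

/*
 * Example: a 20-byte index tuple is treated as 24 bytes by every free-space
 * check in the insertion loop above.
 */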