/*
 *	_hash_addovflpage
 *
 *	Allocate an empty overflow page and link it onto the end of the bucket
 *	chain that 'buf' belongs to.
 *
 *	Contract for 'buf': the caller passes it pinned but unlocked.  We give
 *	up that pin before returning, on the assumption that the caller has no
 *	further use for it.  The buffer we return is the new overflow page; it
 *	is pinned, write-locked, and guaranteed to be empty.
 *
 *	Contract for 'metabuf': pinned but unlocked on entry, and handed back
 *	in exactly that state.
 *
 *	The caller must also hold at least a share lock on the bucket so that
 *	nobody compacts it concurrently; that is what keeps 'buf' a member of
 *	the bucket while we are not holding its buffer lock.
 *
 * NB: several backends may run this at once, so the page we return is not
 * necessarily the direct successor of the 'buf' that was passed in; other
 * overflow pages may have been chained on in the meantime.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
    Buffer		newbuf;
    Page		tailpage;
    Page		newpage;
    HashPageOpaque tailopaque;
    HashPageOpaque newopaque;

    /* obtain an empty overflow page; it comes back locked */
    newbuf = _hash_getovflpage(rel, metabuf);

    /*
     * Take the write lock on the presumed tail.  Holding two buffer locks
     * at once is acceptable here because no other backend can be competing
     * for the brand-new page.
     */
    _hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE);

    /* sanity check on the page type (likely unnecessary) */
    _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

    /*
     * Someone else may have appended pages since the caller looked, so walk
     * forward until we stand on the page that has no successor.
     */
    tailpage = BufferGetPage(buf);
    tailopaque = (HashPageOpaque) PageGetSpecialPointer(tailpage);
    while (BlockNumberIsValid(tailopaque->hasho_nextblkno))
    {
        BlockNumber followblk = tailopaque->hasho_nextblkno;

        /* the page was not modified, so releasing without a write is fine */
        _hash_relbuf(rel, buf);

        buf = _hash_getbuf(rel, followblk, HASH_WRITE, LH_OVERFLOW_PAGE);
        tailpage = BufferGetPage(buf);
        tailopaque = (HashPageOpaque) PageGetSpecialPointer(tailpage);
    }

    /* the true tail is known now, so fill in the new page's special space */
    newpage = BufferGetPage(newbuf);
    newopaque = (HashPageOpaque) PageGetSpecialPointer(newpage);
    newopaque->hasho_page_id = HASHO_PAGE_ID;
    newopaque->hasho_flag = LH_OVERFLOW_PAGE;
    newopaque->hasho_bucket = tailopaque->hasho_bucket;
    newopaque->hasho_nextblkno = InvalidBlockNumber;
    newopaque->hasho_prevblkno = BufferGetBlockNumber(buf);

    MarkBufferDirty(newbuf);

    /* hook the new page onto the old tail, then write and release the tail */
    tailopaque->hasho_nextblkno = BufferGetBlockNumber(newbuf);
    _hash_wrtbuf(rel, buf);

    return newbuf;
}
/* Example #2 */
/* 0 */
/*
 *	_hash_initbitmap()
 *
 *	 Initialize a new bitmap page and record it in the metapage's list of
 *	 bitmap pages.  The metapage has a write-lock upon entering the
 *	 function, and must be written by caller after return.
 *
 * 'rel' is the hash index relation.
 * 'metap' points at the write-locked metapage; its hashm_mapp[] array and
 *		hashm_nmaps counter are updated here.
 * 'blkno' is the block number of the new bitmap page.
 *
 * All bits in the new bitmap page are set to "1", indicating "in use".
 *
 * Raises ERROR if the metapage already lists HASH_MAX_BITMAPS bitmap pages,
 * or if extending the relation yields a block other than 'blkno'.  The
 * capacity check is made before the relation is extended, so running into
 * the limit does not leave behind a bitmap page that no metapage slot
 * refers to.
 */
void
_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
{
	Buffer		buf;
	Page		pg;
	HashPageOpaque op;
	uint32	   *freep;

	/*
	 * Make sure there is a free slot in the metapage's bitmap array before
	 * touching the relation.  (The metapage already has a write lock, so
	 * hashm_nmaps cannot change under us.)
	 */
	if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("out of overflow pages in hash index \"%s\"",
						RelationGetRelationName(rel))));

	/*
	 * It is okay to write-lock the new bitmap page while holding metapage
	 * write lock, because no one else could be contending for the new page.
	 * Also, the metapage lock makes it safe to extend the index using P_NEW,
	 * which we want to do to ensure the smgr's idea of the relation size
	 * stays in step with ours.
	 *
	 * There is some loss of concurrency in possibly doing I/O for the new
	 * page while holding the metapage lock, but this path is taken so seldom
	 * that it's not worth worrying about.
	 */
	buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
	if (BufferGetBlockNumber(buf) != blkno)
		elog(ERROR, "unexpected hash relation size: %u, should be %u",
			 BufferGetBlockNumber(buf), blkno);

	pg = BufferGetPage(buf);

	/* initialize the page */
	_hash_pageinit(pg, BufferGetPageSize(buf));
	op = (HashPageOpaque) PageGetSpecialPointer(pg);
	op->hasho_prevblkno = InvalidBlockNumber;
	op->hasho_nextblkno = InvalidBlockNumber;
	op->hasho_bucket = -1;		/* a bitmap page belongs to no bucket */
	op->hasho_flag = LH_BITMAP_PAGE;
	op->hasho_filler = HASHO_FILL;

	/* set all of the bits to 1 */
	freep = HashPageGetBitmap(pg);
	MemSet(freep, 0xFF, BMPGSZ_BYTE(metap));

	/* write out the new bitmap page (releasing write lock and pin) */
	_hash_wrtbuf(rel, buf);

	/* add the new bitmap page to the metapage's list of bitmaps */
	metap->hashm_mapp[metap->hashm_nmaps] = blkno;

	metap->hashm_nmaps++;
}
/*
 *	_hash_initbitmap()
 *
 *	 Initialize a new bitmap page and record it in the metapage's list of
 *	 bitmap pages.  The metapage has a write-lock upon entering the
 *	 function, and must be written by caller after return.
 *
 * 'rel' is the hash index relation.
 * 'metap' points at the write-locked metapage; its hashm_mapp[] array and
 *		hashm_nmaps counter are updated here.
 * 'blkno' is the block number of the new bitmap page.
 * 'forkNum' is the relation fork, passed through to _hash_getnewbuf.
 *
 * All bits in the new bitmap page are set to "1", indicating "in use".
 *
 * Raises ERROR if the metapage already lists HASH_MAX_BITMAPS bitmap pages.
 * That check is made before the new page is obtained, so running into the
 * limit does not leave behind a bitmap page that no metapage slot refers to.
 */
void
_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno,
                 ForkNumber forkNum)
{
    Buffer		buf;
    Page		pg;
    HashPageOpaque op;
    uint32	   *freep;

    /*
     * Make sure there is a free slot in the metapage's bitmap array before
     * touching the relation.  (The metapage already has a write lock, so
     * hashm_nmaps cannot change under us.)
     */
    if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("out of overflow pages in hash index \"%s\"",
                        RelationGetRelationName(rel))));

    /*
     * It is okay to write-lock the new bitmap page while holding metapage
     * write lock, because no one else could be contending for the new page.
     * Also, the metapage lock makes it safe to extend the index using
     * _hash_getnewbuf.
     *
     * There is some loss of concurrency in possibly doing I/O for the new
     * page while holding the metapage lock, but this path is taken so seldom
     * that it's not worth worrying about.
     */
    buf = _hash_getnewbuf(rel, blkno, forkNum);
    pg = BufferGetPage(buf);

    /* initialize the page's special space */
    op = (HashPageOpaque) PageGetSpecialPointer(pg);
    op->hasho_prevblkno = InvalidBlockNumber;
    op->hasho_nextblkno = InvalidBlockNumber;
    op->hasho_bucket = -1;		/* a bitmap page belongs to no bucket */
    op->hasho_flag = LH_BITMAP_PAGE;
    op->hasho_page_id = HASHO_PAGE_ID;

    /* set all of the bits to 1 */
    freep = HashPageGetBitmap(pg);
    MemSet(freep, 0xFF, BMPGSZ_BYTE(metap));

    /* write out the new bitmap page (releasing write lock and pin) */
    _hash_wrtbuf(rel, buf);

    /* add the new bitmap page to the metapage's list of bitmaps */
    metap->hashm_mapp[metap->hashm_nmaps] = blkno;

    metap->hashm_nmaps++;
}
/*
 *	_hash_freeovflpage() -
 *
 *	Unlink an overflow page from its bucket's chain and mark it free in the
 *	free-space bitmap.  'ovflbuf' must be write-locked on entry; it is
 *	released before we return.
 *
 *	Because VACUUM is a caller, 'bstrategy' is accepted and applied to the
 *	fetches of the neighbouring bucket pages.
 *
 *	The result is the block number of the page that used to follow the
 *	freed page in the chain, or InvalidBlockNumber when there was none.
 *
 *	NB: the caller must not be holding a lock on the metapage, nor on
 *	either chain neighbour of the page being freed.  It should also hold an
 *	exclusive lock on the bucket.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
                   BufferAccessStrategy bstrategy)
{
    Page		deadpage;
    HashPageOpaque deadopaque;
    BlockNumber deadblk;
    BlockNumber leftblk;
    BlockNumber rightblk;
    BlockNumber bitmapblk;
    Buffer		metabuf;
    Buffer		bitmapbuf;
    HashMetaPage metap;
    uint32	   *bitwords;
    uint32		bitno;
    int32		mapidx;
    int32		mapbit;
    Bucket owner PG_USED_FOR_ASSERTS_ONLY;

    /* Capture what we need from the page before it is wiped */
    _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
    deadblk = BufferGetBlockNumber(ovflbuf);
    deadpage = BufferGetPage(ovflbuf);
    deadopaque = (HashPageOpaque) PageGetSpecialPointer(deadpage);
    leftblk = deadopaque->hasho_prevblkno;
    rightblk = deadopaque->hasho_nextblkno;
    owner = deadopaque->hasho_bucket;

    /*
     * Clear the whole page as a debugging aid, then write and release it.
     * (If the page were left non-zero, the Assert in _hash_pageinit() would
     * cause trouble when the page gets reused.)
     */
    MemSet(deadpage, 0, BufferGetPageSize(ovflbuf));
    _hash_wrtbuf(rel, ovflbuf);

    /*
     * Repair the doubly-linked chain on both sides of the removed page.
     * The exclusive bucket lock means nobody else can interfere.
     */

    /* predecessor now points forward past the removed page */
    if (BlockNumberIsValid(leftblk))
    {
        Buffer		leftbuf;
        Page		leftpage;
        HashPageOpaque leftopaque;

        leftbuf = _hash_getbuf_with_strategy(rel,
                                             leftblk,
                                             HASH_WRITE,
                                             LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                             bstrategy);
        leftpage = BufferGetPage(leftbuf);
        leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage);

        Assert(leftopaque->hasho_bucket == owner);
        leftopaque->hasho_nextblkno = rightblk;
        _hash_wrtbuf(rel, leftbuf);
    }

    /* successor now points backward past the removed page */
    if (BlockNumberIsValid(rightblk))
    {
        Buffer		rightbuf;
        Page		rightpage;
        HashPageOpaque rightopaque;

        rightbuf = _hash_getbuf_with_strategy(rel,
                                              rightblk,
                                              HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);
        rightpage = BufferGetPage(rightbuf);
        rightopaque = (HashPageOpaque) PageGetSpecialPointer(rightpage);

        Assert(rightopaque->hasho_bucket == owner);
        rightopaque->hasho_prevblkno = leftblk;
        _hash_wrtbuf(rel, rightbuf);
    }

    /* From here on bstrategy is deliberately not used (metapage, bitmap) */

    /* The metapage tells us which bitmap page covers the freed block */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /* Translate the block number into (bitmap page index, bit within page) */
    bitno = blkno_to_bitno(metap, deadblk);
    mapidx = bitno >> BMPG_SHIFT(metap);
    mapbit = bitno & BMPG_MASK(metap);

    if (mapidx >= metap->hashm_nmaps)
        elog(ERROR, "invalid overflow bit number %u", bitno);
    bitmapblk = metap->hashm_mapp[mapidx];

    /* Drop the metapage lock (keeping the pin) while working on the bitmap */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Turn the page's bit off: the overflow page is now free */
    bitmapbuf = _hash_getbuf(rel, bitmapblk, HASH_WRITE, LH_BITMAP_PAGE);
    bitwords = HashPageGetBitmap(BufferGetPage(bitmapbuf));
    Assert(ISSET(bitwords, mapbit));
    CLRBIT(bitwords, mapbit);
    _hash_wrtbuf(rel, bitmapbuf);

    /* Re-lock the metapage for writing so hashm_firstfree can be lowered */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    if (bitno >= metap->hashm_firstfree)
    {
        /* firstfree is already at or below this bit; metapage unchanged */
        _hash_relbuf(rel, metabuf);
    }
    else
    {
        /* this page is now the lowest-numbered free one */
        metap->hashm_firstfree = bitno;
        _hash_wrtbuf(rel, metabuf);
    }

    return rightblk;
}
/*
 *	_hash_getovflpage()
 *
 *	Find an available overflow page and return it.	The returned buffer
 *	is pinned and write-locked, and has had _hash_pageinit() applied,
 *	but it is caller's responsibility to fill the special space.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * That buffer is left in the same state at exit.
 *
 * 'rel' is the hash index; 'metabuf' is the buffer holding its metapage.
 *
 * Two outcomes are possible:
 *
 * 1. Recycle.  The free-space bitmap pages are scanned, one uint32 word at
 *	  a time, from hashm_firstfree up to the last existing overflow page.
 *	  If a word that is not ALL_SET is found, control jumps to 'found': a
 *	  bit in that word is set and the matching block is fetched with
 *	  _hash_getinitbuf().
 *
 * 2. Extend.  If the scan runs off the end, the page at bit number
 *	  hashm_spares[splitnum] is obtained with _hash_getnewbuf().  When the
 *	  last existing overflow bit was the final bit of its bitmap page, a new
 *	  bitmap page is created first via _hash_initbitmap().
 *
 * The metapage write lock is dropped while each bitmap page is examined and
 * re-taken afterwards, so hashm_firstfree is only advanced if it still has
 * the value seen at the start (orig_firstfree).
 */
static Buffer
_hash_getovflpage(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    Buffer		mapbuf = 0;
    Buffer		newbuf;
    BlockNumber blkno;
    uint32		orig_firstfree;
    uint32		splitnum;
    uint32	   *freep = NULL;
    uint32		max_ovflpg;
    uint32		bit;
    uint32		first_page;
    uint32		last_bit;
    uint32		last_page;
    uint32		i,
                j;

    /* Get exclusive lock on the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /*
     * start search at hashm_firstfree
     *
     * The starting bit number is decomposed into scan state:
     *   i   = index of the bitmap page (into hashm_mapp[])
     *   j   = index of the uint32 word within that bitmap page
     *   bit = bit number within the page, rounded down to the first bit of
     *         word j (assumes BITS_PER_MAP is a power of two -- TODO confirm)
     */
    orig_firstfree = metap->hashm_firstfree;
    first_page = orig_firstfree >> BMPG_SHIFT(metap);
    bit = orig_firstfree & BMPG_MASK(metap);
    i = first_page;
    j = bit / BITS_PER_MAP;
    bit &= ~(BITS_PER_MAP - 1);

    /* outer loop iterates once per bitmap page */
    for (;;)
    {
        BlockNumber mapblkno;
        Page		mappage;
        uint32		last_inpage;

        /*
         * want to end search with the last existing overflow page
         *
         * These values are re-read from the metapage on every iteration
         * (the metapage lock is released and re-taken inside the loop).
         * splitnum and last_bit are also used after the loop exits.
         */
        splitnum = metap->hashm_ovflpoint;
        max_ovflpg = metap->hashm_spares[splitnum] - 1;
        last_page = max_ovflpg >> BMPG_SHIFT(metap);
        last_bit = max_ovflpg & BMPG_MASK(metap);

        /* ran past the last bitmap page in use: nothing free, must extend */
        if (i > last_page)
            break;

        Assert(i < metap->hashm_nmaps);
        mapblkno = metap->hashm_mapp[i];

        /* on the final bitmap page only bits up to last_bit are meaningful */
        if (i == last_page)
            last_inpage = last_bit;
        else
            last_inpage = BMPGSZ_BIT(metap) - 1;

        /*
         * Release exclusive lock on metapage while reading bitmap page.
         * (HASH_READ as the "from" state is what this file uses when the
         * page was not modified; compare the HASH_WRITE releases below.)
         */
        _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

        mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
        mappage = BufferGetPage(mapbuf);
        freep = HashPageGetBitmap(mappage);

        /* examine one bitmap word per step; any word not ALL_SET is a hit */
        for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
        {
            if (freep[j] != ALL_SET)
                goto found;
        }

        /* No free space here, try to advance to next map page */
        _hash_relbuf(rel, mapbuf);
        i++;
        j = 0;					/* scan from start of next map page */
        bit = 0;

        /* Reacquire exclusive lock on the meta page */
        _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
    }

    /*
     * No free pages --- have to extend the relation to add an overflow page.
     * First, check to see if we have to add a new bitmap page too.
     *
     * We get here only via the 'break' above, i.e. with the metapage write
     * lock held and no bitmap page locked.
     */
    if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
    {
        /*
         * We create the new bitmap page with all pages marked "in use".
         * Actually two pages in the new bitmap's range will exist
         * immediately: the bitmap page itself, and the following page which
         * is the one we return to the caller.	Both of these are correctly
         * marked "in use".  Subsequent pages do not exist yet, but it is
         * convenient to pre-mark them as "in use" too.
         */
        bit = metap->hashm_spares[splitnum];
        _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
        /* count the bitmap page itself as a used spare page */
        metap->hashm_spares[splitnum]++;
    }
    else
    {
        /*
         * Nothing to do here; since the page will be past the last used page,
         * we know its bitmap bit was preinitialized to "in use".
         */
    }

    /* Calculate address of the new overflow page */
    bit = metap->hashm_spares[splitnum];
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
     * relation length stays in sync with ours.  XXX It's annoying to do this
     * with metapage write lock held; would be better to use a lock that
     * doesn't block incoming searches.
     */
    newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);

    /* count the new overflow page as a used spare page */
    metap->hashm_spares[splitnum]++;

    /*
     * Adjust hashm_firstfree to avoid redundant searches.	But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
        metap->hashm_firstfree = bit + 1;

    /* Write updated metapage and release lock, but not pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    return newbuf;

found:
    /*
     * Reached from inside the scan loop: the bitmap page in mapbuf is still
     * write-locked, the metapage lock is not held, and freep[j] is the word
     * that was found not to be ALL_SET.
     */

    /*
     * convert bit to bit number within page ('bit' was word-aligned; the
     * helper presumably returns the offset of the first clear bit in the
     * word -- confirm against _hash_firstfreebit)
     */
    bit += _hash_firstfreebit(freep[j]);

    /* mark page "in use" in the bitmap */
    SETBIT(freep, bit);
    _hash_wrtbuf(rel, mapbuf);

    /* Reacquire exclusive lock on the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    /* convert bit to absolute bit number */
    bit += (i << BMPG_SHIFT(metap));

    /* Calculate address of the recycled overflow page */
    blkno = bitno_to_blkno(metap, bit);

    /*
     * Adjust hashm_firstfree to avoid redundant searches.	But don't risk
     * changing it if someone moved it while we were searching bitmap pages.
     */
    if (metap->hashm_firstfree == orig_firstfree)
    {
        metap->hashm_firstfree = bit + 1;

        /* Write updated metapage and release lock, but not pin */
        _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
    }
    else
    {
        /* We didn't change the metapage, so no need to write */
        _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
    }

    /* Fetch, init, and return the recycled page */
    return _hash_getinitbuf(rel, blkno);
}
/* Example #6 */
/* 0 */
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 *
 * Parameters:
 *	rel			 - the hash index being split
 *	metabuf		 - metapage buffer; here it is only handed on to
 *				   _hash_addovflpage
 *	obucket		 - number of the bucket being split; used for the sanity
 *				   Assert and for the final _hash_squeezebucket call
 *	nbucket		 - number of the new bucket; stored in the new primary
 *				   page's special space
 *	start_oblkno - block number of the old bucket's primary page
 *	start_nblkno - block number of the new bucket's primary page
 *	maxbucket, highmask, lowmask
 *				 - handed to _hash_hashkey2bucket to decide, per tuple,
 *				   which of the two buckets it now belongs to
 */
static void
_hash_splitbucket(Relation rel,
				  Buffer metabuf,
				  Bucket obucket,
				  Bucket nbucket,
				  BlockNumber start_oblkno,
				  BlockNumber start_nblkno,
				  uint32 maxbucket,
				  uint32 highmask,
				  uint32 lowmask)
{
	Bucket		bucket;
	Buffer		obuf;
	Buffer		nbuf;
	BlockNumber oblkno;
	BlockNumber nblkno;
	bool		null;
	Datum		datum;
	HashItem	hitem;
	HashPageOpaque oopaque;
	HashPageOpaque nopaque;
	IndexTuple	itup;
	Size		itemsz;
	OffsetNumber ooffnum;
	OffsetNumber noffnum;
	OffsetNumber omaxoffnum;
	Page		opage;
	Page		npage;
	TupleDesc	itupdesc = RelationGetDescr(rel);

	/*
	 * It should be okay to simultaneously write-lock pages from each
	 * bucket, since no one else can be trying to acquire buffer lock
	 * on pages of either bucket.
	 */
	oblkno = start_oblkno;
	nblkno = start_nblkno;
	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
	nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
	opage = BufferGetPage(obuf);
	npage = BufferGetPage(nbuf);

	/* the old bucket's first page must be a primary bucket page */
	_hash_checkpage(rel, opage, LH_BUCKET_PAGE);
	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

	/* initialize the new bucket's primary page */
	_hash_pageinit(npage, BufferGetPageSize(nbuf));
	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
	nopaque->hasho_prevblkno = InvalidBlockNumber;
	nopaque->hasho_nextblkno = InvalidBlockNumber;
	nopaque->hasho_bucket = nbucket;
	nopaque->hasho_flag = LH_BUCKET_PAGE;
	nopaque->hasho_filler = HASHO_FILL;

	/*
	 * Partition the tuples in the old bucket between the old bucket and the
	 * new bucket, advancing along the old bucket's overflow bucket chain
	 * and adding overflow pages to the new bucket as needed.
	 */
	ooffnum = FirstOffsetNumber;
	omaxoffnum = PageGetMaxOffsetNumber(opage);
	for (;;)
	{
		/*
		 * at each iteration through this loop, each of these variables
		 * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
		 */

		/* check if we're at the end of the page */
		if (ooffnum > omaxoffnum)
		{
			/* at end of page, but check for an(other) overflow page */
			oblkno = oopaque->hasho_nextblkno;
			if (!BlockNumberIsValid(oblkno))
				break;
			/*
			 * we ran out of tuples on this particular page, but we
			 * have more overflow pages; advance to next page.
			 */
			_hash_wrtbuf(rel, obuf);

			/* every page after the first must be an overflow page */
			obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
			opage = BufferGetPage(obuf);
			_hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
			oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
			ooffnum = FirstOffsetNumber;
			omaxoffnum = PageGetMaxOffsetNumber(opage);
			continue;
		}

		/*
		 * Re-hash the tuple to determine which bucket it now belongs in.
		 *
		 * It is annoying to call the hash function while holding locks,
		 * but releasing and relocking the page for each tuple is unappealing
		 * too.
		 */
		hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
		itup = &(hitem->hash_itup);
		/* the key is attribute 1 of the index tuple; it is not expected to be null */
		datum = index_getattr(itup, 1, itupdesc, &null);
		Assert(!null);

		bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
									  maxbucket, highmask, lowmask);

		if (bucket == nbucket)
		{
			/*
			 * insert the tuple into the new bucket.  if it doesn't fit on
			 * the current page in the new bucket, we must allocate a new
			 * overflow page and place the tuple on that page instead.
			 */

			/*
			 * size of the whole hash item: the index tuple's own size plus
			 * whatever HashItemData carries beyond IndexTupleData
			 */
			itemsz = IndexTupleDSize(hitem->hash_itup)
				+ (sizeof(HashItemData) - sizeof(IndexTupleData));

			itemsz = MAXALIGN(itemsz);

			if (PageGetFreeSpace(npage) < itemsz)
			{
				/* write out nbuf and drop lock, but keep pin */
				_hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
				/*
				 * chain to a new overflow page; from here on nbuf/npage
				 * refer to that page.  NOTE(review): the returned buffer is
				 * presumably pinned and write-locked, as the final
				 * _hash_wrtbuf(rel, nbuf) below relies on -- confirm
				 * against _hash_addovflpage.
				 */
				nbuf = _hash_addovflpage(rel, metabuf, nbuf);
				npage = BufferGetPage(nbuf);
				_hash_checkpage(rel, npage, LH_OVERFLOW_PAGE);
				/* we don't need nopaque within the loop */
			}

			/* append the item after the current last item on the new page */
			noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
			if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));

			/*
			 * now delete the tuple from the old bucket.  after this
			 * section of code, 'ooffnum' will actually point to the
			 * ItemId to which we would point if we had advanced it before
			 * the deletion (PageIndexTupleDelete repacks the ItemId
			 * array).	this also means that 'omaxoffnum' is exactly one
			 * less than it used to be, so we really can just decrement it
			 * instead of calling PageGetMaxOffsetNumber.
			 */
			PageIndexTupleDelete(opage, ooffnum);
			omaxoffnum = OffsetNumberPrev(omaxoffnum);
		}
		else
		{
			/*
			 * the tuple stays on this page.  we didn't move anything, so
			 * we didn't delete anything and therefore we don't have to
			 * change 'omaxoffnum'.
			 */
			Assert(bucket == obucket);
			ooffnum = OffsetNumberNext(ooffnum);
		}
	}

	/*
	 * We're at the end of the old bucket chain, so we're done partitioning
	 * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
	 * tuples remaining in the old bucket (including the overflow pages) are
	 * packed as tightly as possible.  The new bucket is already tight.
	 */

	/* write and release the last page touched in each bucket */
	_hash_wrtbuf(rel, obuf);
	_hash_wrtbuf(rel, nbuf);

	_hash_squeezebucket(rel, obucket, start_oblkno);
}
/* Example #7 */
/* 0 */
/*
 *	_hash_metapinit() -- Set up a brand-new hash index: the metadata page,
 *				the two initial buckets, and the first free-space
 *				bitmap page.
 *
 * The relation must be empty on entry; otherwise an ERROR is raised.
 *
 * Locking is deliberately relaxed here because nobody else can be using the
 * index yet.  In particular, we do not observe the usual rule against
 * holding more than one buffer lock at a time.
 */
void
_hash_metapinit(Relation rel)
{
	HashMetaPage meta;
	HashPageOpaque opaque;
	Buffer		metabuf;
	Buffer		bucketbuf;
	Page		page;
	int32		key_width;
	int32		entry_width;
	int32		fill;
	uint16		shift;
	uint16		bucketno;

	/* refuse to initialize a relation that already has blocks */
	if (RelationGetNumberOfBlocks(rel) != 0)
		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
			 RelationGetRelationName(rel));

	/*
	 * Pick the target fill factor (tuples per bucket) so that pages end up
	 * roughly three-quarters full.  For fixed-width key types this is
	 * exact; for variable-width types it rests on the average-width
	 * estimate.
	 */
	key_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
								RelationGetDescr(rel)->attrs[0]->atttypmod);
	entry_width = MAXALIGN(sizeof(HashItemData)) + MAXALIGN(key_width) +
		sizeof(ItemIdData);		/* line pointer counts too */
	fill = (BLCKSZ * 3 / 4) / entry_width;
	/* never go below a floor of 10 */
	fill = (fill < 10) ? 10 : fill;

	/* lay out the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
	page = BufferGetPage(metabuf);
	_hash_pageinit(page, BufferGetPageSize(metabuf));

	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	opaque->hasho_filler = HASHO_FILL;
	opaque->hasho_flag = LH_META_PAGE;
	opaque->hasho_bucket = -1;
	opaque->hasho_nextblkno = InvalidBlockNumber;
	opaque->hasho_prevblkno = InvalidBlockNumber;

	meta = (HashMetaPage) page;

	meta->hashm_magic = HASH_MAGIC;
	meta->hashm_version = HASH_VERSION;
	meta->hashm_ntuples = 0;
	meta->hashm_nmaps = 0;
	meta->hashm_ffactor = fill;
	meta->hashm_bsize = BufferGetPageSize(metabuf);

	/*
	 * Search downward from log2(page size) for the largest power of two
	 * that still fits in a page after the page header and the hash special
	 * space are subtracted; that becomes the bitmap array size.
	 */
	shift = _hash_log2(meta->hashm_bsize);
	while (shift > 0 &&
		   !((1 << shift) <= (meta->hashm_bsize -
							  (MAXALIGN(sizeof(PageHeaderData)) +
							   MAXALIGN(sizeof(HashPageOpaqueData))))))
		--shift;
	Assert(shift > 0);
	meta->hashm_bmsize = 1 << shift;
	meta->hashm_bmshift = shift + BYTE_TO_BIT;
	Assert((1 << BMPG_SHIFT(meta)) == (BMPG_MASK(meta) + 1));

	meta->hashm_procid = index_getprocid(rel, 1, HASHPROC);

	/*
	 * The index starts out with buckets 0 and 1 in physical blocks 1 and 2;
	 * block 3 holds the first free-space bitmap page.
	 */
	meta->hashm_lowmask = 1;	/* nbuckets - 1 */
	meta->hashm_maxbucket = 1;	/* nbuckets - 1 */
	meta->hashm_highmask = 3;	/* (nbuckets << 1) - 1 */

	MemSet((char *) meta->hashm_spares, 0, sizeof(meta->hashm_spares));
	MemSet((char *) meta->hashm_mapp, 0, sizeof(meta->hashm_mapp));

	meta->hashm_spares[1] = 1;	/* only spare page so far: first bitmap */
	meta->hashm_ovflpoint = 1;
	meta->hashm_firstfree = 0;

	/* create the primary pages of the two starting buckets */
	for (bucketno = 0; bucketno < 2; bucketno++)
	{
		bucketbuf = _hash_getbuf(rel, BUCKET_TO_BLKNO(meta, bucketno), HASH_WRITE);
		page = BufferGetPage(bucketbuf);
		_hash_pageinit(page, BufferGetPageSize(bucketbuf));
		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		opaque->hasho_filler = HASHO_FILL;
		opaque->hasho_flag = LH_BUCKET_PAGE;
		opaque->hasho_bucket = bucketno;
		opaque->hasho_nextblkno = InvalidBlockNumber;
		opaque->hasho_prevblkno = InvalidBlockNumber;
		_hash_wrtbuf(rel, bucketbuf);
	}

	/*
	 * The first bitmap page has to come after the two bucket pages exist,
	 * or smgr will complain.
	 */
	_hash_initbitmap(rel, meta, 3);

	/* finally, write out and release the metapage */
	_hash_wrtbuf(rel, metabuf);
}
/* Example #8 */
/* 0 */
/* File: hash.c    Project: michaelpq/postgres */
/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * It would be very bad if this function cleaned a page while some other
 * backend was in the midst of scanning it, because hashgettuple assumes
 * that the next valid TID will be greater than or equal to the current
 * valid TID.  There can't be any concurrent scans in progress when we first
 * enter this function because of the cleanup lock we hold on the primary
 * bucket page, but as soon as we release that lock, there might be.  We
 * handle that by conspiring to prevent those scans from passing our cleanup
 * scan.  To do that, we lock the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
{
    BlockNumber blkno;
    Buffer		buf;
    Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool		bucket_dirty = false;

    blkno = bucket_blkno;
    buf = bucket_buf;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                     lowmask, maxbucket);

    /* Scan each page in bucket */
    for (;;)
    {
        HashPageOpaque opaque;
        OffsetNumber offno;
        OffsetNumber maxoffno;
        Buffer		next_buf;
        Page		page;
        OffsetNumber deletable[MaxOffsetNumber];
        int			ndeletable = 0;
        bool		retain_pin = false;
        bool		curr_page_dirty = false;

        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
                offno <= maxoffno;
                offno = OffsetNumberNext(offno))
        {
            ItemPointer htup;
            IndexTuple	itup;
            Bucket		bucket;
            bool		kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To remove the dead tuples, we strictly want to rely on results
             * of callback function.  refer btvacuumpage for detailed reason.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that are moved by split. */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask,
                                              lowmask);
                /* mark the item for deletion */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to either belong to current bucket or
                     * new_bucket.  This is ensured because we don't allow
                     * further splits from bucket that contains garbage. See
                     * comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;
        else
            retain_pin = false;

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;
            curr_page_dirty = true;
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);

        /*
         * release the lock on previous page after acquiring the lock on next
         * page
         */
        if (curr_page_dirty)
        {
            if (retain_pin)
                _hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK);
            else
                _hash_wrtbuf(rel, buf);
            curr_page_dirty = false;
        }
        else if (retain_pin)
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;
    }

    /*
     * lock the bucket page to clear the garbage flag and squeeze the bucket.
     * if the current buffer is same as bucket buffer, then we already have
     * lock on bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        _hash_chgbufaccess(rel, bucket_buf, HASH_NOLOCK, HASH_WRITE);
    }

    /*
     * Clear the garbage flag from bucket after deleting the tuples that are
     * moved by split.  We purposefully clear the flag before squeeze bucket,
     * so that after restart, vacuum shouldn't again try to delete the moved
     * by split tuples.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;
        Page		page;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    }

    /*
     * If we have deleted anything, try to compact free space.  For squeezing
     * the bucket, we must have a cleanup lock, else it can impact the
     * ordering of tuples for a scan that has started before it.
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        _hash_chgbufaccess(rel, bucket_buf, HASH_WRITE, HASH_NOLOCK);
}
示例#9
0
/*
 *	_hash_doinsert() -- Store one HashItem in the hash index.
 *
 *		Called from the public interface routines hashbuild and hashinsert
 *		with a completely filled-in hashitem; the key datum is taken from
 *		the index tuple embedded in the hashitem.
 *
 *		Returns a palloc'd InsertIndexResult whose item pointer holds the
 *		block number and offset at which the item was placed.
 */
InsertIndexResult
_hash_doinsert(Relation rel, HashItem hitem)
{
	IndexTuple	tuple;
	Buffer		metabuf;
	HashMetaPage metap;
	Buffer		curbuf;
	Page		curpage;
	HashPageOpaque curopaque;
	BlockNumber bucket_blkno;
	BlockNumber placed_blkno;
	OffsetNumber placed_off;
	InsertIndexResult result;
	Size		itemsz;
	uint32		hashkey;
	Bucket		bucket;
	Datum		keydatum;
	bool		keyisnull;
	bool		split_needed;

	/*
	 * Hash the key before taking any lock, so the hash function never runs
	 * while we are holding one.
	 */
	tuple = &(hitem->hash_itup);
	if (rel->rd_rel->relnatts != 1)
		elog(ERROR, "hash indexes support only one index key");
	keydatum = index_getattr(tuple, 1, RelationGetDescr(rel), &keyisnull);
	Assert(!keyisnull);
	hashkey = _hash_datum2hashkey(rel, keydatum);

	/*
	 * Work out the on-page size of the item as well.  PageAddItem would
	 * MAXALIGN it anyway, but the free-space tests below must use the same
	 * figure.
	 */
	itemsz = MAXALIGN(IndexTupleDSize(hitem->hash_itup)
					  + (sizeof(HashItemData) - sizeof(IndexTupleData)));

	/*
	 * Take the split lock in share mode; it protects the bucket computation
	 * (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Fetch and sanity-check the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);

	/*
	 * Reject items that cannot fit on any hash page.  (TOAST-like handling
	 * could someday go here.)  itemsz does not yet count the ItemId.
	 */
	if (itemsz > HashMaxItemSize((Page) metap))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %lu exceeds hash maximum %lu",
						(unsigned long) itemsz,
						(unsigned long) HashMaxItemSize((Page) metap))));

	/* Map the hash key to a bucket, and the bucket to its primary block */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);
	bucket_blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* Unlock the metapage but stay pinned; it is needed again below */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/*
	 * Share-lock the target bucket first, and only then let go of the split
	 * lock.
	 */
	_hash_getlock(rel, bucket_blkno, HASH_SHARE);
	_hash_droplock(rel, 0, HASH_SHARE);

	/* Start at the bucket's primary page */
	curbuf = _hash_getbuf(rel, bucket_blkno, HASH_WRITE);
	curpage = BufferGetPage(curbuf);
	_hash_checkpage(rel, curpage, LH_BUCKET_PAGE);
	curopaque = (HashPageOpaque) PageGetSpecialPointer(curpage);
	Assert(curopaque->hasho_bucket == bucket);

	/* Walk the bucket chain until some page has room for the item */
	for (;;)
	{
		BlockNumber following;

		if (PageGetFreeSpace(curpage) >= itemsz)
			break;

		following = curopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(following))
		{
			/*
			 * End of chain and still no room: extend the chain.  First
			 * drop our lock (the page was not modified) while keeping the
			 * pin, as _hash_addovflpage expects.
			 */
			_hash_chgbufaccess(rel, curbuf, HASH_READ, HASH_NOLOCK);

			curbuf = _hash_addovflpage(rel, metabuf, curbuf);
			curpage = BufferGetPage(curbuf);

			/* a brand-new overflow page must be able to hold the item */
			Assert(PageGetFreeSpace(curpage) >= itemsz);
		}
		else
		{
			/*
			 * Step to the existing overflow page; the test at the top of
			 * the loop decides whether it has room.
			 */
			_hash_relbuf(rel, curbuf);
			curbuf = _hash_getbuf(rel, following, HASH_WRITE);
			curpage = BufferGetPage(curbuf);
		}

		_hash_checkpage(rel, curpage, LH_OVERFLOW_PAGE);
		curopaque = (HashPageOpaque) PageGetSpecialPointer(curpage);
		Assert(curopaque->hasho_bucket == bucket);
	}

	/* curpage has room: place the item and remember where it went */
	placed_off = _hash_pgaddtup(rel, curbuf, itemsz, hitem);
	placed_blkno = BufferGetBlockNumber(curbuf);

	/* write and release the page we changed */
	_hash_wrtbuf(rel, curbuf);

	/* the bucket lock is no longer needed */
	_hash_droplock(rel, bucket_blkno, HASH_SHARE);

	/*
	 * Bump the tuple count under a metapage write lock, and decide while we
	 * hold it whether the table has grown enough to warrant a split.
	 */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap->hashm_ntuples += 1;

	/* Keep this test in sync with _hash_expandtable() */
	split_needed = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	/* Write out the metapage and unlock it, retaining the pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	if (split_needed)
		_hash_expandtable(rel, metabuf);

	/* Done with the metapage: release the pin */
	_hash_dropbuf(rel, metabuf);

	/* Build the result for the caller */
	result = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
	ItemPointerSet(&(result->pointerData), placed_blkno, placed_off);

	return result;
}
示例#10
0
/*
 *	_hash_metapinit() -- Lay down the initial contents of a hash index:
 *				the metapage, the starting bucket pages, and the first
 *				bitmap page.
 *
 * num_tuples is an estimate of how many tuples will be loaded initially; it
 * drives the choice of the starting number of buckets, which is the value
 * returned.
 *
 * Locking is deliberately relaxed here because nobody else can be using the
 * index yet; in particular the usual rule against holding several buffer
 * locks at once is not observed.
 */
uint32
_hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
{
    Buffer		metabuf;
    Page		metapage;
    HashMetaPage metap;
    HashPageOpaque opaque;
    int32		key_width;
    int32		tuple_width;
    int32		ffactor;
    double		wanted_buckets;
    uint32		nbuckets;
    uint32		nbuckets_log2;
    uint32		bmlog2;
    uint32		bucketno;

    /* refuse to clobber an index that already has blocks */
    if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * Derive the target fill factor, expressed in tuples per bucket, from
     * the page usage implied by the fillfactor setting.  The stored datum is
     * a fixed-width uint32 hash key, so the per-tuple footprint is exact.
     */
    key_width = sizeof(uint32);
    tuple_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(key_width) +
                  sizeof(ItemIdData);		/* line pointer included */
    ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / tuple_width;
    /* never go below a sane minimum */
    ffactor = (ffactor < 10) ? 10 : ffactor;

    /*
     * Pick the starting bucket count: enough buckets for the estimated tuple
     * count at the chosen fill factor, rounded up to a power of 2, never
     * fewer than 2, and capped at 0x40000000 (see _hash_expandtable() for
     * the reasoning behind the cap).
     */
    wanted_buckets = num_tuples / ffactor;
    if (wanted_buckets >= (double) 0x40000000)
        nbuckets = 0x40000000;
    else if (wanted_buckets <= 2.0)
        nbuckets = 2;
    else
        nbuckets = ((uint32) 1) << _hash_log2((uint32) wanted_buckets);

    nbuckets_log2 = _hash_log2(nbuckets);
    Assert(nbuckets == (((uint32) 1) << nbuckets_log2));
    Assert(nbuckets_log2 < HASH_MAX_SPLITPOINTS);

    /*
     * The metapage, the N bucket pages and the first bitmap page are created
     * in block order through _hash_getnewbuf so that smgrextend() is called
     * for each, keeping smgr's notion of the physical length correct.
     */
    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
    metapage = BufferGetPage(metabuf);

    opaque = (HashPageOpaque) PageGetSpecialPointer(metapage);
    opaque->hasho_prevblkno = InvalidBlockNumber;
    opaque->hasho_nextblkno = InvalidBlockNumber;
    opaque->hasho_bucket = -1;
    opaque->hasho_flag = LH_META_PAGE;
    opaque->hasho_page_id = HASHO_PAGE_ID;

    metap = HashPageGetMeta(metapage);

    metap->hashm_magic = HASH_MAGIC;
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = HashGetMaxBitmapSize(metapage);

    /* largest power-of-2 bitmap array size that fits within hashm_bsize */
    bmlog2 = _hash_log2(metap->hashm_bsize);
    while (bmlog2 > 0 && (1 << bmlog2) > metap->hashm_bsize)
        --bmlog2;
    Assert(bmlog2 > 0);
    metap->hashm_bmsize = 1 << bmlog2;
    metap->hashm_bmshift = bmlog2 + BYTE_TO_BIT;
    Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));

    /*
     * Record the OID of the primary hash support function.  Nothing reads
     * hashm_procid during normal operation; it is kept for forensic value.
     */
    metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

    /*
     * Buckets 0 .. N-1 live in physical blocks 1 .. N, and the first
     * freespace bitmap page goes in block N+1.  With N a power of 2 the
     * masks follow directly:
     */
    metap->hashm_lowmask = nbuckets - 1;
    metap->hashm_maxbucket = metap->hashm_lowmask;
    metap->hashm_highmask = (nbuckets << 1) - 1;

    MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

    /* Account for one spare page following the initial splitpoints */
    metap->hashm_spares[nbuckets_log2] = 1;
    metap->hashm_ovflpoint = nbuckets_log2;
    metap->hashm_firstfree = 0;

    /*
     * Drop the metapage buffer lock for the duration of bucket creation.
     * While it is held we are in interrupt holdoff, which would make the
     * CHECK_FOR_INTERRUPTS below pointless; long-held buffer locks can also
     * block the bgwriter.
     */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Create the N empty primary bucket pages */
    for (bucketno = 0; bucketno < nbuckets; bucketno++)
    {
        Buffer		bucketbuf;
        Page		bucketpage;

        /* N may be large, so stay responsive to interrupts */
        CHECK_FOR_INTERRUPTS();

        bucketbuf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, bucketno), forkNum);
        bucketpage = BufferGetPage(bucketbuf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(bucketpage);
        opaque->hasho_prevblkno = InvalidBlockNumber;
        opaque->hasho_nextblkno = InvalidBlockNumber;
        opaque->hasho_bucket = bucketno;
        opaque->hasho_flag = LH_BUCKET_PAGE;
        opaque->hasho_page_id = HASHO_PAGE_ID;
        _hash_wrtbuf(rel, bucketbuf);
    }

    /* Take the metapage buffer lock again */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    /* Create the first bitmap page, right after the buckets */
    _hash_initbitmap(rel, metap, nbuckets + 1, forkNum);

    /* Finished: write and release the metapage */
    _hash_wrtbuf(rel, metabuf);

    return nbuckets;
}
示例#11
0
文件: hashpage.c 项目: ricky-wu/gpdb
/*
 *	_hash_metapinit() -- Lay down the initial contents of a hash index:
 *				the metapage, the two starting buckets, and the first
 *				bitmap page.
 *
 * Locking is deliberately relaxed here because nobody else can be using the
 * index yet; in particular the usual rule against holding several buffer
 * locks at once is not observed.
 */
void
_hash_metapinit(Relation rel)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	Buffer		metabuf;
	Page		metapage;
	HashMetaPage metap;
	HashPageOpaque opaque;
	int32		key_width;
	int32		tuple_width;
	int32		ffactor;
	uint16		bmlog2;
	uint16		bucketno;

	/* refuse to clobber an index that already has blocks */
	if (RelationGetNumberOfBlocks(rel) != 0)
		elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
			 RelationGetRelationName(rel));

	/*
	 * Derive the target fill factor, expressed in tuples per bucket, from
	 * the page usage implied by the fillfactor setting.  The key width comes
	 * from get_typavgwidth, so it is exact for fixed-width types and an
	 * estimate for variable-width ones.
	 */
	key_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
								RelationGetDescr(rel)->attrs[0]->atttypmod);
	tuple_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(key_width) +
		sizeof(ItemIdData);		/* line pointer included */
	ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / tuple_width;
	/* never go below a sane minimum */
	ffactor = (ffactor < 10) ? 10 : ffactor;

	/*
	 * The metapage, the two bucket pages and the first bitmap page are
	 * created in block order through _hash_getnewbuf so that smgrextend()
	 * is called for each, keeping smgr's notion of the physical length
	 * correct.  All of it happens with the MirroredLock held.
	 */
	MIRROREDLOCK_BUFMGR_LOCK;

	metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE);
	metapage = BufferGetPage(metabuf);
	_hash_pageinit(metapage, BufferGetPageSize(metabuf));

	opaque = (HashPageOpaque) PageGetSpecialPointer(metapage);
	opaque->hasho_prevblkno = InvalidBlockNumber;
	opaque->hasho_nextblkno = InvalidBlockNumber;
	opaque->hasho_bucket = -1;
	opaque->hasho_flag = LH_META_PAGE;
	opaque->hasho_filler = HASHO_FILL;

	metap = (HashMetaPage) metapage;

	metap->hashm_magic = HASH_MAGIC;
	metap->hashm_version = HASH_VERSION;
	metap->hashm_ntuples = 0;
	metap->hashm_nmaps = 0;
	metap->hashm_ffactor = ffactor;
	metap->hashm_bsize = BufferGetPageSize(metabuf);

	/*
	 * Largest power-of-2 bitmap array size that fits in a page once the
	 * page header and the hash special space are subtracted.
	 */
	bmlog2 = _hash_log2(metap->hashm_bsize);
	while (bmlog2 > 0 &&
		   (1 << bmlog2) > (metap->hashm_bsize -
							(MAXALIGN(sizeof(PageHeaderData)) +
							 MAXALIGN(sizeof(HashPageOpaqueData)))))
		--bmlog2;
	Assert(bmlog2 > 0);
	metap->hashm_bmsize = 1 << bmlog2;
	metap->hashm_bmshift = bmlog2 + BYTE_TO_BIT;
	Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));

	metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

	/*
	 * The index starts with buckets 0 and 1 in physical blocks 1 and 2; the
	 * first freespace bitmap page goes in block 3.
	 */
	metap->hashm_lowmask = 1;	/* nbuckets - 1 */
	metap->hashm_maxbucket = metap->hashm_lowmask;
	metap->hashm_highmask = 3;	/* (nbuckets << 1) - 1 */

	MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
	MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

	metap->hashm_spares[1] = 1; /* only spare so far: the first bitmap page */
	metap->hashm_ovflpoint = 1;
	metap->hashm_firstfree = 0;

	/* Create the two empty primary bucket pages */
	for (bucketno = 0; bucketno <= 1; bucketno++)
	{
		Buffer		bucketbuf;
		Page		bucketpage;

		bucketbuf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, bucketno), HASH_WRITE);
		bucketpage = BufferGetPage(bucketbuf);
		_hash_pageinit(bucketpage, BufferGetPageSize(bucketbuf));
		opaque = (HashPageOpaque) PageGetSpecialPointer(bucketpage);
		opaque->hasho_prevblkno = InvalidBlockNumber;
		opaque->hasho_nextblkno = InvalidBlockNumber;
		opaque->hasho_bucket = bucketno;
		opaque->hasho_flag = LH_BUCKET_PAGE;
		opaque->hasho_filler = HASHO_FILL;
		_hash_wrtbuf(rel, bucketbuf);
	}

	/* Create the first bitmap page in block 3 */
	_hash_initbitmap(rel, metap, 3);

	/* Finished: write and release the metapage */
	_hash_wrtbuf(rel, metabuf);

	MIRROREDLOCK_BUFMGR_UNLOCK;
}
示例#12
0
/*
 *	_hash_squeezebucket(rel, bucket, bucket_blkno)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages. When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last bucket in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this guy clean it up.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 *
 *	Parameters:
 *		rel			 - the hash index relation
 *		bucket		 - bucket number; used only in assertions that every page
 *					   visited belongs to this bucket
 *		bucket_blkno - block number of the bucket's primary page, where the
 *					   write cursor starts
 *
 *	Every exit path releases both the read and the write buffer.
 */
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno)
{
	Buffer		wbuf;
	Buffer		rbuf = 0;		/* not read before the first assignment below */
	BlockNumber wblkno;
	BlockNumber rblkno;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;
	OffsetNumber woffnum;
	OffsetNumber roffnum;
	IndexTuple	itup;
	Size		itemsz;

	/*
	 * start squeezing into the base bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
	_hash_checkpage(rel, wbuf, LH_BUCKET_PAGE);
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		_hash_relbuf(rel, wbuf);
		return;
	}

	/*
	 * find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.
	 *
	 * On the first pass ropaque aliases wopaque, so the primary page stays
	 * locked as the write page; on later passes the overflow page we just
	 * left is released before the next one is fetched.
	 */
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		if (ropaque != wopaque)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
		_hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 *
	 * roffnum stays at FirstOffsetNumber throughout: each pass moves the
	 * first remaining item of the read page, and the delete below shifts
	 * the next item into that slot.
	 */
	roffnum = FirstOffsetNumber;
	for (;;)
	{
		/* this test is needed in case page is empty on entry */
		if (roffnum <= PageGetMaxOffsetNumber(rpage))
		{
			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			itemsz = IndexTupleDSize(*itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item.  Exit if we reach the read page.
			 */
			while (PageGetFreeSpace(wpage) < itemsz)
			{
				Assert(!PageIsEmpty(wpage));

				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				/* write out and release the full write page before moving on */
				_hash_wrtbuf(rel, wbuf);

				/* the cursors have met: nothing further can be moved */
				if (rblkno == wblkno)
				{
					/* wbuf is already released */
					_hash_wrtbuf(rel, rbuf);
					return;
				}

				wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
				_hash_checkpage(rel, wbuf, LH_OVERFLOW_PAGE);
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
			}

			/*
			 * we have found room so insert on the "write" page.
			 */
			woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
			if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, LP_USED)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));

			/*
			 * delete the tuple from the "read" page. PageIndexTupleDelete
			 * repacks the ItemId array, so 'roffnum' will be "advanced" to
			 * the "next" ItemId.
			 */
			PageIndexTupleDelete(rpage, roffnum);
		}

		/*
		 * if the "read" page is now empty because of the deletion (or because
		 * it was empty when we got to it), free it.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  However, in that case we are done anyway, so we can
		 * simply drop the write lock before calling _hash_freeovflpage.
		 */
		if (PageIsEmpty(rpage))
		{
			/* remember the predecessor before the read page is freed */
			rblkno = ropaque->hasho_prevblkno;
			Assert(BlockNumberIsValid(rblkno));

			/* are we freeing the page adjacent to wbuf? */
			if (rblkno == wblkno)
			{
				/* yes, so release wbuf lock first */
				_hash_wrtbuf(rel, wbuf);
				/* free this overflow page (releases rbuf) */
				_hash_freeovflpage(rel, rbuf);
				/* done */
				return;
			}

			/* free this overflow page, then get the previous one */
			_hash_freeovflpage(rel, rbuf);

			rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
			_hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE);
			rpage = BufferGetPage(rbuf);
			ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
			Assert(ropaque->hasho_bucket == bucket);

			/* restart at the first item of the new read page */
			roffnum = FirstOffsetNumber;
		}
	}

	/* NOTREACHED */
}
示例#13
0
/*
 *	_hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 *
 *		rel  - the hash index relation
 *		itup - the index tuple to insert; its hash key is read back out of
 *			   the tuple via _hash_get_indextuple_hashkey()
 *
 *		Raises ERROR (ERRCODE_PROGRAM_LIMIT_EXCEEDED) if the tuple is larger
 *		than HashMaxItemSize() allows.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup)
{
	Buffer		buf;
	Buffer		metabuf;
	Page		metapage;
	HashMetaPage metap;
	BlockNumber blkno;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleDSize(*itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */

	/*
	 * Acquire shared split lock so we can compute the target bucket safely
	 * (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/*
	 * Read the metapage.  Keep the Page pointer separately from metap:
	 * metap is whatever HashPageGetMeta() derives from the page and need not
	 * be the address of the page itself.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metapage = BufferGetPage(metabuf);
	metap = HashPageGetMeta(metapage);

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * HashMaxItemSize() takes a Page, so it must be handed the real page
	 * pointer rather than metap cast to Page.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize(metapage))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %lu exceeds hash maximum %lu",
						(unsigned long) itemsz,
						(unsigned long) HashMaxItemSize(metapage)),
			errhint("Values larger than a buffer page cannot be indexed.")));

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* release lock on metapage, but keep pin since we'll need it again */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);

	_hash_droplock(rel, 0, HASH_SHARE);

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(pageopaque->hasho_bucket == bucket);

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		/*
		 * no space on this page; check for an overflow page
		 */
		BlockNumber nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.
			 */
			_hash_relbuf(rel, buf);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/* found page with enough space, so add the item here */
	(void) _hash_pgaddtup(rel, buf, itemsz, itup);

	/* write and release the modified page */
	_hash_wrtbuf(rel, buf);

	/* We can drop the bucket lock now */
	_hash_droplock(rel, blkno, HASH_SHARE);

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}
示例#14
0
/*
 *	_hash_freeovflpage() -
 *
 *	Remove this overflow page from its bucket's chain, and mark the page as
 *	free.  On entry, ovflbuf is write-locked; it is released before exiting.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 *
 *	Returns the block number of the page that followed the given page
 *	in the bucket, or InvalidBlockNumber if no following page.
 *
 *	NB: caller must not hold lock on metapage, nor on either page that's
 *	adjacent in the bucket chain.  The caller had better hold exclusive lock
 *	on the bucket, too.
 *
 *	The sections tagged CS3223 are a local modification that maintains a
 *	per-bucket bit array (metap->ovflBkts), apparently meant to record
 *	whether a bucket has overflow pages -- TODO confirm.
 *
 *	NOTE(review): in this copy the function body is cut off before its
 *	closing brace; the comments below describe only the visible part.
 */
BlockNumber
_hash_freeovflpage(Relation rel, Buffer ovflbuf,
				   BufferAccessStrategy bstrategy)
{
	HashMetaPage metap;
	Buffer		metabuf;
	Buffer		mapbuf;
	BlockNumber ovflblkno;
	BlockNumber prevblkno;
	BlockNumber blkno;
	BlockNumber nextblkno;
	HashPageOpaque ovflopaque;
	Page		ovflpage;
	Page		mappage;
	uint32	   *freep;
	uint32		ovflbitno;
	int32		bitmappage,
				bitmapbit;
	/* CS3223: locals used by the chain walk and the ovflBkts update below */
	int 			index;
	int 			bitIndexInElement;
	uint32 			ovflElement;
	uint32 			temp, temp2;
	int 			i;
	BlockNumber 	nextblkno_temp;
	HashPageOpaque	pageopaque;
	Page			page;
	uint32 			*tempPointer;
	
	Bucket bucket PG_USED_FOR_ASSERTS_ONLY;

	/* Get information from the doomed page */
	_hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
	ovflblkno = BufferGetBlockNumber(ovflbuf);
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	nextblkno = ovflopaque->hasho_nextblkno;
	prevblkno = ovflopaque->hasho_prevblkno;
	bucket = ovflopaque->hasho_bucket;
	
	/*
	 * CS3223: walk forward from the doomed page to the end of the chain,
	 * counting hops in i.
	 *
	 * NOTE(review): i is read here without any visible prior assignment, so
	 * both the loop condition and the later "i == 0" test use an
	 * indeterminate value.
	 *
	 * NOTE(review): each hop releases ovflbuf and reassigns it to the next
	 * page.  If the loop advances at all, ovflbuf no longer refers to the
	 * doomed page afterwards, while ovflpage still points into the buffer
	 * that was released -- verify this is intended.
	 */
	
	while (i>=0)
	{
		//nextblkno_temp;

		page = BufferGetPage(ovflbuf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno_temp = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno_temp))
			break;

		/* we assume we do not need to write the unmodified page */
		_hash_relbuf(rel, ovflbuf);

		ovflbuf = _hash_getbuf(rel, nextblkno_temp, HASH_WRITE, LH_OVERFLOW_PAGE);
		/* CS3223: one more page seen past the starting page */
		i++;
	}

	/*
	 * Zero the page for debugging's sake; then write and release it. (Note:
	 * if we failed to zero the page here, we'd have problems with the Assert
	 * in _hash_pageinit() when the page is reused.)
	 *
	 * NOTE(review): the MemSet targets ovflpage (captured before the loop
	 * above) but the write/release targets the current ovflbuf; these differ
	 * whenever the loop advanced.
	 */
	MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf));
	_hash_wrtbuf(rel, ovflbuf);

	/*
	 * Fix up the bucket chain.  this is a doubly-linked list, so we must fix
	 * up the bucket chain members behind and ahead of the overflow page being
	 * deleted.  No concurrency issues since we hold exclusive lock on the
	 * entire bucket.
	 */
	if (BlockNumberIsValid(prevblkno))
	{
		Buffer		prevbuf = _hash_getbuf_with_strategy(rel,
														 prevblkno,
														 HASH_WRITE,
										   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
														 bstrategy);
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		Assert(prevopaque->hasho_bucket == bucket);
		/* predecessor now skips over the removed page */
		prevopaque->hasho_nextblkno = nextblkno;
		_hash_wrtbuf(rel, prevbuf);
	}
	if (BlockNumberIsValid(nextblkno))
	{
		Buffer		nextbuf = _hash_getbuf_with_strategy(rel,
														 nextblkno,
														 HASH_WRITE,
														 LH_OVERFLOW_PAGE,
														 bstrategy);
		Page		nextpage = BufferGetPage(nextbuf);
		HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

		Assert(nextopaque->hasho_bucket == bucket);
		/* successor now points back past the removed page */
		nextopaque->hasho_prevblkno = prevblkno;
		_hash_wrtbuf(rel, nextbuf);
	}
	
	/*
	 * CS3223: update this bucket's bit in metap->ovflBkts (32 buckets per
	 * array element) when the walk above counted zero hops.
	 *
	 * NOTE(review): no assignment to metap is visible before this point, so
	 * the dereference below uses an uninitialized pointer unless it is set
	 * somewhere not shown.
	 *
	 * NOTE(review): the inline comment says the bit goes from 1 to 0, but
	 * the final step ORs temp2 into ovflElement, and OR can only set bits --
	 * verify the intended arithmetic.
	 *
	 * NOTE(review): bucket is declared PG_USED_FOR_ASSERTS_ONLY yet is used
	 * here in non-assert code.
	 */
	if (i == 0) {		//length of the bucket chain is 0, no overflow bucket for that primary bucket
		index 			  = bucket / 32;
		bitIndexInElement = bucket % 32;
		ovflElement  	  = metap->ovflBkts[index];
		temp 			  = ovflElement >> bitIndexInElement;
		temp 			 -= 1;		//bit changed from 1 to 0
		temp2 			  = temp << bitIndexInElement;
		ovflElement 	  = ovflElement | temp2;
		tempPointer       = &(metap->ovflBkts[index]);
		*tempPointer      = ovflElement;
	}
示例#15
0
/*
 *	_hash_addovflpage
 *
 *	Add an overflow page to the bucket whose last page is pointed to by 'buf'.
 *
 *	On entry, the caller must hold a pin but no lock on 'buf'.	The pin is
 *	dropped before exiting (we assume the caller is not interested in 'buf'
 *	anymore).  The returned overflow page will be pinned and write-locked;
 *	it is guaranteed to be empty.
 *
 *	The caller must hold a pin, but no lock, on the metapage buffer.
 *	That buffer is returned in the same state.
 *
 *	The caller must hold at least share lock on the bucket, to ensure that
 *	no one else tries to compact the bucket meanwhile.	This guarantees that
 *	'buf' won't stop being part of the bucket while it's unlocked.
 *
 * NB: since this could be executed concurrently by multiple processes,
 * one should not assume that the returned overflow page will be the
 * immediate successor of the originally passed 'buf'.	Additional overflow
 * pages might have been added to the bucket chain in between.
 *
 * CS3223: when the page being extended is the bucket's primary page, i.e.
 * the bucket is gaining its first overflow page, the bucket's bit in
 * metap->ovflBkts (32 buckets per array element) is set.
 */
Buffer
_hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
{
	Buffer		ovflbuf;
	Page		page;
	Page		ovflpage;
	HashPageOpaque pageopaque;
	HashPageOpaque ovflopaque;
	HashMetaPage metap;
	Bucket		bucket;
	bool		first_ovfl;

	/* allocate and lock an empty overflow page */
	ovflbuf = _hash_getovflpage(rel, metabuf);

	/*
	 * Write-lock the tail page.  It is okay to hold two buffer locks here
	 * since there cannot be anyone else contending for access to ovflbuf.
	 */
	_hash_chgbufaccess(rel, buf, HASH_NOLOCK, HASH_WRITE);

	/* probably redundant... */
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

	/* loop to find current tail page, in case someone else inserted too */
	for (;;)
	{
		BlockNumber nextblkno;

		page = BufferGetPage(buf);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		nextblkno = pageopaque->hasho_nextblkno;

		if (!BlockNumberIsValid(nextblkno))
			break;

		/* we assume we do not need to write the unmodified page */
		_hash_relbuf(rel, buf);

		buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
	}

	/*
	 * Capture what we need from the tail page while it is still locked; it
	 * is released by _hash_wrtbuf below.  The tail is the primary bucket
	 * page exactly when the bucket has no overflow pages yet, which is a
	 * more reliable test than counting loop iterations (the caller normally
	 * passes in the tail already, so the loop usually runs zero hops).
	 */
	bucket = pageopaque->hasho_bucket;
	first_ovfl = (pageopaque->hasho_flag & LH_BUCKET_PAGE) != 0;

	/* now that we have correct backlink, initialize new overflow page */
	ovflpage = BufferGetPage(ovflbuf);
	ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
	ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
	ovflopaque->hasho_nextblkno = InvalidBlockNumber;
	ovflopaque->hasho_bucket = bucket;
	ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
	ovflopaque->hasho_page_id = HASHO_PAGE_ID;

	MarkBufferDirty(ovflbuf);

	/* logically chain overflow page to previous page */
	pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
	_hash_wrtbuf(rel, buf);

	/*
	 * CS3223: record that this bucket now has overflow pages.  The metapage
	 * is a shared buffer, so take its write lock for the update; releasing
	 * from HASH_WRITE is the same "write out and drop lock, but keep pin"
	 * call used elsewhere for the metapage, leaving it pinned and unlocked
	 * as our contract requires.
	 *
	 * NOTE(review): we still hold the ovflbuf lock while briefly locking the
	 * metapage; confirm this ordering against the README's locking rules.
	 */
	if (first_ovfl)
	{
		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

		metap = HashPageGetMeta(BufferGetPage(metabuf));
		/* unsigned shift: a plain int "1 << 31" would be undefined */
		metap->ovflBkts[bucket / 32] |= ((uint32) 1) << (bucket % 32);

		_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
	}

	return ovflbuf;
}
示例#16
0
/*
 * _hash_splitbucket -- redistribute the tuples of 'obucket' between
 * 'obucket' and the newly created 'nbucket'
 *
 * The old bucket is a primary bucket page followed by zero or more overflow
 * (bucket chain) pages.  Every page of that chain is visited once: each
 * tuple whose hash key now maps to 'nbucket' is copied into the new bucket
 * (which is extended with overflow pages as needed) and then deleted from
 * the old page.  Finally the old bucket is handed to _hash_squeezebucket so
 * that the free space left behind is compressed out.
 *
 * start_oblkno and start_nblkno are the block numbers of the primary pages
 * of the old and new bucket respectively; maxbucket, highmask and lowmask
 * are passed unchanged to _hash_hashkey2bucket to classify each tuple.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    Buffer		old_buf;
    Buffer		new_buf;
    Page		old_page;
    Page		new_page;
    HashPageOpaque old_opaque;
    HashPageOpaque new_opaque;
    BlockNumber next_old_blkno;

    /*
     * Write-lock the old bucket's primary page, then obtain the new bucket's
     * primary page.  Holding page locks in both buckets at once should be
     * okay, since no one else can be trying to acquire buffer lock on pages
     * of either bucket.
     */
    old_buf = _hash_getbuf(rel, start_oblkno, HASH_WRITE, LH_BUCKET_PAGE);
    old_page = BufferGetPage(old_buf);
    old_opaque = (HashPageOpaque) PageGetSpecialPointer(old_page);

    new_buf = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM);
    new_page = BufferGetPage(new_buf);

    /* set up the special space of the new bucket's primary page */
    new_opaque = (HashPageOpaque) PageGetSpecialPointer(new_page);
    new_opaque->hasho_bucket = nbucket;
    new_opaque->hasho_flag = LH_BUCKET_PAGE;
    new_opaque->hasho_page_id = HASHO_PAGE_ID;
    new_opaque->hasho_prevblkno = InvalidBlockNumber;
    new_opaque->hasho_nextblkno = InvalidBlockNumber;

    /*
     * Walk the old bucket chain, one iteration per old page, moving every
     * tuple that now belongs to the new bucket.
     */
    do
    {
        OffsetNumber moved[MaxOffsetNumber];
        int			nmoved = 0;
        OffsetNumber last_off = PageGetMaxOffsetNumber(old_page);
        OffsetNumber off;

        for (off = FirstOffsetNumber;
             off <= last_off;
             off = OffsetNumberNext(off))
        {
            IndexTuple	tuple;
            Size		tuple_size;
            Bucket		target;

            /*
             * The hash key is stored in the index tuple itself; use it to
             * work out which bucket the tuple belongs to after the split.
             */
            tuple = (IndexTuple) PageGetItem(old_page,
                                             PageGetItemId(old_page, off));
            target = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(tuple),
                                          maxbucket, highmask, lowmask);

            if (target != nbucket)
            {
                /* tuple stays where it is */
                Assert(target == obucket);
                continue;
            }

            tuple_size = MAXALIGN(IndexTupleDSize(*tuple));

            /*
             * If the current page of the new bucket cannot take the tuple,
             * chain a fresh overflow page and continue filling that one.
             */
            if (PageGetFreeSpace(new_page) < tuple_size)
            {
                /* write out the full page and drop its lock, keeping the pin */
                _hash_chgbufaccess(rel, new_buf, HASH_WRITE, HASH_NOLOCK);
                new_buf = _hash_addovflpage(rel, metabuf, new_buf);
                new_page = BufferGetPage(new_buf);
            }

            /*
             * _hash_pgaddtup keeps the page ordered by hash key.  That may
             * shuffle item pointers repeatedly; collecting the tuples and
             * sorting them once would be a possible future improvement.
             */
            (void) _hash_pgaddtup(rel, new_buf, tuple_size, tuple);

            /* remember to remove the tuple from the old page */
            moved[nmoved++] = off;
        }

        /* grab the successor before the old page's buffer is given up */
        next_old_blkno = old_opaque->hasho_nextblkno;

        /* finished with this old page: delete whatever was moved off it */
        if (nmoved == 0)
            _hash_relbuf(rel, old_buf);
        else
        {
            PageIndexMultiDelete(old_page, moved, nmoved);
            _hash_wrtbuf(rel, old_buf);
        }

        /* step to the next overflow page of the old bucket, if any */
        if (BlockNumberIsValid(next_old_blkno))
        {
            old_buf = _hash_getbuf(rel, next_old_blkno, HASH_WRITE,
                                   LH_OVERFLOW_PAGE);
            old_page = BufferGetPage(old_buf);
            old_opaque = (HashPageOpaque) PageGetSpecialPointer(old_page);
        }
    } while (BlockNumberIsValid(next_old_blkno));

    /*
     * The whole old chain has been partitioned.  The new bucket is already
     * tight; the old one (including its overflow pages) still has to be
     * packed, which _hash_squeezebucket takes care of.
     */
    _hash_wrtbuf(rel, new_buf);

    _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}
示例#17
0
/*
 *	_hash_squeezebucket(rel, bucket, bucket_blkno, bstrategy)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages. When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last page in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	Parameters:
 *		rel			 - relation handed to every buffer helper called here
 *		bucket		 - bucket number; only used in assertions that each
 *					   overflow page visited belongs to this bucket
 *		bucket_blkno - block number of the bucket's primary page, where
 *					   squeezing starts
 *		bstrategy	 - buffer access strategy passed to every page fetch
 *					   and to _hash_freeovflpage
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this guy clean it up.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 */
void
_hash_squeezebucket(Relation rel,
                    Bucket bucket,
                    BlockNumber bucket_blkno,
                    BufferAccessStrategy bstrategy)
{
    BlockNumber wblkno;
    BlockNumber rblkno;
    Buffer		wbuf;
    Buffer		rbuf;
    Page		wpage;
    Page		rpage;
    HashPageOpaque wopaque;
    HashPageOpaque ropaque;
    bool		wbuf_dirty;		/* has the current write page been added to? */

    /*
     * start squeezing into the base bucket page.
     */
    wblkno = bucket_blkno;
    wbuf = _hash_getbuf_with_strategy(rel,
                                      wblkno,
                                      HASH_WRITE,
                                      LH_BUCKET_PAGE,
                                      bstrategy);
    wpage = BufferGetPage(wbuf);
    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

    /*
     * if there aren't any overflow pages, there's nothing to squeeze.
     */
    if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
    {
        _hash_relbuf(rel, wbuf);
        return;
    }

    /*
     * Find the last page in the bucket chain by starting at the base bucket
     * page and working forward.  Note: we assume that a hash bucket chain is
     * usually smaller than the buffer ring being used by VACUUM, else using
     * the access strategy here would be counterproductive.
     *
     * ropaque starts out aliasing the write page so that the first iteration
     * follows the primary page's next-link; rbuf stays InvalidBuffer until
     * the first overflow page has been fetched, so the write page is never
     * released by this loop.
     */
    rbuf = InvalidBuffer;
    ropaque = wopaque;
    do
    {
        rblkno = ropaque->hasho_nextblkno;
        /* let go of the previous intermediate page before taking the next */
        if (rbuf != InvalidBuffer)
            _hash_relbuf(rel, rbuf);
        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));

    /*
     * squeeze the tuples.  Each iteration of the outer loop drains one
     * "read" page; the function returns from inside the loop once the read
     * and write positions meet.
     */
    wbuf_dirty = false;
    for (;;)
    {
        OffsetNumber roffnum;
        OffsetNumber maxroffnum;
        OffsetNumber deletable[MaxOffsetNumber];	/* offsets moved off rpage */
        int			ndeletable = 0;

        /* Scan each tuple in "read" page */
        maxroffnum = PageGetMaxOffsetNumber(rpage);
        for (roffnum = FirstOffsetNumber;
                roffnum <= maxroffnum;
                roffnum = OffsetNumberNext(roffnum))
        {
            IndexTuple	itup;
            Size		itemsz;

            itup = (IndexTuple) PageGetItem(rpage,
                                            PageGetItemId(rpage, roffnum));
            itemsz = IndexTupleDSize(*itup);
            itemsz = MAXALIGN(itemsz);

            /*
             * Walk up the bucket chain, looking for a page big enough for
             * this item.  Exit if we reach the read page.
             */
            while (PageGetFreeSpace(wpage) < itemsz)
            {
                Assert(!PageIsEmpty(wpage));

                /* fetch the forward link before wbuf is given up below */
                wblkno = wopaque->hasho_nextblkno;
                Assert(BlockNumberIsValid(wblkno));

                /* a write page that received tuples goes through _hash_wrtbuf */
                if (wbuf_dirty)
                    _hash_wrtbuf(rel, wbuf);
                else
                    _hash_relbuf(rel, wbuf);

                /* nothing more to do if we reached the read page */
                if (rblkno == wblkno)
                {
                    if (ndeletable > 0)
                    {
                        /* Delete tuples we already moved off read page */
                        PageIndexMultiDelete(rpage, deletable, ndeletable);
                        _hash_wrtbuf(rel, rbuf);
                    }
                    else
                        _hash_relbuf(rel, rbuf);
                    return;
                }

                wbuf = _hash_getbuf_with_strategy(rel,
                                                  wblkno,
                                                  HASH_WRITE,
                                                  LH_OVERFLOW_PAGE,
                                                  bstrategy);
                wpage = BufferGetPage(wbuf);
                wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
                Assert(wopaque->hasho_bucket == bucket);
                /* nothing has been added to the new write page yet */
                wbuf_dirty = false;
            }

            /*
             * we have found room so insert on the "write" page, being careful
             * to preserve hashkey ordering.  (If we insert many tuples into
             * the same "write" page it would be worth qsort'ing instead of
             * doing repeated _hash_pgaddtup.)
             */
            (void) _hash_pgaddtup(rel, wbuf, itemsz, itup);
            wbuf_dirty = true;

            /* remember tuple for deletion from "read" page */
            deletable[ndeletable++] = roffnum;
        }

        /*
         * If we reach here, there are no live tuples on the "read" page ---
         * it was empty when we got to it, or we moved them all.  So we can
         * just free the page without bothering with deleting tuples
         * individually.  Then advance to the previous "read" page.
         *
         * Tricky point here: if our read and write pages are adjacent in the
         * bucket chain, our write lock on wbuf will conflict with
         * _hash_freeovflpage's attempt to update the sibling links of the
         * removed page.  However, in that case we are done anyway, so we can
         * simply drop the write lock before calling _hash_freeovflpage.
         */
        /* read the back-link now; rbuf is handed to _hash_freeovflpage below */
        rblkno = ropaque->hasho_prevblkno;
        Assert(BlockNumberIsValid(rblkno));

        /* are we freeing the page adjacent to wbuf? */
        if (rblkno == wblkno)
        {
            /* yes, so release wbuf lock first */
            if (wbuf_dirty)
                _hash_wrtbuf(rel, wbuf);
            else
                _hash_relbuf(rel, wbuf);
            /* free this overflow page (releases rbuf) */
            _hash_freeovflpage(rel, rbuf, bstrategy);
            /* done */
            return;
        }

        /* free this overflow page, then get the previous one */
        _hash_freeovflpage(rel, rbuf, bstrategy);

        /*
         * The previous page is fetched as an overflow page: the primary page
         * is where wblkno starts, so that case was caught by the adjacency
         * test above.
         */
        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    }

    /* NOTREACHED: the loop above only exits via its return statements */
}
示例#18
0
文件: hash.c 项目: michaelpq/postgres
/*
 * hashbulkdelete -- bulk-delete index entries that point to a set of heap
 * tuples.
 *
 * The set of target tuples is not passed explicitly: 'callback' (invoked
 * with 'callback_state') tells whether any given heap tuple, identified by
 * ItemPointer, is being deleted.  Both are handed on to hashbucketcleanup,
 * which is called once per bucket.
 *
 * Tuples that were moved to another bucket by a split are deleted here as
 * well, provided the split of that bucket has finished.
 *
 * Result: 'stats' (palloc'd here when the caller passes NULL) containing
 * statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation	rel = info->index;
    double		removed = 0;
    double		live = 0;
    double		start_ntuples;
    Bucket		start_maxbucket;
    Bucket		known_maxbucket;
    Bucket		next_bucket;
    Buffer		metabuf;
    HashMetaPage metap;
    HashMetaPageData meta_copy;

    /*
     * Snapshot the metapage.  The copy supplies the original bucket and
     * tuple counts, and its hashm_spares[] values are later used to compute
     * bucket page addresses.  This is a bit hokey but perfectly safe, since
     * the interesting entries in the spares array cannot change under us;
     * and it beats rereading the metapage for each bucket.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));
    memcpy(&meta_copy, metap, sizeof(meta_copy));
    _hash_relbuf(rel, metabuf);

    start_maxbucket = meta_copy.hashm_maxbucket;
    start_ntuples = meta_copy.hashm_ntuples;

    /* Begin with the buckets that are known to exist right now */
    next_bucket = 0;
    known_maxbucket = start_maxbucket;

    /*
     * Each pass of the outer loop cleans every not-yet-visited bucket up to
     * known_maxbucket and then re-checks the metapage; the loop is left, with
     * the metapage still write-locked, once no further split has happened.
     */
    for (;;)
    {
        for (; next_bucket <= known_maxbucket; next_bucket++)
        {
            BlockNumber primary_blkno;
            Buffer		primary_buf;
            Page		primary_page;
            HashPageOpaque primary_opaque;
            bool		split_cleanup;

            /* Locate the bucket's primary page */
            primary_blkno = BUCKET_TO_BLKNO(&meta_copy, next_bucket);

            /*
             * Take a cleanup lock on the primary bucket page so that
             * concurrent scans are waited out before dead tuples are
             * deleted.
             */
            primary_buf = ReadBufferExtended(rel, MAIN_FORKNUM, primary_blkno,
                                             RBM_NORMAL, info->strategy);
            LockBufferForCleanup(primary_buf);
            _hash_checkpage(rel, primary_buf, LH_BUCKET_PAGE);

            primary_page = BufferGetPage(primary_buf);
            primary_opaque = (HashPageOpaque) PageGetSpecialPointer(primary_page);

            /*
             * Tuples that a split moved elsewhere need deleting too, but
             * only once the split of this bucket is finished: until then
             * scans still need them.
             */
            split_cleanup = (!H_BUCKET_BEING_SPLIT(primary_opaque) &&
                             H_NEEDS_SPLIT_CLEANUP(primary_opaque));

            hashbucketcleanup(rel, next_bucket, primary_buf, primary_blkno,
                              info->strategy,
                              meta_copy.hashm_maxbucket,
                              meta_copy.hashm_highmask,
                              meta_copy.hashm_lowmask,
                              &removed, &live,
                              split_cleanup,
                              callback, callback_state);

            _hash_dropbuf(rel, primary_buf);
        }

        /* Write-lock the metapage and see whether a split happened meanwhile */
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
        metap = HashPageGetMeta(BufferGetPage(metabuf));

        if (known_maxbucket == metap->hashm_maxbucket)
            break;

        /* There's been a split, so go around for the additional bucket(s) */
        known_maxbucket = metap->hashm_maxbucket;
        memcpy(&meta_copy, metap, sizeof(meta_copy));
        _hash_relbuf(rel, metabuf);
    }

    /* All buckets done.  Bring the metapage's tuple count up to date. */
    if (start_maxbucket != metap->hashm_maxbucket ||
        start_ntuples != metap->hashm_ntuples)
    {
        /*
         * Something was split or inserted during the scan, so our own count
         * is untrustworthy (tuples in split buckets may have been scanned
         * twice).  Proceed by dead-reckoning instead.  (Note: we still
         * return estimated_count = false, because using this count is
         * better than not updating reltuples at all.)
         */
        metap->hashm_ntuples = (metap->hashm_ntuples > removed)
            ? metap->hashm_ntuples - removed
            : 0;
        live = metap->hashm_ntuples;
    }
    else
    {
        /* Nothing changed under us, so believe our count as gospel. */
        metap->hashm_ntuples = live;
    }

    _hash_wrtbuf(rel, metabuf);

    /* hand back the statistics; hashvacuumcleanup will fill in num_pages */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = live;
    stats->tuples_removed += removed;

    return stats;
}
示例#19
0
/*
 * hashbulkdelete -- bulk-delete index entries that point to a set of heap
 * tuples.
 *
 * The set of target tuples is not passed explicitly: 'callback' (invoked
 * with 'callback_state') tells whether any given heap tuple, identified by
 * ItemPointer, is being deleted.
 *
 * Every bucket is exclusive-locked in turn, each of its pages is scanned,
 * and a bucket that lost tuples is compacted with _hash_squeezebucket.
 *
 * Result: 'stats' (palloc'd here when the caller passes NULL) containing
 * statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	double		removed = 0;
	double		live = 0;
	double		start_ntuples;
	Bucket		start_maxbucket;
	Bucket		known_maxbucket;
	Bucket		next_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPageData meta_copy;

	/*
	 * Snapshot the metapage.  The copy supplies the original bucket and
	 * tuple counts, and its hashm_spares[] values are later used to compute
	 * bucket page addresses.  This is a bit hokey but perfectly safe, since
	 * the interesting entries in the spares array cannot change under us;
	 * and it beats rereading the metapage for each bucket.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));
	memcpy(&meta_copy, metap, sizeof(meta_copy));
	_hash_relbuf(rel, metabuf);

	start_maxbucket = meta_copy.hashm_maxbucket;
	start_ntuples = meta_copy.hashm_ntuples;

	/* Begin with the buckets that are known to exist right now */
	next_bucket = 0;
	known_maxbucket = start_maxbucket;

	/*
	 * Each pass of the outer loop vacuums every not-yet-visited bucket up to
	 * known_maxbucket and then re-checks the metapage; the loop is left,
	 * with the metapage still write-locked, once no further split happened.
	 */
	for (;;)
	{
		for (; next_bucket <= known_maxbucket; next_bucket++)
		{
			BlockNumber primary_blkno;
			BlockNumber page_blkno;
			BlockNumber following_blkno;
			bool		bucket_dirty = false;

			/* Locate the bucket's primary page */
			primary_blkno = BUCKET_TO_BLKNO(&meta_copy, next_bucket);

			/* Exclusive-lock the bucket so we can shrink it */
			_hash_getlock(rel, primary_blkno, HASH_EXCLUSIVE);

			/* Shouldn't have any active scans locally, either */
			if (_hash_has_active_scan(rel, next_bucket))
				elog(ERROR, "hash index has active scan during VACUUM");

			/* Visit the primary page and then each overflow page in turn */
			for (page_blkno = primary_blkno;
				 BlockNumberIsValid(page_blkno);
				 page_blkno = following_blkno)
			{
				Buffer		buf;
				Page		page;
				HashPageOpaque opaque;
				OffsetNumber off;
				OffsetNumber last_off;
				OffsetNumber doomed[MaxOffsetNumber];
				int			ndoomed = 0;

				vacuum_delay_point();

				buf = _hash_getbuf_with_strategy(rel, page_blkno, HASH_WRITE,
										   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
												 info->strategy);
				page = BufferGetPage(buf);
				opaque = (HashPageOpaque) PageGetSpecialPointer(page);
				Assert(opaque->hasho_bucket == next_bucket);

				/* Ask the callback about every tuple on the page */
				last_off = PageGetMaxOffsetNumber(page);
				for (off = FirstOffsetNumber;
					 off <= last_off;
					 off = OffsetNumberNext(off))
				{
					IndexTuple	itup;

					itup = (IndexTuple) PageGetItem(page,
													PageGetItemId(page, off));

					if (!callback(&(itup->t_tid), callback_state))
					{
						/* heap tuple survives; count the index entry */
						live += 1;
						continue;
					}

					/* note the item for removal once the scan is done */
					doomed[ndoomed++] = off;
					removed += 1;
				}

				/* pick up the chain link before the buffer is given up */
				following_blkno = opaque->hasho_nextblkno;

				/* Apply deletions and write page only if something changed */
				if (ndoomed == 0)
					_hash_relbuf(rel, buf);
				else
				{
					PageIndexMultiDelete(page, doomed, ndoomed);
					_hash_wrtbuf(rel, buf);
					bucket_dirty = true;
				}
			}

			/* If we deleted anything, try to compact free space */
			if (bucket_dirty)
				_hash_squeezebucket(rel, next_bucket, primary_blkno,
									info->strategy);

			/* Release bucket lock */
			_hash_droplock(rel, primary_blkno, HASH_EXCLUSIVE);
		}

		/* Write-lock the metapage and see whether a split happened meanwhile */
		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
		metap = HashPageGetMeta(BufferGetPage(metabuf));

		if (known_maxbucket == metap->hashm_maxbucket)
			break;

		/* There's been a split, so go around for the additional bucket(s) */
		known_maxbucket = metap->hashm_maxbucket;
		memcpy(&meta_copy, metap, sizeof(meta_copy));
		_hash_relbuf(rel, metabuf);
	}

	/* All buckets done.  Bring the metapage's tuple count up to date. */
	if (start_maxbucket != metap->hashm_maxbucket ||
		start_ntuples != metap->hashm_ntuples)
	{
		/*
		 * Something was split or inserted during the scan, so our own count
		 * is untrustworthy (tuples in split buckets may have been scanned
		 * twice).  Proceed by dead-reckoning instead.  (Note: we still
		 * return estimated_count = false, because using this count is
		 * better than not updating reltuples at all.)
		 */
		metap->hashm_ntuples = (metap->hashm_ntuples > removed)
			? metap->hashm_ntuples - removed
			: 0;
		live = metap->hashm_ntuples;
	}
	else
	{
		/* Nothing changed under us, so believe our count as gospel. */
		metap->hashm_ntuples = live;
	}

	_hash_wrtbuf(rel, metabuf);

	/* hand back the statistics; hashvacuumcleanup will fill in num_pages */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	stats->estimated_count = false;
	stats->num_index_tuples = live;
	stats->tuples_removed += removed;

	return stats;
}
示例#20
0
文件: hash.c 项目: sunyangkobe/cscd43
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashbulkdelete(PG_FUNCTION_ARGS)
{
	Relation	rel = (Relation) PG_GETARG_POINTER(0);
	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
	void	   *callback_state = (void *) PG_GETARG_POINTER(2);
	IndexBulkDeleteResult *result;
	BlockNumber num_pages;
	double		tuples_removed;
	double		num_index_tuples;
	double		orig_ntuples;
	Bucket		orig_maxbucket;
	Bucket		cur_maxbucket;
	Bucket		cur_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPageData local_metapage;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * Read the metapage to fetch original bucket and tuple counts.  Also,
	 * we keep a copy of the last-seen metapage so that we can use its
	 * hashm_spares[] values to compute bucket page addresses.  This is a
	 * bit hokey but perfectly safe, since the interesting entries in the
	 * spares array cannot change under us; and it beats rereading the
	 * metapage for each bucket.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);
	orig_maxbucket = metap->hashm_maxbucket;
	orig_ntuples = metap->hashm_ntuples;
	memcpy(&local_metapage, metap, sizeof(local_metapage));
	_hash_relbuf(rel, metabuf);

	/* Scan the buckets that we know exist */
	cur_bucket = 0;
	cur_maxbucket = orig_maxbucket;

loop_top:
	while (cur_bucket <= cur_maxbucket)
	{
		BlockNumber bucket_blkno;
		BlockNumber blkno;
		bool		bucket_dirty = false;

		/* Get address of bucket's start page */
		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

		/* Exclusive-lock the bucket so we can shrink it */
		_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Shouldn't have any active scans locally, either */
		if (_hash_has_active_scan(rel, cur_bucket))
			elog(ERROR, "hash index has active scan during VACUUM");

		/* Scan each page in bucket */
		blkno = bucket_blkno;
		while (BlockNumberIsValid(blkno))
		{
			Buffer		buf;
			Page		page;
			HashPageOpaque opaque;
			OffsetNumber offno;
			OffsetNumber maxoffno;
			bool		page_dirty = false;

			buf = _hash_getbuf(rel, blkno, HASH_WRITE);
			page = BufferGetPage(buf);
			_hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
			Assert(opaque->hasho_bucket == cur_bucket);

			/* Scan each tuple in page */
			offno = FirstOffsetNumber;
			maxoffno = PageGetMaxOffsetNumber(page);
			while (offno <= maxoffno)
			{
				HashItem	hitem;
				ItemPointer htup;

				hitem = (HashItem) PageGetItem(page,
											   PageGetItemId(page, offno));
				htup = &(hitem->hash_itup.t_tid);
				if (callback(htup, callback_state))
				{
					/* delete the item from the page */
					PageIndexTupleDelete(page, offno);
					bucket_dirty = page_dirty = true;

					/* don't increment offno, instead decrement maxoffno */
					maxoffno = OffsetNumberPrev(maxoffno);

					tuples_removed += 1;
				}
				else
				{
					offno = OffsetNumberNext(offno);

					num_index_tuples += 1;
				}
			}

			/*
			 * Write page if needed, advance to next page.
			 */
			blkno = opaque->hasho_nextblkno;

			if (page_dirty)
				_hash_wrtbuf(rel, buf);
			else
				_hash_relbuf(rel, buf);
		}

		/* If we deleted anything, try to compact free space */
		if (bucket_dirty)
			_hash_squeezebucket(rel, cur_bucket, bucket_blkno);

		/* Release bucket lock */
		_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Advance to next bucket */
		cur_bucket++;
	}

	/* Write-lock metapage and check for split since we started */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);

	if (cur_maxbucket != metap->hashm_maxbucket)
	{
		/* There's been a split, so process the additional bucket(s) */
		cur_maxbucket = metap->hashm_maxbucket;
		memcpy(&local_metapage, metap, sizeof(local_metapage));
		_hash_relbuf(rel, metabuf);
		goto loop_top;
	}

	/* Okay, we're really done.  Update tuple count in metapage. */

	if (orig_maxbucket == metap->hashm_maxbucket &&
		orig_ntuples == metap->hashm_ntuples)
	{
		/*
		 * No one has split or inserted anything since start of scan,
		 * so believe our count as gospel.
		 */
		metap->hashm_ntuples = num_index_tuples;
	}
	else
	{
		/*
		 * Otherwise, our count is untrustworthy since we may have
		 * double-scanned tuples in split buckets.  Proceed by
		 * dead-reckoning.
		 */
		if (metap->hashm_ntuples > tuples_removed)
			metap->hashm_ntuples -= tuples_removed;
		else
			metap->hashm_ntuples = 0;
		num_index_tuples = metap->hashm_ntuples;
	}

	_hash_wrtbuf(rel, metabuf);

	/* return statistics */
	num_pages = RelationGetNumberOfBlocks(rel);

	result = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	result->num_pages = num_pages;
	result->num_index_tuples = num_index_tuples;
	result->tuples_removed = tuples_removed;

	PG_RETURN_POINTER(result);
}