Beispiel #1
0
/*
 *	_hash_get_newblock_from_oldbucket() -- get the block number of a bucket
 *			that will be generated after split from old bucket.
 *
 * This is used to find the new bucket from old bucket based on current table
 * half.  It is mainly required to finish the incomplete splits where we are
 * sure that not more than one bucket could have split in progress from old
 * bucket.
 */
BlockNumber
_hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
{
	Bucket		new_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;

	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
													metap->hashm_lowmask,
													metap->hashm_maxbucket);
	blkno = BUCKET_TO_BLKNO(metap, new_bucket);

	_hash_relbuf(rel, metabuf);

	return blkno;
}
Beispiel #2
0
/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * It would be very bad if this function cleaned a page while some other
 * backend was in the midst of scanning it, because hashgettuple assumes
 * that the next valid TID will be greater than or equal to the current
 * valid TID.  There can't be any concurrent scans in progress when we first
 * enter this function because of the cleanup lock we hold on the primary
 * bucket page, but as soon as we release that lock, there might be.  We
 * handle that by conspiring to prevent those scans from passing our cleanup
 * scan.  To do that, we lock the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
				  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
				  double *tuples_removed, double *num_index_tuples,
				  bool split_cleanup,
				  IndexBulkDeleteCallback callback, void *callback_state)
{
	BlockNumber blkno;
	Buffer		buf;
	Bucket		new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
	bool		bucket_dirty = false;

	blkno = bucket_blkno;
	buf = bucket_buf;

	if (split_cleanup)
		new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
														lowmask, maxbucket);

	/* Scan each page in bucket */
	for (;;)
	{
		HashPageOpaque opaque;
		OffsetNumber offno;
		OffsetNumber maxoffno;
		Buffer		next_buf;
		Page		page;
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable = 0;
		bool		retain_pin = false;
		bool		clear_dead_marking = false;

		vacuum_delay_point();

		page = BufferGetPage(buf);
		opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		/* Scan each tuple in page */
		maxoffno = PageGetMaxOffsetNumber(page);
		for (offno = FirstOffsetNumber;
			 offno <= maxoffno;
			 offno = OffsetNumberNext(offno))
		{
			ItemPointer htup;
			IndexTuple	itup;
			Bucket		bucket;
			bool		kill_tuple = false;

			itup = (IndexTuple) PageGetItem(page,
											PageGetItemId(page, offno));
			htup = &(itup->t_tid);

			/*
			 * To remove the dead tuples, we strictly want to rely on results
			 * of callback function.  refer btvacuumpage for detailed reason.
			 */
			if (callback && callback(htup, callback_state))
			{
				kill_tuple = true;
				if (tuples_removed)
					*tuples_removed += 1;
			}
			else if (split_cleanup)
			{
				/* delete the tuples that are moved by split. */
				bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
											  maxbucket,
											  highmask,
											  lowmask);
				/* mark the item for deletion */
				if (bucket != cur_bucket)
				{
					/*
					 * We expect tuples to either belong to current bucket or
					 * new_bucket.  This is ensured because we don't allow
					 * further splits from bucket that contains garbage. See
					 * comments in _hash_expandtable.
					 */
					Assert(bucket == new_bucket);
					kill_tuple = true;
				}
			}

			if (kill_tuple)
			{
				/* mark the item for deletion */
				deletable[ndeletable++] = offno;
			}
			else
			{
				/* we're keeping it, so count it */
				if (num_index_tuples)
					*num_index_tuples += 1;
			}
		}

		/* retain the pin on primary bucket page till end of bucket scan */
		if (blkno == bucket_blkno)
			retain_pin = true;
		else
			retain_pin = false;

		blkno = opaque->hasho_nextblkno;

		/*
		 * Apply deletions, advance to next page and write page if needed.
		 */
		if (ndeletable > 0)
		{
			/* No ereport(ERROR) until changes are logged */
			START_CRIT_SECTION();

			PageIndexMultiDelete(page, deletable, ndeletable);
			bucket_dirty = true;

			/*
			 * Let us mark the page as clean if vacuum removes the DEAD tuples
			 * from an index page. We do this by clearing
			 * LH_PAGE_HAS_DEAD_TUPLES flag.
			 */
			if (tuples_removed && *tuples_removed > 0 &&
				H_HAS_DEAD_TUPLES(opaque))
			{
				opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
				clear_dead_marking = true;
			}

			MarkBufferDirty(buf);

			/* XLOG stuff */
			if (RelationNeedsWAL(rel))
			{
				xl_hash_delete xlrec;
				XLogRecPtr	recptr;

				xlrec.clear_dead_marking = clear_dead_marking;
				xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;

				XLogBeginInsert();
				XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

				/*
				 * bucket buffer needs to be registered to ensure that we can
				 * acquire a cleanup lock on it during replay.
				 */
				if (!xlrec.is_primary_bucket_page)
					XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

				XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
				XLogRegisterBufData(1, (char *) deletable,
									ndeletable * sizeof(OffsetNumber));

				recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
				PageSetLSN(BufferGetPage(buf), recptr);
			}

			END_CRIT_SECTION();
		}

		/* bail out if there are no more pages to scan. */
		if (!BlockNumberIsValid(blkno))
			break;

		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
											  LH_OVERFLOW_PAGE,
											  bstrategy);

		/*
		 * release the lock on previous page after acquiring the lock on next
		 * page
		 */
		if (retain_pin)
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		else
			_hash_relbuf(rel, buf);

		buf = next_buf;
	}

	/*
	 * lock the bucket page to clear the garbage flag and squeeze the bucket.
	 * if the current buffer is same as bucket buffer, then we already have
	 * lock on bucket page.
	 */
	if (buf != bucket_buf)
	{
		_hash_relbuf(rel, buf);
		LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
	}

	/*
	 * Clear the garbage flag from bucket after deleting the tuples that are
	 * moved by split.  We purposefully clear the flag before squeeze bucket,
	 * so that after restart, vacuum shouldn't again try to delete the moved
	 * by split tuples.
	 */
	if (split_cleanup)
	{
		HashPageOpaque bucket_opaque;
		Page		page;

		page = BufferGetPage(bucket_buf);
		bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
		MarkBufferDirty(bucket_buf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			XLogRecPtr	recptr;

			XLogBeginInsert();
			XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
			PageSetLSN(page, recptr);
		}

		END_CRIT_SECTION();
	}

	/*
	 * If we have deleted anything, try to compact free space.  For squeezing
	 * the bucket, we must have a cleanup lock, else it can impact the
	 * ordering of tuples for a scan that has started before it.
	 */
	if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
		_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
							bstrategy);
	else
		LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}
Beispiel #3
0
/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * It would be very bad if this function cleaned a page while some other
 * backend was in the midst of scanning it, because hashgettuple assumes
 * that the next valid TID will be greater than or equal to the current
 * valid TID.  There can't be any concurrent scans in progress when we first
 * enter this function because of the cleanup lock we hold on the primary
 * bucket page, but as soon as we release that lock, there might be.  We
 * handle that by conspiring to prevent those scans from passing our cleanup
 * scan.  To do that, we lock the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
{
    BlockNumber blkno;
    Buffer		buf;
    Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool		bucket_dirty = false;

    blkno = bucket_blkno;
    buf = bucket_buf;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                     lowmask, maxbucket);

    /* Scan each page in bucket */
    for (;;)
    {
        HashPageOpaque opaque;
        OffsetNumber offno;
        OffsetNumber maxoffno;
        Buffer		next_buf;
        Page		page;
        OffsetNumber deletable[MaxOffsetNumber];
        int			ndeletable = 0;
        bool		retain_pin = false;
        bool		curr_page_dirty = false;

        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
                offno <= maxoffno;
                offno = OffsetNumberNext(offno))
        {
            ItemPointer htup;
            IndexTuple	itup;
            Bucket		bucket;
            bool		kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To remove the dead tuples, we strictly want to rely on results
             * of callback function.  refer btvacuumpage for detailed reason.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that are moved by split. */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask,
                                              lowmask);
                /* mark the item for deletion */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to either belong to curent bucket or
                     * new_bucket.  This is ensured because we don't allow
                     * further splits from bucket that contains garbage. See
                     * comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;
        else
            retain_pin = false;

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;
            curr_page_dirty = true;
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);

        /*
         * release the lock on previous page after acquiring the lock on next
         * page
         */
        if (curr_page_dirty)
        {
            if (retain_pin)
                _hash_chgbufaccess(rel, buf, HASH_WRITE, HASH_NOLOCK);
            else
                _hash_wrtbuf(rel, buf);
            curr_page_dirty = false;
        }
        else if (retain_pin)
            _hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;
    }

    /*
     * lock the bucket page to clear the garbage flag and squeeze the bucket.
     * if the current buffer is same as bucket buffer, then we already have
     * lock on bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        _hash_chgbufaccess(rel, bucket_buf, HASH_NOLOCK, HASH_WRITE);
    }

    /*
     * Clear the garbage flag from bucket after deleting the tuples that are
     * moved by split.  We purposefully clear the flag before squeeze bucket,
     * so that after restart, vacuum shouldn't again try to delete the moved
     * by split tuples.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;
        Page		page;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    }

    /*
     * If we have deleted anything, try to compact free space.  For squeezing
     * the bucket, we must have a cleanup lock, else it can impact the
     * ordering of tuples for a scan that has started before it.
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        _hash_chgbufaccess(rel, bucket_buf, HASH_WRITE, HASH_NOLOCK);
}