Example #1
/*
 *	hashendscan() -- close down a scan
 */
Datum
hashendscan(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	/* don't need scan registered anymore */
	_hash_dropscan(scan);

	/* release any pin we still hold */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_dropbuf(rel, so->hashso_curbuf);
	so->hashso_curbuf = InvalidBuffer;

	/* release lock on bucket, too */
	if (so->hashso_bucket_blkno)
		_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
	so->hashso_bucket_blkno = 0;

	pfree(so);
	scan->opaque = NULL;

	PG_RETURN_VOID();
}
Example #2
/*
 *	hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		   ScanKey orderbys, int norderbys)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	/* release any pin we still hold */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_dropbuf(rel, so->hashso_curbuf);
	so->hashso_curbuf = InvalidBuffer;

	/* release lock on bucket, too */
	if (so->hashso_bucket_blkno)
		_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
	so->hashso_bucket_blkno = 0;

	/* set position invalid (this will cause _hash_first call) */
	ItemPointerSetInvalid(&(so->hashso_curpos));
	ItemPointerSetInvalid(&(so->hashso_heappos));

	/* Update scan key, if a new one is given */
	if (scankey && scan->numberOfKeys > 0)
	{
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
		so->hashso_bucket_valid = false;
	}
}
Example #3
/*
 *	hashrescan() -- rescan an index relation
 */
Datum
hashrescan(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanKey		scankey = (ScanKey) PG_GETARG_POINTER(1);

	/* remaining arguments are ignored */
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	/* release any pin we still hold */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_dropbuf(rel, so->hashso_curbuf);
	so->hashso_curbuf = InvalidBuffer;

	/* release lock on bucket, too */
	if (so->hashso_bucket_blkno)
		_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
	so->hashso_bucket_blkno = 0;

	/* set position invalid (this will cause _hash_first call) */
	ItemPointerSetInvalid(&(so->hashso_curpos));
	ItemPointerSetInvalid(&(so->hashso_heappos));

	/* Update scan key, if a new one is given */
	if (scankey && scan->numberOfKeys > 0)
	{
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
		so->hashso_bucket_valid = false;
	}

	PG_RETURN_VOID();
}
Example #4
/*
 *	hashrescan() -- rescan an index relation
 */
Datum
hashrescan(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanKey		scankey = (ScanKey) PG_GETARG_POINTER(1);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	/* if we are called from beginscan, so is still NULL */
	if (so)
	{
		/* release any pins we still hold */
		if (BufferIsValid(so->hashso_curbuf))
			_hash_dropbuf(rel, so->hashso_curbuf);
		so->hashso_curbuf = InvalidBuffer;

		if (BufferIsValid(so->hashso_mrkbuf))
			_hash_dropbuf(rel, so->hashso_mrkbuf);
		so->hashso_mrkbuf = InvalidBuffer;

		/* release lock on bucket, too */
		if (so->hashso_bucket_blkno)
			_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
		so->hashso_bucket_blkno = 0;
	}

	/* set positions invalid (this will cause _hash_first call) */
	ItemPointerSetInvalid(&(scan->currentItemData));
	ItemPointerSetInvalid(&(scan->currentMarkData));

	/* Update scan key, if a new one is given */
	if (scankey && scan->numberOfKeys > 0)
	{
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
		if (so)
			so->hashso_bucket_valid = false;
	}

	PG_RETURN_VOID();
}
Example #5
/*
 * pgstat_hash_page -- check tuples in a hash page
 */
static void
pgstat_hash_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
				 BufferAccessStrategy bstrategy)
{
	Buffer		buf;
	Page		page;

	_hash_getlock(rel, blkno, HASH_SHARE);
	buf = _hash_getbuf_with_strategy(rel, blkno, HASH_READ, 0, bstrategy);
	page = BufferGetPage(buf);

	if (PageGetSpecialSize(page) == MAXALIGN(sizeof(HashPageOpaqueData)))
	{
		HashPageOpaque opaque;

		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
		switch (opaque->hasho_flag)
		{
			case LH_UNUSED_PAGE:
				stat->free_space += BLCKSZ;
				break;
			case LH_BUCKET_PAGE:
			case LH_OVERFLOW_PAGE:
				pgstat_index_page(stat, page, FirstOffsetNumber,
								  PageGetMaxOffsetNumber(page));
				break;
			case LH_BITMAP_PAGE:
			case LH_META_PAGE:
			default:
				break;
		}
	}
	else
	{
		/* maybe corrupted */
	}

	_hash_relbuf(rel, buf);
	_hash_droplock(rel, blkno, HASH_SHARE);
}
Example #6
/*
 *	hashendscan() -- close down a scan
 */
void
hashendscan(IndexScanDesc scan)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	/* don't need scan registered anymore */
	_hash_dropscan(scan);

	/* release any pin we still hold */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_dropbuf(rel, so->hashso_curbuf);
	so->hashso_curbuf = InvalidBuffer;

	/* release lock on bucket, too */
	if (so->hashso_bucket_blkno)
		_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
	so->hashso_bucket_blkno = 0;

	pfree(so);
	scan->opaque = NULL;
}
Example #7
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			   IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	double		tuples_removed;
	double		num_index_tuples;
	double		orig_ntuples;
	Bucket		orig_maxbucket;
	Bucket		cur_maxbucket;
	Bucket		cur_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPageData local_metapage;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * Read the metapage to fetch original bucket and tuple counts.  Also, we
	 * keep a copy of the last-seen metapage so that we can use its
	 * hashm_spares[] values to compute bucket page addresses.  This is a bit
	 * hokey but perfectly safe, since the interesting entries in the spares
	 * array cannot change under us; and it beats rereading the metapage for
	 * each bucket.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));
	orig_maxbucket = metap->hashm_maxbucket;
	orig_ntuples = metap->hashm_ntuples;
	memcpy(&local_metapage, metap, sizeof(local_metapage));
	_hash_relbuf(rel, metabuf);

	/* Scan the buckets that we know exist */
	cur_bucket = 0;
	cur_maxbucket = orig_maxbucket;

loop_top:
	while (cur_bucket <= cur_maxbucket)
	{
		BlockNumber bucket_blkno;
		BlockNumber blkno;
		bool		bucket_dirty = false;

		/* Get address of bucket's start page */
		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

		/* Exclusive-lock the bucket so we can shrink it */
		_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Shouldn't have any active scans locally, either */
		if (_hash_has_active_scan(rel, cur_bucket))
			elog(ERROR, "hash index has active scan during VACUUM");

		/* Scan each page in bucket */
		blkno = bucket_blkno;
		while (BlockNumberIsValid(blkno))
		{
			Buffer		buf;
			Page		page;
			HashPageOpaque opaque;
			OffsetNumber offno;
			OffsetNumber maxoffno;
			OffsetNumber deletable[MaxOffsetNumber];
			int			ndeletable = 0;

			vacuum_delay_point();

			buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
										   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
											 info->strategy);
			page = BufferGetPage(buf);
			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
			Assert(opaque->hasho_bucket == cur_bucket);

			/* Scan each tuple in page */
			maxoffno = PageGetMaxOffsetNumber(page);
			for (offno = FirstOffsetNumber;
				 offno <= maxoffno;
				 offno = OffsetNumberNext(offno))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offno));
				htup = &(itup->t_tid);
				if (callback(htup, callback_state))
				{
					/* mark the item for deletion */
					deletable[ndeletable++] = offno;
					tuples_removed += 1;
				}
				else
					num_index_tuples += 1;
			}

			/*
			 * Apply deletions and write page if needed, advance to next page.
			 */
			blkno = opaque->hasho_nextblkno;

			if (ndeletable > 0)
			{
				PageIndexMultiDelete(page, deletable, ndeletable);
				_hash_wrtbuf(rel, buf);
				bucket_dirty = true;
			}
			else
				_hash_relbuf(rel, buf);
		}

		/* If we deleted anything, try to compact free space */
		if (bucket_dirty)
			_hash_squeezebucket(rel, cur_bucket, bucket_blkno,
								info->strategy);

		/* Release bucket lock */
		_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Advance to next bucket */
		cur_bucket++;
	}

	/* Write-lock metapage and check for split since we started */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	if (cur_maxbucket != metap->hashm_maxbucket)
	{
		/* There's been a split, so process the additional bucket(s) */
		cur_maxbucket = metap->hashm_maxbucket;
		memcpy(&local_metapage, metap, sizeof(local_metapage));
		_hash_relbuf(rel, metabuf);
		goto loop_top;
	}

	/* Okay, we're really done.  Update tuple count in metapage. */

	if (orig_maxbucket == metap->hashm_maxbucket &&
		orig_ntuples == metap->hashm_ntuples)
	{
		/*
		 * No one has split or inserted anything since start of scan, so
		 * believe our count as gospel.
		 */
		metap->hashm_ntuples = num_index_tuples;
	}
	else
	{
		/*
		 * Otherwise, our count is untrustworthy since we may have
		 * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
		 * (Note: we still return estimated_count = false, because using this
		 * count is better than not updating reltuples at all.)
		 */
		if (metap->hashm_ntuples > tuples_removed)
			metap->hashm_ntuples -= tuples_removed;
		else
			metap->hashm_ntuples = 0;
		num_index_tuples = metap->hashm_ntuples;
	}

	_hash_wrtbuf(rel, metabuf);

	/* return statistics */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	stats->estimated_count = false;
	stats->num_index_tuples = num_index_tuples;
	stats->tuples_removed += tuples_removed;
	/* hashvacuumcleanup will fill in num_pages */

	return stats;
}
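
The callback contract used above (one yes/no answer per heap TID) can be illustrated with a standalone toy analogue. This is a sketch only: struct toy_tid, struct dead_tids, and is_being_deleted are invented names for the illustration, not PostgreSQL types, and the real code passes an ItemPointer plus an opaque callback_state instead.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy heap tuple identifier and a "set of dead tuples" state. */
struct toy_tid
{
	unsigned	block;
	unsigned	offset;
};

struct dead_tids
{
	const struct toy_tid *tids;
	size_t		ntids;
};

/* Toy callback: report whether this tuple is in the dead-tuple set. */
static bool
is_being_deleted(const struct toy_tid *tid, void *state)
{
	const struct dead_tids *dead = state;

	for (size_t i = 0; i < dead->ntids; i++)
		if (dead->tids[i].block == tid->block &&
			dead->tids[i].offset == tid->offset)
			return true;
	return false;
}

int
main(void)
{
	const struct toy_tid dead[] = {{7, 2}, {9, 5}};
	struct dead_tids state = {dead, 2};
	struct toy_tid probe = {9, 5};

	/* The bulk-delete loop would mark this index entry for deletion. */
	printf("%d\n", is_being_deleted(&probe, &state));	/* prints 1 */
	return 0;
}
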
Example #8
/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		Find the first item in the index that
 *		satisfies the qualification associated with the scan descriptor. On
 *		success, the page containing the current index tuple is read locked
 *		and pinned, and the scan's opaque data entry is updated to
 *		include the buffer.
 */
bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	uint32		hashkey;
	Bucket		bucket;
	BlockNumber blkno;
	Buffer		buf;
	Buffer		metabuf;
	Page		page;
	HashPageOpaque opaque;
	HashMetaPage metap;
	IndexTuple	itup;
	ItemPointer current;
	OffsetNumber offnum;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	pgstat_count_index_scan(rel);

	current = &(scan->currentItemData);
	ItemPointerSetInvalid(current);

	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket. That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
	if (scan->numberOfKeys < 1)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("hash indexes do not support whole-index scans")));

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
	if (scan->keyData[0].sk_flags & SK_ISNULL)
		return false;

	/*
	 * Okay to compute the hash key.  We want to do this before acquiring any
	 * locks, in case a user-defined hash function happens to be slow.
	 */
	hashkey = _hash_datum2hashkey(rel, scan->keyData[0].sk_argument);

	/*
	 * Acquire shared split lock so we can compute the target bucket safely
	 * (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
	_hash_checkpage(rel, metabuf, LH_META_PAGE);
	metap = (HashMetaPage) BufferGetPage(metabuf);

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* done with the metapage */
	_hash_relbuf(rel, metabuf);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);

	_hash_droplock(rel, 0, HASH_SHARE);

	/* Update scan opaque state to show we have lock on the bucket */
	so->hashso_bucket = bucket;
	so->hashso_bucket_valid = true;
	so->hashso_bucket_blkno = blkno;

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_READ);
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->hasho_bucket == bucket);

	/* If a backwards scan is requested, move to the end of the chain */
	if (ScanDirectionIsBackward(dir))
	{
		while (BlockNumberIsValid(opaque->hasho_nextblkno))
			_hash_readnext(rel, &buf, &page, &opaque);
	}

	/* Now find the first tuple satisfying the qualification */
	if (!_hash_step(scan, &buf, dir))
		return false;

	/* if we're here, _hash_step found a valid tuple */
	offnum = ItemPointerGetOffsetNumber(current);
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	scan->xs_ctup.t_self = itup->t_tid;

	return true;
}
Example #9
/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		Find the first item in the index that
 *		satisfies the qualification associated with the scan descriptor. On
 *		success, the page containing the current index tuple is read locked
 *		and pinned, and the scan's opaque data entry is updated to
 *		include the buffer.
 */
bool
_hash_first(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ScanKey		cur;
	uint32		hashkey;
	Bucket		bucket;
	BlockNumber blkno;
	Buffer		buf;
	Buffer		metabuf;
	Page		page;
	HashPageOpaque opaque;
	HashMetaPage metap;
	IndexTuple	itup;
	ItemPointer current;
	OffsetNumber offnum;

	pgstat_count_index_scan(rel);

	current = &(so->hashso_curpos);
	ItemPointerSetInvalid(current);

	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket. That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
	if (scan->numberOfKeys < 1)
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("hash indexes do not support whole-index scans")));

	/* There may be more than one index qual, but we hash only the first */
	cur = &scan->keyData[0];

	/* We support only single-column hash indexes */
	Assert(cur->sk_attno == 1);
	/* And there's only one operator strategy, too */
	Assert(cur->sk_strategy == HTEqualStrategyNumber);

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
	if (cur->sk_flags & SK_ISNULL)
		return false;

	/*
	 * Okay to compute the hash key.  We want to do this before acquiring any
	 * locks, in case a user-defined hash function happens to be slow.
	 *
	 * If scankey operator is not a cross-type comparison, we can use the
	 * cached hash function; otherwise gotta look it up in the catalogs.
	 *
	 * We support the convention that sk_subtype == InvalidOid means the
	 * opclass input type; this is a hack to simplify life for ScanKeyInit().
	 */
	if (cur->sk_subtype == rel->rd_opcintype[0] ||
		cur->sk_subtype == InvalidOid)
		hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
	else
		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
										   cur->sk_subtype);

	so->hashso_sk_hash = hashkey;

	/*
	 * Acquire shared split lock so we can compute the target bucket safely
	 * (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* done with the metapage */
	_hash_relbuf(rel, metabuf);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);

	_hash_droplock(rel, 0, HASH_SHARE);

	/* Update scan opaque state to show we have lock on the bucket */
	so->hashso_bucket = bucket;
	so->hashso_bucket_valid = true;
	so->hashso_bucket_blkno = blkno;

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->hasho_bucket == bucket);

	/* If a backwards scan is requested, move to the end of the chain */
	if (ScanDirectionIsBackward(dir))
	{
		while (BlockNumberIsValid(opaque->hasho_nextblkno))
			_hash_readnext(rel, &buf, &page, &opaque);
	}

	/* Now find the first tuple satisfying the qualification */
	if (!_hash_step(scan, &buf, dir))
		return false;

	/* if we're here, _hash_step found a valid tuple */
	offnum = ItemPointerGetOffsetNumber(current);
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
	so->hashso_heappos = itup->t_tid;

	return true;
}
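
The "compute the target bucket number" step above hinges on the two masks kept in the metapage. Below is a minimal standalone sketch of that masking; hashkey_to_bucket is a hypothetical stand-in mirroring the values _hash_hashkey2bucket is given, and plain stdint types replace the PostgreSQL ones.

#include <stdint.h>
#include <stdio.h>

/*
 * Mask the hash key with highmask; if that lands beyond the highest bucket
 * that currently exists, fall back to the previous (smaller) mask.
 */
static uint32_t
hashkey_to_bucket(uint32_t hashkey, uint32_t maxbucket,
				  uint32_t highmask, uint32_t lowmask)
{
	uint32_t	bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket &= lowmask;
	return bucket;
}

int
main(void)
{
	/* Illustrative table part-way through a doubling: buckets 0..5 exist. */
	uint32_t	maxbucket = 5,
				highmask = 7,
				lowmask = 3;

	/* 13 & 7 = 5, which exists, so bucket 5. */
	printf("%u\n", (unsigned) hashkey_to_bucket(13, maxbucket, highmask, lowmask));
	/* 14 & 7 = 6, which does not exist yet, so wrap with lowmask to 2. */
	printf("%u\n", (unsigned) hashkey_to_bucket(14, maxbucket, highmask, lowmask));
	return 0;
}
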
Example #10
/*
 * Attempt to expand the hash table by creating one new bucket.
 *
 * This will silently do nothing if it cannot get the needed locks.
 *
 * The caller should hold no locks on the hash index.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
{
	HashMetaPage metap;
	Bucket		old_bucket;
	Bucket		new_bucket;
	uint32		spare_ndx;
	BlockNumber start_oblkno;
	BlockNumber start_nblkno;
	uint32		maxbucket;
	uint32		highmask;
	uint32		lowmask;

	/*
	 * Obtain the page-zero lock to assert the right to begin a split
	 * (see README).
	 *
	 * Note: deadlock should be impossible here. Our own backend could only
	 * be holding bucket sharelocks due to stopped indexscans; those will not
	 * block other holders of the page-zero lock, who are only interested in
	 * acquiring bucket sharelocks themselves.  Exclusive bucket locks are
	 * only taken here and in hashbulkdelete, and neither of these operations
	 * needs any additional locks to complete.  (If, due to some flaw in this
	 * reasoning, we manage to deadlock anyway, it's okay to error out; the
	 * index will be left in a consistent state.)
	 */
	_hash_getlock(rel, 0, HASH_EXCLUSIVE);

	/* Write-lock the meta page */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);

	/*
	 * Check to see if split is still needed; someone else might have already
	 * done one while we waited for the lock.
	 *
	 * Make sure this stays in sync with _hash_doinsert()
	 */
	if (metap->hashm_ntuples <=
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
		goto fail;

	/*
	 * Determine which bucket is to be split, and attempt to lock the old
	 * bucket.  If we can't get the lock, give up.
	 *
	 * The lock protects us against other backends, but not against our own
	 * backend.  Must check for active scans separately.
	 *
	 * Ideally we would lock the new bucket too before proceeding, but if
	 * we are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping
	 * isn't correct yet.  For simplicity we update the metapage first and
	 * then lock.  This should be okay because no one else should be trying
	 * to lock the new bucket yet...
	 */
	new_bucket = metap->hashm_maxbucket + 1;
	old_bucket = (new_bucket & metap->hashm_lowmask);

	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);

	if (_hash_has_active_scan(rel, old_bucket))
		goto fail;

	if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
		goto fail;

	/*
	 * Okay to proceed with split.  Update the metapage bucket mapping info.
	 */
	metap->hashm_maxbucket = new_bucket;

	if (new_bucket > metap->hashm_highmask)
	{
		/* Starting a new doubling */
		metap->hashm_lowmask = metap->hashm_highmask;
		metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
	}

	/*
	 * If the split point is increasing (hashm_maxbucket's log base 2
	 * increases), we need to adjust the hashm_spares[] array and
	 * hashm_ovflpoint so that future overflow pages will be created beyond
	 * this new batch of bucket pages.
	 *
	 * XXX should initialize new bucket pages to prevent out-of-order
	 * page creation?  Don't wanna do it right here though.
	 */
	spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
	if (spare_ndx > metap->hashm_ovflpoint)
	{
		Assert(spare_ndx == metap->hashm_ovflpoint + 1);
		metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
		metap->hashm_ovflpoint = spare_ndx;
	}

	/* now we can compute the new bucket's primary block number */
	start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

	Assert(!_hash_has_active_scan(rel, new_bucket));

	if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
		elog(PANIC, "could not get lock on supposedly new bucket");

	/*
	 * Copy bucket mapping info now; this saves re-accessing the meta page
	 * inside _hash_splitbucket's inner loop.  Note that once we drop the
	 * split lock, other splits could begin, so these values might be out of
	 * date before _hash_splitbucket finishes.  That's okay, since all it
	 * needs is to tell which of these two buckets to map hashkeys into.
	 */
	maxbucket = metap->hashm_maxbucket;
	highmask = metap->hashm_highmask;
	lowmask = metap->hashm_lowmask;

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Release split lock; okay for other splits to occur now */
	_hash_droplock(rel, 0, HASH_EXCLUSIVE);

	/* Relocate records to the new bucket */
	_hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
					  start_oblkno, start_nblkno,
					  maxbucket, highmask, lowmask);

	/* Release bucket locks, allowing others to access them */
	_hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
	_hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);

	return;

	/* Here if we decide not to split, or fail to acquire the old bucket lock */
fail:

	/* We didn't write the metapage, so just drop lock */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/* Release split lock */
	_hash_droplock(rel, 0, HASH_EXCLUSIVE);
}
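
spare_ndx above is derived with _hash_log2. A hedged standalone sketch of that helper follows, under the assumption that it returns the smallest i such that 2^i >= num; toy_log2 is an invented name for the sketch.

#include <stdint.h>
#include <stdio.h>

/* Smallest i with (1 << i) >= num. */
static uint32_t
toy_log2(uint32_t num)
{
	uint32_t	i,
				limit;

	for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
		;
	return i;
}

int
main(void)
{
	/* spare_ndx = _hash_log2(new_bucket + 1) in the split code above. */
	printf("%u %u %u %u\n",
		   (unsigned) toy_log2(1), (unsigned) toy_log2(2),
		   (unsigned) toy_log2(3), (unsigned) toy_log2(4));
	/* prints: 0 1 2 2 */
	return 0;
}
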
Example #11
/*
 * Attempt to expand the hash table by creating one new bucket.
 *
 * This will silently do nothing if it cannot get the needed locks.
 *
 * The caller should hold no locks on the hash index.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    Bucket		old_bucket;
    Bucket		new_bucket;
    uint32		spare_ndx;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    uint32		maxbucket;
    uint32		highmask;
    uint32		lowmask;

    /*
     * Obtain the page-zero lock to assert the right to begin a split (see
     * README).
     *
     * Note: deadlock should be impossible here. Our own backend could only be
     * holding bucket sharelocks due to stopped indexscans; those will not
     * block other holders of the page-zero lock, who are only interested in
     * acquiring bucket sharelocks themselves.	Exclusive bucket locks are
     * only taken here and in hashbulkdelete, and neither of these operations
     * needs any additional locks to complete.	(If, due to some flaw in this
     * reasoning, we manage to deadlock anyway, it's okay to error out; the
     * index will be left in a consistent state.)
     */
    _hash_getlock(rel, 0, HASH_EXCLUSIVE);

    /* Write-lock the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    _hash_checkpage(rel, metabuf, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    /*
     * Check to see if split is still needed; someone else might have already
     * done one while we waited for the lock.
     *
     * Make sure this stays in sync with _hash_doinsert()
     */
    if (metap->hashm_ntuples <=
            (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
        goto fail;

    /*
     * Can't split anymore if maxbucket has reached its maximum possible
     * value.
     *
     * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
     * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
     * to half that because of overflow looping in _hash_log2() and
     * insufficient space in hashm_spares[].  It's moot anyway because an
     * index with 2^32 buckets would certainly overflow BlockNumber and hence
     * _hash_alloc_buckets() would fail, but if we supported buckets smaller
     * than a disk block then this would be an independent constraint.
     *
     * If you change this, see also the maximum initial number of buckets in
     * _hash_metapinit().
     */
    if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
        goto fail;

    /*
     * Determine which bucket is to be split, and attempt to lock the old
     * bucket.	If we can't get the lock, give up.
     *
     * The lock protects us against other backends, but not against our own
     * backend.  Must check for active scans separately.
     */
    new_bucket = metap->hashm_maxbucket + 1;

    old_bucket = (new_bucket & metap->hashm_lowmask);

    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);

    if (_hash_has_active_scan(rel, old_bucket))
        goto fail;

    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
        goto fail;

    /*
     * Likewise lock the new bucket (should never fail).
     *
     * Note: it is safe to compute the new bucket's blkno here, even though we
     * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
     * the current value of hashm_spares[hashm_ovflpoint] correctly shows
     * where we are going to put a new splitpoint's worth of buckets.
     */
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

    if (_hash_has_active_scan(rel, new_bucket))
        elog(ERROR, "scan in progress on supposedly new bucket");

    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
        elog(ERROR, "could not get lock on supposedly new bucket");

    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to allocate a new batch of bucket pages.
     */
    spare_ndx = _hash_log2(new_bucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);

        /*
         * The number of buckets in the new splitpoint is equal to the total
         * number already in existence, i.e. new_bucket.  Currently this maps
         * one-to-one to blocks required, but someday we may need a more
         * complicated calculation here.
         */
        if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
        {
            /* can't split due to BlockNumber overflow */
            _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
            _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
            goto fail;
        }
    }

    /*
     * Okay to proceed with split.	Update the metapage bucket mapping info.
     *
     * Since we are scribbling on the metapage data right in the shared
     * buffer, any failure in this next little bit leaves us with a big
     * problem: the metapage is effectively corrupt but could get written back
     * to disk.  We don't really expect any failure, but just to be sure,
     * establish a critical section.
     */
    START_CRIT_SECTION();

    metap->hashm_maxbucket = new_bucket;

    if (new_bucket > metap->hashm_highmask)
    {
        /* Starting a new doubling */
        metap->hashm_lowmask = metap->hashm_highmask;
        metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
    }

    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to adjust the hashm_spares[] array and
     * hashm_ovflpoint so that future overflow pages will be created beyond
     * this new batch of bucket pages.
     */
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
        metap->hashm_ovflpoint = spare_ndx;
    }

    /* Done mucking with metapage */
    END_CRIT_SECTION();

    /*
     * Copy bucket mapping info now; this saves re-accessing the meta page
     * inside _hash_splitbucket's inner loop.  Note that once we drop the
     * split lock, other splits could begin, so these values might be out of
     * date before _hash_splitbucket finishes.	That's okay, since all it
     * needs is to tell which of these two buckets to map hashkeys into.
     */
    maxbucket = metap->hashm_maxbucket;
    highmask = metap->hashm_highmask;
    lowmask = metap->hashm_lowmask;

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Release split lock; okay for other splits to occur now */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);

    /* Relocate records to the new bucket */
    _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
                      start_oblkno, start_nblkno,
                      maxbucket, highmask, lowmask);

    /* Release bucket locks, allowing others to access them */
    _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
    _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);

    return;

    /* Here if we decide not to split, or fail to acquire the old bucket lock */
fail:

    /* We didn't write the metapage, so just drop lock */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Release split lock */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);
}
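
A standalone sketch of the metapage mask bookkeeping in the split path above: struct meta and the starting values are illustrative, and add_bucket simply replays the "starting a new doubling" branch one split at a time.

#include <stdint.h>
#include <stdio.h>

/* Toy metapage state: just the three fields the doubling logic touches. */
struct meta
{
	uint32_t	maxbucket;
	uint32_t	highmask;
	uint32_t	lowmask;
};

/* One split, following the metapage update in _hash_expandtable. */
static void
add_bucket(struct meta *m)
{
	uint32_t	new_bucket = m->maxbucket + 1;

	m->maxbucket = new_bucket;
	if (new_bucket > m->highmask)
	{
		/* starting a new doubling */
		m->lowmask = m->highmask;
		m->highmask = new_bucket | m->lowmask;
	}
}

int
main(void)
{
	/* Assumed starting point: two buckets (0 and 1). */
	struct meta m = {1, 3, 1};

	for (int i = 0; i < 6; i++)
	{
		add_bucket(&m);
		printf("maxbucket=%u highmask=%u lowmask=%u\n",
			   (unsigned) m.maxbucket, (unsigned) m.highmask,
			   (unsigned) m.lowmask);
	}
	/* The masks jump (3,1) -> (7,3) when bucket 4 starts a new doubling. */
	return 0;
}
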
Example #12
/*
 *	_hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup)
{
	Buffer		buf;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleDSize(*itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */

	/*
	 * Acquire shared split lock so we can compute the target bucket safely
	 * (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize((Page) metap))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %lu exceeds hash maximum %lu",
						(unsigned long) itemsz,
						(unsigned long) HashMaxItemSize((Page) metap)),
			errhint("Values larger than a buffer page cannot be indexed.")));

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* release lock on metapage, but keep pin since we'll need it again */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);

	_hash_droplock(rel, 0, HASH_SHARE);

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(pageopaque->hasho_bucket == bucket);

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		/*
		 * no space on this page; check for an overflow page
		 */
		BlockNumber nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.
			 */
			_hash_relbuf(rel, buf);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/* found page with enough space, so add the item here */
	(void) _hash_pgaddtup(rel, buf, itemsz, itup);

	/* write and release the modified page */
	_hash_wrtbuf(rel, buf);

	/* We can drop the bucket lock now */
	_hash_droplock(rel, blkno, HASH_SHARE);

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}
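
The do_expand test above is plain arithmetic on the metapage fields. A minimal sketch with invented numbers follows; needs_split is a hypothetical helper, not a PostgreSQL function.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Split once the tuple count exceeds ffactor tuples per existing bucket. */
static bool
needs_split(double ntuples, uint16_t ffactor, uint32_t maxbucket)
{
	return ntuples > (double) ffactor * (maxbucket + 1);
}

int
main(void)
{
	/* Illustrative numbers: 4 buckets, target fill factor of 10. */
	printf("%d\n", needs_split(40.0, 10, 3));	/* 0: exactly at the limit */
	printf("%d\n", needs_split(41.0, 10, 3));	/* 1: one over, time to expand */
	return 0;
}
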
Example #13
/*
 *	_hash_doinsert() -- Handle insertion of a single HashItem in the table.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, hashitem is completely filled in.
 *		The datum to be used as a "key" is in the hashitem.
 */
InsertIndexResult
_hash_doinsert(Relation rel, HashItem hitem)
{
	Buffer		buf;
	Buffer		metabuf;
	HashMetaPage metap;
	IndexTuple	itup;
	BlockNumber itup_blkno;
	OffsetNumber itup_off;
	InsertIndexResult res;
	BlockNumber blkno;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;
	Datum		datum;
	bool		isnull;

	/*
	 * Compute the hash key for the item.  We do this first so as not to
	 * need to hold any locks while running the hash function.
	 */
	itup = &(hitem->hash_itup);
	if (rel->rd_rel->relnatts != 1)
		elog(ERROR, "hash indexes support only one index key");
	datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
	Assert(!isnull);
	hashkey = _hash_datum2hashkey(rel, datum);

	/* compute item size too */
	itemsz = IndexTupleDSize(hitem->hash_itup)
		+ (sizeof(HashItemData) - sizeof(IndexTupleData));

	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but
								 * we need to be consistent */

	/*
	 * Acquire shared split lock so we can compute the target bucket
	 * safely (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually,
	 * we ought to try to apply TOAST methods if not.)  Note that at this
	 * point, itemsz doesn't include the ItemId.
	 */
	if (itemsz > HashMaxItemSize((Page) metap))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %lu exceeds hash maximum %lu",
						(unsigned long) itemsz,
						(unsigned long) HashMaxItemSize((Page) metap))));

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* release lock on metapage, but keep pin since we'll need it again */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);

	_hash_droplock(rel, 0, HASH_SHARE);

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_WRITE);
	page = BufferGetPage(buf);
	_hash_checkpage(rel, page, LH_BUCKET_PAGE);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(pageopaque->hasho_bucket == bucket);

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		/*
		 * no space on this page; check for an overflow page
		 */
		BlockNumber	nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room,
			 * we'll find out next pass through the loop test above.
			 */
			_hash_relbuf(rel, buf);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		_hash_checkpage(rel, page, LH_OVERFLOW_PAGE);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/* found page with enough space, so add the item here */
	itup_off = _hash_pgaddtup(rel, buf, itemsz, hitem);
	itup_blkno = BufferGetBlockNumber(buf);

	/* write and release the modified page */
	_hash_wrtbuf(rel, buf);

	/* We can drop the bucket lock now */
	_hash_droplock(rel, blkno, HASH_SHARE);

	/*
	 * Write-lock the metapage so we can increment the tuple count.
	 * After incrementing it, check to see if it's time for a split.
	 */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);

	/* Create the return data structure */
	res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));

	ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);

	return res;
}
Example #14
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashbulkdelete(PG_FUNCTION_ARGS)
{
	Relation	rel = (Relation) PG_GETARG_POINTER(0);
	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
	void	   *callback_state = (void *) PG_GETARG_POINTER(2);
	IndexBulkDeleteResult *result;
	BlockNumber num_pages;
	double		tuples_removed;
	double		num_index_tuples;
	double		orig_ntuples;
	Bucket		orig_maxbucket;
	Bucket		cur_maxbucket;
	Bucket		cur_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPageData local_metapage;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * Read the metapage to fetch original bucket and tuple counts.  Also,
	 * we keep a copy of the last-seen metapage so that we can use its
	 * hashm_spares[] values to compute bucket page addresses.  This is a
	 * bit hokey but perfectly safe, since the interesting entries in the
	 * spares array cannot change under us; and it beats rereading the
	 * metapage for each bucket.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);
	orig_maxbucket = metap->hashm_maxbucket;
	orig_ntuples = metap->hashm_ntuples;
	memcpy(&local_metapage, metap, sizeof(local_metapage));
	_hash_relbuf(rel, metabuf);

	/* Scan the buckets that we know exist */
	cur_bucket = 0;
	cur_maxbucket = orig_maxbucket;

loop_top:
	while (cur_bucket <= cur_maxbucket)
	{
		BlockNumber bucket_blkno;
		BlockNumber blkno;
		bool		bucket_dirty = false;

		/* Get address of bucket's start page */
		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

		/* Exclusive-lock the bucket so we can shrink it */
		_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Shouldn't have any active scans locally, either */
		if (_hash_has_active_scan(rel, cur_bucket))
			elog(ERROR, "hash index has active scan during VACUUM");

		/* Scan each page in bucket */
		blkno = bucket_blkno;
		while (BlockNumberIsValid(blkno))
		{
			Buffer		buf;
			Page		page;
			HashPageOpaque opaque;
			OffsetNumber offno;
			OffsetNumber maxoffno;
			bool		page_dirty = false;

			buf = _hash_getbuf(rel, blkno, HASH_WRITE);
			page = BufferGetPage(buf);
			_hash_checkpage(rel, page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
			Assert(opaque->hasho_bucket == cur_bucket);

			/* Scan each tuple in page */
			offno = FirstOffsetNumber;
			maxoffno = PageGetMaxOffsetNumber(page);
			while (offno <= maxoffno)
			{
				HashItem	hitem;
				ItemPointer htup;

				hitem = (HashItem) PageGetItem(page,
											   PageGetItemId(page, offno));
				htup = &(hitem->hash_itup.t_tid);
				if (callback(htup, callback_state))
				{
					/* delete the item from the page */
					PageIndexTupleDelete(page, offno);
					bucket_dirty = page_dirty = true;

					/* don't increment offno, instead decrement maxoffno */
					maxoffno = OffsetNumberPrev(maxoffno);

					tuples_removed += 1;
				}
				else
				{
					offno = OffsetNumberNext(offno);

					num_index_tuples += 1;
				}
			}

			/*
			 * Write page if needed, advance to next page.
			 */
			blkno = opaque->hasho_nextblkno;

			if (page_dirty)
				_hash_wrtbuf(rel, buf);
			else
				_hash_relbuf(rel, buf);
		}

		/* If we deleted anything, try to compact free space */
		if (bucket_dirty)
			_hash_squeezebucket(rel, cur_bucket, bucket_blkno);

		/* Release bucket lock */
		_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Advance to next bucket */
		cur_bucket++;
	}

	/* Write-lock metapage and check for split since we started */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);

	if (cur_maxbucket != metap->hashm_maxbucket)
	{
		/* There's been a split, so process the additional bucket(s) */
		cur_maxbucket = metap->hashm_maxbucket;
		memcpy(&local_metapage, metap, sizeof(local_metapage));
		_hash_relbuf(rel, metabuf);
		goto loop_top;
	}

	/* Okay, we're really done.  Update tuple count in metapage. */

	if (orig_maxbucket == metap->hashm_maxbucket &&
		orig_ntuples == metap->hashm_ntuples)
	{
		/*
		 * No one has split or inserted anything since start of scan,
		 * so believe our count as gospel.
		 */
		metap->hashm_ntuples = num_index_tuples;
	}
	else
	{
		/*
		 * Otherwise, our count is untrustworthy since we may have
		 * double-scanned tuples in split buckets.  Proceed by
		 * dead-reckoning.
		 */
		if (metap->hashm_ntuples > tuples_removed)
			metap->hashm_ntuples -= tuples_removed;
		else
			metap->hashm_ntuples = 0;
		num_index_tuples = metap->hashm_ntuples;
	}

	_hash_wrtbuf(rel, metabuf);

	/* return statistics */
	num_pages = RelationGetNumberOfBlocks(rel);

	result = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	result->num_pages = num_pages;
	result->num_index_tuples = num_index_tuples;
	result->tuples_removed = tuples_removed;

	PG_RETURN_POINTER(result);
}
Example #15
/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		Find the first item in the index that
 *		satisfies the qualification associated with the scan descriptor. On
 *		success, the page containing the current index tuple is read locked
 *		and pinned, and the scan's opaque data entry is updated to
 *		include the buffer.
 */
bool _hash_first(struct index_scan *scan, enum scandir dir)
{
	struct relation *rel = scan->indexRelation;
	struct hash_scan_opaque_data *so = (struct hash_scan_opaque_data *)scan->opaque;
	struct scankey *cur;
	uint32 hashkey;
	bucket_t bucket;
	block_t blkno;
	buf_id_t buf;
	buf_id_t metabuf;
	page_p page;
	struct hash_page *opaque;
	struct hash_meta_page_data *metap;
	struct index_tuple *itup;
	struct item_ptr *current;
	item_id_t offnum;

	stat_index_scan(rel);

	current = &(so->hashso_curpos);
	ITEM_PTR_SET_INVALID(current);

	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket. That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
	if (scan->numberOfKeys < 1) {
		ereport(ERROR, (
		errcode(E_FEATURE_NOT_SUPPORTED),
		errmsg("hash indexes do not support whole-index scans")));
	}

	/* There may be more than one index qual, but we hash only the first */
	cur = &scan->keyData[0];

	/* We support only single-column hash indexes */
	ASSERT(cur->sk_attno == 1);

	/* And there's only one operator strategy, too */
	ASSERT(cur->sk_strategy == HT_EQ_STRATEGY_NR);

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
	if (cur->sk_flags & SK_ISNULL)
		return false;

	/*
	 * Okay to compute the hash key.  We want to do this before acquiring any
	 * locks, in case a user-defined hash function happens to be slow.
	 *
	 * If scankey operator is not a cross-type comparison, we can use the
	 * cached hash function; otherwise gotta look it up in the catalogs.
	 *
	 * We support the convention that sk_subtype == INVALID_OID means the
	 * opclass input type; this is a hack to simplify life for scankey_init().
	 */
	if (cur->sk_subtype == rel->rd_opcintype[0]
		|| cur->sk_subtype == INVALID_OID)
		hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
	else
		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument, cur->sk_subtype);

	so->hashso_sk_hash = hashkey;

	/*
	 * Acquire shared split lock so we can compute the target bucket safely
	 * (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HASH_PAGE_GET_META(BUF_PAGE(metabuf));

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(
		hashkey,
		metap->hashm_maxbucket,	
		metap->hashm_highmask,
		metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* done with the metapage */
	_hash_relbuf(rel, metabuf);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);
	_hash_droplock(rel, 0, HASH_SHARE);

	/* Update scan opaque state to show we have lock on the bucket */
	so->hashso_bucket = bucket;
	so->hashso_bucket_valid = true;
	so->hashso_bucket_blkno = blkno;

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
	page = BUF_PAGE(buf);
	opaque = (struct hash_page*) PAGE_SPECIAL_PTR(page);
	ASSERT(opaque->hasho_bucket == bucket);

	/* If a backwards scan is requested, move to the end of the chain */
	if (SCANDIR_BACKWARD(dir)) {
		while (BLK_NR_VALID(opaque->hasho_nextblkno))
			_hash_readnext(rel, &buf, &page, &opaque);
	}

	/* Now find the first tuple satisfying the qualification */
	if (!_hash_step(scan, &buf, dir))
		return false;

	/* if we're here, _hash_step found a valid tuple */
	offnum = ITEM_PTR_OFFSET(current);
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BUF_PAGE(buf);
	itup = (struct index_tuple *)PAGE_GET_ITEM(page, PAGE_ITEM_ID(page, offnum));
	so->hashso_heappos = itup->t_tid;

	return true;
}