Example #1
/* -------------------------------------------------
 * GetHashPageStatistics()
 *
 * Collect statistics of single hash page
 * -------------------------------------------------
 */
static void
GetHashPageStatistics(Page page, HashPageStat * stat)
{
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
	int			off;

	stat->dead_items = stat->live_items = 0;
	stat->page_size = PageGetPageSize(page);

	/* hash page opaque data */
	stat->hasho_prevblkno = opaque->hasho_prevblkno;
	stat->hasho_nextblkno = opaque->hasho_nextblkno;
	stat->hasho_bucket = opaque->hasho_bucket;
	stat->hasho_flag = opaque->hasho_flag;
	stat->hasho_page_id = opaque->hasho_page_id;

	/* count live and dead tuples, and free space */
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		ItemId		id = PageGetItemId(page, off);

		if (!ItemIdIsDead(id))
			stat->live_items++;
		else
			stat->dead_items++;
	}
	stat->free_size = PageGetFreeSpace(page);
}
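A minimal caller-side sketch for the statistics collector above, assuming the usual PostgreSQL backend environment: read the target block, share-lock it while the page is inspected, then release. The wrapper name hash_page_stats_example is hypothetical.

#include "postgres.h"
#include "storage/bufmgr.h"

static void
hash_page_stats_example(Relation indexrel, BlockNumber blkno,
						HashPageStat *stat)
{
	Buffer		buf;

	buf = ReadBuffer(indexrel, blkno);
	LockBuffer(buf, BUFFER_LOCK_SHARE);

	/* fills live/dead item counts, free_size and the hasho_* fields */
	GetHashPageStatistics(BufferGetPage(buf), stat);

	UnlockReleaseBuffer(buf);
}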
static bool
entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off,
				   GinBtreeEntryInsertData *insertData)
{
	Size		releasedsz = 0;
	Size		addedsz;
	Page		page = BufferGetPage(buf);

	Assert(insertData->entry);
	Assert(!GinPageIsData(page));

	if (insertData->isDelete)
	{
		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));

		releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
	}

	addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData);

	if (PageGetFreeSpace(page) + releasedsz >= addedsz)
		return true;

	return false;
}
Example #3
/*
 * Release resources associated with a BrinBuildState.
 */
static void
terminate_brin_buildstate(BrinBuildState *state)
{
	/*
	 * Release the last index buffer used.  We might as well ensure that
	 * whatever free space remains in that page is available in FSM, too.
	 */
	if (!BufferIsInvalid(state->bs_currentInsertBuf))
	{
		Page		page;
		Size		freespace;
		BlockNumber blk;

		page = BufferGetPage(state->bs_currentInsertBuf);
		freespace = PageGetFreeSpace(page);
		blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
		ReleaseBuffer(state->bs_currentInsertBuf);
		RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
		FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
	}

	brin_free_desc(state->bs_bdesc);
	pfree(state->bs_dtuple);
	pfree(state);
}
/*
 * Return the amount of free space on a regular BRIN index page.
 *
 * If the page is not a regular page, or has been marked with the
 * BRIN_EVACUATE_PAGE flag, returns 0.
 */
static Size
br_page_get_freespace(Page page)
{
	if (!BRIN_IS_REGULAR_PAGE(page) ||
		(BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
		return 0;
	else
		return PageGetFreeSpace(page);
}
Example #5
/*
 * Check space for itup vector on page
 */
static int
gistnospace(Page page, IndexTuple *itvec, int len)
{
	unsigned int size = 0;
	int			i;

	for (i = 0; i < len; i++)
		size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);

	return (PageGetFreeSpace(page) < size);
}
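A hedged sketch of the decision this check typically drives in a GiST insert path: if the tuple vector does not fit, split the page, otherwise add the tuples in place. gistSplitPage_example and gistAddTuples_example are hypothetical placeholders, not the actual gist.c routines.

#include "postgres.h"
#include "storage/bufmgr.h"

/* hypothetical helpers, standing in for the real add/split logic */
static void gistAddTuples_example(Relation r, Buffer buffer, IndexTuple *itvec, int len);
static void gistSplitPage_example(Relation r, Buffer buffer, IndexTuple *itvec, int len);

static void
gist_place_example(Relation r, Buffer buffer, IndexTuple *itvec, int len)
{
	Page		page = BufferGetPage(buffer);

	if (gistnospace(page, itvec, len))
		gistSplitPage_example(r, buffer, itvec, len);	/* no room: split */
	else
		gistAddTuples_example(r, buffer, itvec, len);	/* fits: add in place */
}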
Example #6
/*
 * Return the amount of free space on a regular BRIN index page.
 *
 * If the page is not a regular page, or has been marked with the
 * BRIN_EVACUATE_PAGE flag, returns 0.
 */
static Size
br_page_get_freespace(Page page)
{
	BrinSpecialSpace *special;

	special = (BrinSpecialSpace *) PageGetSpecialPointer(page);
	if (!BRIN_IS_REGULAR_PAGE(page) ||
		(special->flags & BRIN_EVACUATE_PAGE) != 0)
		return 0;
	else
		return PageGetFreeSpace(page);
}
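A hedged sketch of the typical consumer of br_page_get_freespace(): during index cleanup, the usable space of each regular page is reported to the free space map, so evacuation-marked and non-regular pages (which report 0) are never offered for new tuples. The wrapper name is hypothetical.

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/freespace.h"

static void
brin_record_page_space_example(Relation idxrel, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/* regular pages report their free space; special pages report 0 */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
							br_page_get_freespace(page));
}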
Example #7
/*
 * PageGetHeapFreeSpace
 *		Returns the size of the free (allocatable) space on a page,
 *		reduced by the space needed for a new line pointer.
 *
 * The difference between this and PageGetFreeSpace is that this will return
 * zero if there are already MaxHeapTuplesPerPage line pointers in the page
 * and none are free.  We use this to enforce that no more than
 * MaxHeapTuplesPerPage line pointers are created on a heap page.  (Although
 * no more tuples than that could fit anyway, in the presence of redirected
 * or dead line pointers it'd be possible to have too many line pointers.
 * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit
 * on the number of line pointers, we make this extra check.)
 */
Size
PageGetHeapFreeSpace(Page page)
{
	Size		space;

	space = PageGetFreeSpace(page);
	if (space > 0)
	{
		OffsetNumber offnum,
					nline;

		/*
		 * Are there already MaxHeapTuplesPerPage line pointers in the page?
		 */
		nline = PageGetMaxOffsetNumber(page);
		if (nline >= MaxHeapTuplesPerPage)
		{
			if (PageHasFreeLinePointers((PageHeader) page))
			{
				/*
				 * Since this is just a hint, we must confirm that there is
				 * indeed a free line pointer
				 */
				for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
				{
					ItemId		lp = PageGetItemId(page, offnum);

					if (!ItemIdIsUsed(lp))
						break;
				}

				if (offnum > nline)
				{
					/*
					 * The hint is wrong, but we can't clear it here since we
					 * don't have the ability to mark the page dirty.
					 */
					space = 0;
				}
			}
			else
			{
				/*
				 * Although the hint might be wrong, PageAddItem will believe
				 * it anyway, so we must believe it too.
				 */
				space = 0;
			}
		}
	}
	return space;
}
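A minimal sketch, assuming the backend environment, of how a heap caller might size its request against PageGetHeapFreeSpace(): the tuple length is MAXALIGN'd first, matching what PageAddItem will actually consume. The helper name is illustrative.

#include "postgres.h"
#include "storage/bufpage.h"

static bool
heap_tuple_fits_example(Page page, Size len)
{
	/* room for the MAXALIGN'd tuple, with the line-pointer limit enforced */
	return MAXALIGN(len) <= PageGetHeapFreeSpace(page);
}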
Example #8
/*
 *	lazy_vacuum_heap() -- second pass over the heap
 *
 *		This routine marks dead tuples as unused and compacts out free
 *		space on their pages.  Pages not having dead tuples recorded from
 *		lazy_scan_heap are not visited at all.
 *
 * Note: the reason for doing this as a second pass is we cannot remove
 * the tuples until we've removed their index entries, and we want to
 * process index entry removal in batches as large as possible.
 */
static void
lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	int			tupindex;
	int			npages;
	PGRUsage	ru0;

	pg_rusage_init(&ru0);
	npages = 0;

	tupindex = 0;

	/* Fetch gp_persistent_relation_node information that will be added to XLOG record. */
	RelationFetchGpRelationNodeForXLog(onerel);

	while (tupindex < vacrelstats->num_dead_tuples)
	{
		BlockNumber tblk;
		Buffer		buf;
		Page		page;

		vacuum_delay_point();

		tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);

		/* -------- MirroredLock ---------- */
		MIRROREDLOCK_BUFMGR_LOCK;

		buf = ReadBuffer(onerel, tblk);
		LockBufferForCleanup(buf);
		tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats);
		/* Now that we've compacted the page, record its available space */
		page = BufferGetPage(buf);
		lazy_record_free_space(vacrelstats, tblk,
							   PageGetFreeSpace(page));
		UnlockReleaseBuffer(buf);

		MIRROREDLOCK_BUFMGR_UNLOCK;
		/* -------- MirroredLock ---------- */

		npages++;
	}

	ereport(elevel,
			(errmsg("\"%s\": removed %d row versions in %d pages",
					RelationGetRelationName(onerel),
					tupindex, npages),
			 errdetail("%s.",
					   pg_rusage_show(&ru0))));
}
/*
 * Release resources associated with a BrinBuildState.
 */
static void
terminate_brin_buildstate(BrinBuildState *state)
{
	/* release the last index buffer used */
	if (!BufferIsInvalid(state->bs_currentInsertBuf))
	{
		Page		page;

		page = BufferGetPage(state->bs_currentInsertBuf);
		RecordPageWithFreeSpace(state->bs_irel,
							BufferGetBlockNumber(state->bs_currentInsertBuf),
								PageGetFreeSpace(page));
		ReleaseBuffer(state->bs_currentInsertBuf);
	}

	brin_free_desc(state->bs_bdesc);
	pfree(state->bs_dtuple);
	pfree(state);
}
Example #10
/*
 * Check space for itup vector on page
 */
bool
gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace)
{
	unsigned int size = freespace,
				deleted = 0;
	int			i;

	for (i = 0; i < len; i++)
		size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);

	if (todelete != InvalidOffsetNumber)
	{
		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, todelete));

		deleted = IndexTupleSize(itup) + sizeof(ItemIdData);
	}

	return (PageGetFreeSpace(page) + deleted < size);
}
Example #11
static void
gist_dumptree(Relation r, int level, BlockNumber blk, OffsetNumber coff)
{
	Buffer		buffer;
	Page		page;
	GISTPageOpaque opaque;
	IndexTuple	which;
	ItemId		iid;
	OffsetNumber i,
				maxoff;
	BlockNumber cblk;
	char	   *pred;

	pred = (char *) palloc(sizeof(char) * level + 1);
	MemSet(pred, '\t', level);
	pred[level] = '\0';

	buffer = ReadBuffer(r, blk);
	page = (Page) BufferGetPage(buffer);
	opaque = (GISTPageOpaque) PageGetSpecialPointer(page);

	maxoff = PageGetMaxOffsetNumber(page);

	elog(DEBUG4, "%sPage: %d %s blk: %d maxoff: %d free: %d", pred,
		 coff, (opaque->flags & F_LEAF) ? "LEAF" : "INTE", (int) blk,
		 (int) maxoff, PageGetFreeSpace(page));

	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
	{
		iid = PageGetItemId(page, i);
		which = (IndexTuple) PageGetItem(page, iid);
		cblk = ItemPointerGetBlockNumber(&(which->t_tid));
#ifdef PRINTTUPLE
		elog(DEBUG4, "%s  Tuple. blk: %d size: %d", pred, (int) cblk,
			 IndexTupleSize(which));
#endif

		if (!(opaque->flags & F_LEAF))
			gist_dumptree(r, level + 1, cblk, i);
	}
	ReleaseBuffer(buffer);
	pfree(pred);
}
Example #12
static bool
entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off)
{
	Size		itupsz = 0;
	Page		page = BufferGetPage(buf);

	Assert(btree->entry);
	Assert(!GinPageIsData(page));

	if (btree->isDelete)
	{
		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));

		itupsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
	}

	if (PageGetFreeSpace(page) + itupsz >= MAXALIGN(IndexTupleSize(btree->entry)) + sizeof(ItemIdData))
		return true;

	return false;
}
Example #13
/*
 * pgstat_index_page -- for generic index page
 */
static void
pgstat_index_page(pgstattuple_type *stat, Page page,
				  OffsetNumber minoff, OffsetNumber maxoff)
{
	OffsetNumber i;

	stat->free_space += PageGetFreeSpace(page);

	for (i = minoff; i <= maxoff; i = OffsetNumberNext(i))
	{
		ItemId		itemid = PageGetItemId(page, i);

		if (ItemIdIsDead(itemid))
		{
			stat->dead_tuple_count++;
			stat->dead_tuple_len += ItemIdGetLength(itemid);
		}
		else
		{
			stat->tuple_count++;
			stat->tuple_len += ItemIdGetLength(itemid);
		}
	}
}
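A hedged sketch of a caller feeding one index page into pgstat_index_page() above: read and share-lock the buffer, scan every line pointer, then release. Real per-AM callers choose minoff more carefully (for example, skipping a btree high key); FirstOffsetNumber here is a simplification, and the wrapper name is hypothetical.

#include "postgres.h"
#include "storage/bufmgr.h"

static void
pgstat_one_index_page_example(Relation idxrel, BlockNumber blkno,
							  pgstattuple_type *stat)
{
	Buffer		buf = ReadBuffer(idxrel, blkno);
	Page		page;

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	page = BufferGetPage(buf);

	/* accumulate live/dead counts and this page's free space into *stat */
	pgstat_index_page(stat, page, FirstOffsetNumber,
					  PageGetMaxOffsetNumber(page));

	UnlockReleaseBuffer(buf);
}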
/*
 * pgstattuple_real
 *
 * The real work occurs here
 */
static Datum
pgstattuple_real(Relation rel, FunctionCallInfo fcinfo)
{
	HeapScanDesc scan;
	HeapTuple	tuple;
	BlockNumber nblocks;
	BlockNumber block = 0;		/* next block to count free space in */
	BlockNumber tupblock;
	Buffer		buffer;
	uint64		table_len;
	uint64		tuple_len = 0;
	uint64		dead_tuple_len = 0;
	uint64		tuple_count = 0;
	uint64		dead_tuple_count = 0;
	double		tuple_percent;
	double		dead_tuple_percent;
	uint64		free_space = 0; /* free/reusable space in bytes */
	double		free_percent;	/* free/reusable space in % */
	TupleDesc	tupdesc;
	AttInMetadata *attinmeta;
	char	  **values;
	int			i;
	Datum		result;

	/* Build a tuple descriptor for our result type */
	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	/* make sure we have a persistent copy of the tupdesc */
	tupdesc = CreateTupleDescCopy(tupdesc);

	/*
	 * Generate attribute metadata needed later to produce tuples from raw C
	 * strings
	 */
	attinmeta = TupleDescGetAttInMetadata(tupdesc);

	scan = heap_beginscan(rel, SnapshotAny, 0, NULL);

	nblocks = scan->rs_nblocks; /* # blocks to be scanned */

	/* scan the relation */
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* must hold a buffer lock to call HeapTupleSatisfiesNow */
		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

		if (HeapTupleSatisfiesNow(tuple->t_data, scan->rs_cbuf))
		{
			tuple_len += tuple->t_len;
			tuple_count++;
		}
		else
		{
			dead_tuple_len += tuple->t_len;
			dead_tuple_count++;
		}

		LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * To avoid physically reading the table twice, try to do the
		 * free-space scan in parallel with the heap scan.	However,
		 * heap_getnext may find no tuples on a given page, so we cannot
		 * simply examine the pages returned by the heap scan.
		 */
		tupblock = BlockIdGetBlockNumber(&tuple->t_self.ip_blkid);

		while (block <= tupblock)
		{
			buffer = ReadBuffer(rel, block);
			LockBuffer(buffer, BUFFER_LOCK_SHARE);
			free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
			block++;
		}
	}
	heap_endscan(scan);

	while (block < nblocks)
	{
		buffer = ReadBuffer(rel, block);
		free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
		ReleaseBuffer(buffer);
		block++;
	}

	heap_close(rel, AccessShareLock);

	table_len = (uint64) nblocks *BLCKSZ;

	if (nblocks == 0)
	{
		tuple_percent = 0.0;
		dead_tuple_percent = 0.0;
		free_percent = 0.0;
	}
	else
	{
		tuple_percent = (double) tuple_len *100.0 / table_len;
		dead_tuple_percent = (double) dead_tuple_len *100.0 / table_len;
		free_percent = (double) free_space *100.0 / table_len;
	}

	/*
	 * Prepare a values array for constructing the tuple. This should be an
	 * array of C strings which will be processed later by the appropriate
	 * "in" functions.
	 */
	values = (char **) palloc(NCOLUMNS * sizeof(char *));
	for (i = 0; i < NCOLUMNS; i++)
		values[i] = (char *) palloc(NCHARS * sizeof(char));
	i = 0;
	snprintf(values[i++], NCHARS, INT64_FORMAT, table_len);
	snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_count);
	snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_len);
	snprintf(values[i++], NCHARS, "%.2f", tuple_percent);
	snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_count);
	snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_len);
	snprintf(values[i++], NCHARS, "%.2f", dead_tuple_percent);
	snprintf(values[i++], NCHARS, INT64_FORMAT, free_space);
	snprintf(values[i++], NCHARS, "%.2f", free_percent);

	/* build a tuple */
	tuple = BuildTupleFromCStrings(attinmeta, values);

	/* make the tuple into a datum */
	result = HeapTupleGetDatum(tuple);

	/* Clean up */
	for (i = 0; i < NCOLUMNS; i++)
		pfree(values[i]);
	pfree(values);

	return (result);
}
Example #15
/* -------------------------------------------------
 * GetBTPageStatistics()
 *
 * Collect statistics of single b-tree page
 * -------------------------------------------------
 */
static void
GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
{
	Page		page = BufferGetPage(buffer);
	PageHeader	phdr = (PageHeader) page;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	int			item_size = 0;
	int			off;

	stat->blkno = blkno;

	stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData);

	stat->dead_items = stat->live_items = 0;

	stat->page_size = PageGetPageSize(page);

	/* page type (flags) */
	if (P_ISDELETED(opaque))
	{
		stat->type = 'd';
		stat->btpo.xact = opaque->btpo.xact;
		return;
	}
	else if (P_IGNORE(opaque))
		stat->type = 'e';
	else if (P_ISLEAF(opaque))
		stat->type = 'l';
	else if (P_ISROOT(opaque))
		stat->type = 'r';
	else
		stat->type = 'i';

	/* btpage opaque data */
	stat->btpo_prev = opaque->btpo_prev;
	stat->btpo_next = opaque->btpo_next;
	stat->btpo.level = opaque->btpo.level;
	stat->btpo_flags = opaque->btpo_flags;
	stat->btpo_cycleid = opaque->btpo_cycleid;

	/* count live and dead tuples, and free space */
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		IndexTuple	itup;

		ItemId		id = PageGetItemId(page, off);

		itup = (IndexTuple) PageGetItem(page, id);

		item_size += IndexTupleSize(itup);

		if (!ItemIdIsDead(id))
			stat->live_items++;
		else
			stat->dead_items++;
	}
	stat->free_size = PageGetFreeSpace(page);

	if ((stat->live_items + stat->dead_items) > 0)
		stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
	else
		stat->avg_item_size = 0;
}
Example #16
/*
 *	_hash_doinsert() -- Handle insertion of a single HashItem in the table.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, hashitem is completely filled in.
 *		The datum to be used as a "key" is in the hashitem.
 */
InsertIndexResult
_hash_doinsert(Relation rel, HashItem hitem)
{
	Buffer		buf;
	Buffer		metabuf;
	HashMetaPage metap;
	IndexTuple	itup;
	BlockNumber itup_blkno;
	OffsetNumber itup_off;
	InsertIndexResult res;
	BlockNumber blkno;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;
	Datum		datum;
	bool		isnull;

	/*
	 * Compute the hash key for the item.  We do this first so as not to
	 * need to hold any locks while running the hash function.
	 */
	itup = &(hitem->hash_itup);
	if (rel->rd_rel->relnatts != 1)
		elog(ERROR, "hash indexes support only one index key");
	datum = index_getattr(itup, 1, RelationGetDescr(rel), &isnull);
	Assert(!isnull);
	hashkey = _hash_datum2hashkey(rel, datum);

	/* compute item size too */
	itemsz = IndexTupleDSize(hitem->hash_itup)
		+ (sizeof(HashItemData) - sizeof(IndexTupleData));

	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but
								 * we need to be consistent */

	/*
	 * Acquire shared split lock so we can compute the target bucket
	 * safely (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
	metap = (HashMetaPage) BufferGetPage(metabuf);
	_hash_checkpage(rel, (Page) metap, LH_META_PAGE);

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually,
	 * we ought to try to apply TOAST methods if not.)  Note that at this
	 * point, itemsz doesn't include the ItemId.
	 */
	if (itemsz > HashMaxItemSize((Page) metap))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %lu exceeds hash maximum %lu",
						(unsigned long) itemsz,
						(unsigned long) HashMaxItemSize((Page) metap))));

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* release lock on metapage, but keep pin since we'll need it again */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);

	_hash_droplock(rel, 0, HASH_SHARE);

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_WRITE);
	page = BufferGetPage(buf);
	_hash_checkpage(rel, page, LH_BUCKET_PAGE);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(pageopaque->hasho_bucket == bucket);

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		/*
		 * no space on this page; check for an overflow page
		 */
		BlockNumber	nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room,
			 * we'll find out next pass through the loop test above.
			 */
			_hash_relbuf(rel, buf);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		_hash_checkpage(rel, page, LH_OVERFLOW_PAGE);
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/* found page with enough space, so add the item here */
	itup_off = _hash_pgaddtup(rel, buf, itemsz, hitem);
	itup_blkno = BufferGetBlockNumber(buf);

	/* write and release the modified page */
	_hash_wrtbuf(rel, buf);

	/* We can drop the bucket lock now */
	_hash_droplock(rel, blkno, HASH_SHARE);

	/*
	 * Write-lock the metapage so we can increment the tuple count.
	 * After incrementing it, check to see if it's time for a split.
	 */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);

	/* Create the return data structure */
	res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));

	ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);

	return res;
}
Example #17
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
				  Buffer metabuf,
				  Bucket obucket,
				  Bucket nbucket,
				  BlockNumber start_oblkno,
				  BlockNumber start_nblkno,
				  uint32 maxbucket,
				  uint32 highmask,
				  uint32 lowmask)
{
	Bucket		bucket;
	Buffer		obuf;
	Buffer		nbuf;
	BlockNumber oblkno;
	BlockNumber nblkno;
	bool		null;
	Datum		datum;
	HashItem	hitem;
	HashPageOpaque oopaque;
	HashPageOpaque nopaque;
	IndexTuple	itup;
	Size		itemsz;
	OffsetNumber ooffnum;
	OffsetNumber noffnum;
	OffsetNumber omaxoffnum;
	Page		opage;
	Page		npage;
	TupleDesc	itupdesc = RelationGetDescr(rel);

	/*
	 * It should be okay to simultaneously write-lock pages from each
	 * bucket, since no one else can be trying to acquire buffer lock
	 * on pages of either bucket.
	 */
	oblkno = start_oblkno;
	nblkno = start_nblkno;
	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
	nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
	opage = BufferGetPage(obuf);
	npage = BufferGetPage(nbuf);

	_hash_checkpage(rel, opage, LH_BUCKET_PAGE);
	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

	/* initialize the new bucket's primary page */
	_hash_pageinit(npage, BufferGetPageSize(nbuf));
	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
	nopaque->hasho_prevblkno = InvalidBlockNumber;
	nopaque->hasho_nextblkno = InvalidBlockNumber;
	nopaque->hasho_bucket = nbucket;
	nopaque->hasho_flag = LH_BUCKET_PAGE;
	nopaque->hasho_filler = HASHO_FILL;

	/*
	 * Partition the tuples in the old bucket between the old bucket and the
	 * new bucket, advancing along the old bucket's overflow bucket chain
	 * and adding overflow pages to the new bucket as needed.
	 */
	ooffnum = FirstOffsetNumber;
	omaxoffnum = PageGetMaxOffsetNumber(opage);
	for (;;)
	{
		/*
		 * at each iteration through this loop, each of these variables
		 * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
		 */

		/* check if we're at the end of the page */
		if (ooffnum > omaxoffnum)
		{
			/* at end of page, but check for an(other) overflow page */
			oblkno = oopaque->hasho_nextblkno;
			if (!BlockNumberIsValid(oblkno))
				break;
			/*
			 * we ran out of tuples on this particular page, but we
			 * have more overflow pages; advance to next page.
			 */
			_hash_wrtbuf(rel, obuf);

			obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
			opage = BufferGetPage(obuf);
			_hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
			oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
			ooffnum = FirstOffsetNumber;
			omaxoffnum = PageGetMaxOffsetNumber(opage);
			continue;
		}

		/*
		 * Re-hash the tuple to determine which bucket it now belongs in.
		 *
		 * It is annoying to call the hash function while holding locks,
		 * but releasing and relocking the page for each tuple is unappealing
		 * too.
		 */
		hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
		itup = &(hitem->hash_itup);
		datum = index_getattr(itup, 1, itupdesc, &null);
		Assert(!null);

		bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
									  maxbucket, highmask, lowmask);

		if (bucket == nbucket)
		{
			/*
			 * insert the tuple into the new bucket.  if it doesn't fit on
			 * the current page in the new bucket, we must allocate a new
			 * overflow page and place the tuple on that page instead.
			 */
			itemsz = IndexTupleDSize(hitem->hash_itup)
				+ (sizeof(HashItemData) - sizeof(IndexTupleData));

			itemsz = MAXALIGN(itemsz);

			if (PageGetFreeSpace(npage) < itemsz)
			{
				/* write out nbuf and drop lock, but keep pin */
				_hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
				/* chain to a new overflow page */
				nbuf = _hash_addovflpage(rel, metabuf, nbuf);
				npage = BufferGetPage(nbuf);
				_hash_checkpage(rel, npage, LH_OVERFLOW_PAGE);
				/* we don't need nopaque within the loop */
			}

			noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
			if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));

			/*
			 * now delete the tuple from the old bucket.  after this
			 * section of code, 'ooffnum' will actually point to the
			 * ItemId to which we would point if we had advanced it before
			 * the deletion (PageIndexTupleDelete repacks the ItemId
			 * array).	this also means that 'omaxoffnum' is exactly one
			 * less than it used to be, so we really can just decrement it
			 * instead of calling PageGetMaxOffsetNumber.
			 */
			PageIndexTupleDelete(opage, ooffnum);
			omaxoffnum = OffsetNumberPrev(omaxoffnum);
		}
		else
		{
			/*
			 * the tuple stays on this page.  we didn't move anything, so
			 * we didn't delete anything and therefore we don't have to
			 * change 'omaxoffnum'.
			 */
			Assert(bucket == obucket);
			ooffnum = OffsetNumberNext(ooffnum);
		}
	}

	/*
	 * We're at the end of the old bucket chain, so we're done partitioning
	 * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
	 * tuples remaining in the old bucket (including the overflow pages) are
	 * packed as tightly as possible.  The new bucket is already tight.
	 */
	_hash_wrtbuf(rel, obuf);
	_hash_wrtbuf(rel, nbuf);

	_hash_squeezebucket(rel, obucket, start_oblkno);
}
Example #18
/*
 * RelationGetBufferForTuple
 *
 *	Returns pinned and exclusive-locked buffer of a page in given relation
 *	with free space >= given len.
 *
 *	If otherBuffer is not InvalidBuffer, then it references a previously
 *	pinned buffer of another page in the same relation; on return, this
 *	buffer will also be exclusive-locked.  (This case is used by heap_update;
 *	the otherBuffer contains the tuple being updated.)
 *
 *	The reason for passing otherBuffer is that if two backends are doing
 *	concurrent heap_update operations, a deadlock could occur if they try
 *	to lock the same two buffers in opposite orders.  To ensure that this
 *	can't happen, we impose the rule that buffers of a relation must be
 *	locked in increasing page number order.  This is most conveniently done
 *	by having RelationGetBufferForTuple lock them both, with suitable care
 *	for ordering.
 *
 *	NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
 *	same buffer we select for insertion of the new tuple (this could only
 *	happen if space is freed in that page after heap_update finds there's not
 *	enough there).	In that case, the page will be pinned and locked only once.
 *
 *	If use_fsm is true (the normal case), we use FSM to help us find free
 *	space.	If use_fsm is false, we always append a new empty page to the
 *	end of the relation if the tuple won't fit on the current target page.
 *	This can save some cycles when we know the relation is new and doesn't
 *	contain useful amounts of free space.
 *
 *	The use_fsm = false case is also useful for non-WAL-logged additions to a
 *	relation, if the caller holds exclusive lock and is careful to invalidate
 *	relation->rd_targblock before the first insertion --- that ensures that
 *	all insertions will occur into newly added pages and not be intermixed
 *	with tuples from other transactions.  That way, a crash can't risk losing
 *	any committed data of other transactions.  (See heap_insert's comments
 *	for additional constraints needed for safe usage of this behavior.)
 *
 *	We always try to avoid filling existing pages further than the fillfactor.
 *	This is OK since this routine is not consulted when updating a tuple and
 *	keeping it on the same page, which is the scenario fillfactor is meant
 *	to reserve space for.
 *
 *	ereport(ERROR) is allowed here, so this routine *must* be called
 *	before any (unlogged) changes are made in buffer pool.
 */
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
						  Buffer otherBuffer, bool use_fsm)
{
	Buffer		buffer = InvalidBuffer;
	Page		pageHeader;
	Size		pageFreeSpace,
				saveFreeSpace;
	BlockNumber targetBlock,
				otherBlock;
	bool		needLock;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	len = MAXALIGN(len);		/* be conservative */

	/*
	 * If we're gonna fail for oversize tuple, do it right away
	 */
	if (len > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %lu, maximum size %lu",
						(unsigned long) len,
						(unsigned long) MaxHeapTupleSize)));

	/* Compute desired extra freespace due to fillfactor option */
	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
												   HEAP_DEFAULT_FILLFACTOR);

	if (otherBuffer != InvalidBuffer)
		otherBlock = BufferGetBlockNumber(otherBuffer);
	else
		otherBlock = InvalidBlockNumber;		/* just to keep compiler quiet */

	/*
	 * We first try to put the tuple on the same page we last inserted a tuple
	 * on, as cached in the relcache entry.  If that doesn't work, we ask the
	 * shared Free Space Map to locate a suitable page.  Since the FSM's info
	 * might be out of date, we have to be prepared to loop around and retry
	 * multiple times.	(To insure this isn't an infinite loop, we must update
	 * the FSM with the correct amount of free space on each page that proves
	 * not to be suitable.)  If the FSM has no record of a page with enough
	 * free space, we give up and extend the relation.
	 *
	 * When use_fsm is false, we either put the tuple onto the existing target
	 * page or extend the relation.
	 */
	if (len + saveFreeSpace <= MaxHeapTupleSize)
		targetBlock = relation->rd_targblock;
	else
	{
		/* can't fit, don't screw up FSM request tracking by trying */
		targetBlock = InvalidBlockNumber;
		use_fsm = false;
	}

	if (targetBlock == InvalidBlockNumber && use_fsm)
	{
		/*
		 * We have no cached target page, so ask the FSM for an initial
		 * target.
		 */
		targetBlock = GetPageWithFreeSpace(&relation->rd_node,
										   len + saveFreeSpace);

		/*
		 * If the FSM knows nothing of the rel, try the last page before we
		 * give up and extend.	This avoids one-tuple-per-page syndrome during
		 * bootstrapping or in a recently-started system.
		 */
		if (targetBlock == InvalidBlockNumber)
		{
			BlockNumber nblocks = RelationGetNumberOfBlocks(relation);

			if (nblocks > 0)
				targetBlock = nblocks - 1;
		}
	}

	while (targetBlock != InvalidBlockNumber)
	{
		/*
		 * Read and exclusive-lock the target block, as well as the other
		 * block if one was given, taking suitable care with lock ordering and
		 * the possibility they are the same block.
		 */
		if (otherBuffer == InvalidBuffer)
		{
			/* easy case */
			buffer = ReadBuffer(relation, targetBlock);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock == targetBlock)
		{
			/* also easy case */
			buffer = otherBuffer;
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock < targetBlock)
		{
			/* lock other buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else
		{
			/* lock target buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		}

		/*
		 * Now we can check to see if there's enough free space here. If so,
		 * we're done.
		 */
		pageHeader = (Page) BufferGetPage(buffer);
		pageFreeSpace = PageGetFreeSpace(pageHeader);
		if (len + saveFreeSpace <= pageFreeSpace)
		{
			/* use this page as future insert target, too */
			relation->rd_targblock = targetBlock;
			return buffer;
		}

		/*
		 * Not enough space, so we must give up our page locks and pin (if
		 * any) and prepare to look elsewhere.	We don't care which order we
		 * unlock the two buffers in, so this can be slightly simpler than the
		 * code above.
		 */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (otherBuffer == InvalidBuffer)
			ReleaseBuffer(buffer);
		else if (otherBlock != targetBlock)
		{
			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
		}

		/* Without FSM, always fall out of the loop and extend */
		if (!use_fsm)
			break;

		/*
		 * Update FSM as to condition of this page, and ask for another page
		 * to try.
		 */
		targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node,
													targetBlock,
													pageFreeSpace,
													len + saveFreeSpace);
	}

	/*
	 * Have to extend the relation.
	 *
	 * We have to use a lock to ensure no one else is extending the rel at the
	 * same time, else we will both try to initialize the same new page.  We
	 * can skip locking for new or temp relations, however, since no one else
	 * could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(relation);

	if (needLock)
		LockRelationForExtension(relation, ExclusiveLock);

	/*
	 * XXX This does an lseek - rather expensive - but at the moment it is the
	 * only way to accurately determine how many blocks are in a relation.	Is
	 * it worth keeping an accurate file length in shared memory someplace,
	 * rather than relying on the kernel to do it for us?
	 */
	buffer = ReadBuffer(relation, P_NEW);

	/*
	 * We can be certain that locking the otherBuffer first is OK, since it
	 * must have a lower page number.
	 */
	if (otherBuffer != InvalidBuffer)
		LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);

	/*
	 * Now acquire lock on the new page.
	 */
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	/*
	 * Release the file-extension lock; it's now OK for someone else to extend
	 * the relation some more.	Note that we cannot release this lock before
	 * we have buffer lock on the new page, or we risk a race condition
	 * against vacuumlazy.c --- see comments therein.
	 */
	if (needLock)
		UnlockRelationForExtension(relation, ExclusiveLock);

	/*
	 * We need to initialize the empty new page.  Double-check that it really
	 * is empty (this should never happen, but if it does we don't want to
	 * risk wiping out valid data).
	 */
	pageHeader = (Page) BufferGetPage(buffer);

	if (!PageIsNew((PageHeader) pageHeader))
		elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
			 BufferGetBlockNumber(buffer),
			 RelationGetRelationName(relation));

	PageInit(pageHeader, BufferGetPageSize(buffer), 0);

	if (len > PageGetFreeSpace(pageHeader))
	{
		/* We should not get here given the test at the top */
		elog(PANIC, "tuple is too big: size %lu", (unsigned long) len);
	}

	/*
	 * Remember the new page as our target for future insertions.
	 *
	 * XXX should we enter the new page into the free space map immediately,
	 * or just keep it for this backend's exclusive use in the short run
	 * (until VACUUM sees it)?	Seems to depend on whether you expect the
	 * current backend to make more insertions or not, which is probably a
	 * good bet most of the time.  So for now, don't add it to FSM yet.
	 */
	relation->rd_targblock = BufferGetBlockNumber(buffer);

	return buffer;
}
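A minimal sketch of a heap_insert-style caller of RelationGetBufferForTuple(), under the same era's page API as the examples above (five-argument PageAddItem with LP_USED): get a buffer with room, append the tuple, mark the buffer dirty and release. WAL logging, tuple-header bookkeeping and error paths are omitted, and the function name is hypothetical.

#include "postgres.h"
#include "storage/bufmgr.h"

static void
simple_place_tuple_example(Relation relation, HeapTuple heaptup)
{
	Buffer		buffer;
	Page		page;
	OffsetNumber offnum;

	/* no otherBuffer; normal FSM usage */
	buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
									   InvalidBuffer, true);
	page = BufferGetPage(buffer);

	/* append at the end of the line-pointer array */
	offnum = OffsetNumberNext(PageGetMaxOffsetNumber(page));
	if (PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len,
					offnum, LP_USED) == InvalidOffsetNumber)
		elog(ERROR, "failed to add tuple to \"%s\"",
			 RelationGetRelationName(relation));

	MarkBufferDirty(buffer);
	UnlockReleaseBuffer(buffer);
}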
Example #19
/*
 *	_hash_squeezebucket(rel, bucket)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages. When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last bucket in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this guy clean it up.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 */
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno)
{
	Buffer		wbuf;
	Buffer		rbuf = 0;
	BlockNumber wblkno;
	BlockNumber rblkno;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;
	OffsetNumber woffnum;
	OffsetNumber roffnum;
	IndexTuple	itup;
	Size		itemsz;

	/*
	 * start squeezing into the base bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
	_hash_checkpage(rel, wbuf, LH_BUCKET_PAGE);
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		_hash_relbuf(rel, wbuf);
		return;
	}

	/*
	 * find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.
	 */
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		if (ropaque != wopaque)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
		_hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 */
	roffnum = FirstOffsetNumber;
	for (;;)
	{
		/* this test is needed in case page is empty on entry */
		if (roffnum <= PageGetMaxOffsetNumber(rpage))
		{
			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			itemsz = IndexTupleDSize(*itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item.  Exit if we reach the read page.
			 */
			while (PageGetFreeSpace(wpage) < itemsz)
			{
				Assert(!PageIsEmpty(wpage));

				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				_hash_wrtbuf(rel, wbuf);

				if (rblkno == wblkno)
				{
					/* wbuf is already released */
					_hash_wrtbuf(rel, rbuf);
					return;
				}

				wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
				_hash_checkpage(rel, wbuf, LH_OVERFLOW_PAGE);
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
			}

			/*
			 * we have found room so insert on the "write" page.
			 */
			woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
			if (PageAddItem(wpage, (Item) itup, itemsz, woffnum, LP_USED)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));

			/*
			 * delete the tuple from the "read" page. PageIndexTupleDelete
			 * repacks the ItemId array, so 'roffnum' will be "advanced" to
			 * the "next" ItemId.
			 */
			PageIndexTupleDelete(rpage, roffnum);
		}

		/*
		 * if the "read" page is now empty because of the deletion (or because
		 * it was empty when we got to it), free it.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  However, in that case we are done anyway, so we can
		 * simply drop the write lock before calling _hash_freeovflpage.
		 */
		if (PageIsEmpty(rpage))
		{
			rblkno = ropaque->hasho_prevblkno;
			Assert(BlockNumberIsValid(rblkno));

			/* are we freeing the page adjacent to wbuf? */
			if (rblkno == wblkno)
			{
				/* yes, so release wbuf lock first */
				_hash_wrtbuf(rel, wbuf);
				/* free this overflow page (releases rbuf) */
				_hash_freeovflpage(rel, rbuf);
				/* done */
				return;
			}

			/* free this overflow page, then get the previous one */
			_hash_freeovflpage(rel, rbuf);

			rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
			_hash_checkpage(rel, rbuf, LH_OVERFLOW_PAGE);
			rpage = BufferGetPage(rbuf);
			ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
			Assert(ropaque->hasho_bucket == bucket);

			roffnum = FirstOffsetNumber;
		}
	}

	/* NOTREACHED */
}
Example #20
static Datum
pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
{
	Datum		result;
	BlockNumber nblocks;
	BlockNumber blkno;
	BTIndexStat indexStat;
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);

	if (!IS_INDEX(rel) || !IS_BTREE(rel))
		elog(ERROR, "relation \"%s\" is not a btree index",
			 RelationGetRelationName(rel));

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));

	/*
	 * Read metapage
	 */
	{
		Buffer		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, 0, RBM_NORMAL, bstrategy);
		Page		page = BufferGetPage(buffer);
		BTMetaPageData *metad = BTPageGetMeta(page);

		indexStat.version = metad->btm_version;
		indexStat.level = metad->btm_level;
		indexStat.root_blkno = metad->btm_root;

		ReleaseBuffer(buffer);
	}

	/* -- init counters -- */
	indexStat.internal_pages = 0;
	indexStat.leaf_pages = 0;
	indexStat.empty_pages = 0;
	indexStat.deleted_pages = 0;

	indexStat.max_avail = 0;
	indexStat.free_space = 0;

	indexStat.fragments = 0;

	/*
	 * Scan all blocks except the metapage
	 */
	nblocks = RelationGetNumberOfBlocks(rel);

	for (blkno = 1; blkno < nblocks; blkno++)
	{
		Buffer		buffer;
		Page		page;
		BTPageOpaque opaque;

		CHECK_FOR_INTERRUPTS();

		/* Read and lock buffer */
		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* Determine page type, and update totals */

		if (P_ISDELETED(opaque))
			indexStat.deleted_pages++;
		else if (P_IGNORE(opaque))
			indexStat.empty_pages++;	/* this is the "half dead" state */
		else if (P_ISLEAF(opaque))
		{
			int			max_avail;

			max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData);
			indexStat.max_avail += max_avail;
			indexStat.free_space += PageGetFreeSpace(page);

			indexStat.leaf_pages++;

			/*
			 * If the next leaf is on an earlier block, it means a
			 * fragmentation.
			 */
			if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
				indexStat.fragments++;
		}
		else
			indexStat.internal_pages++;

		/* Unlock and release buffer */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buffer);
	}

	relation_close(rel, AccessShareLock);

	/*----------------------------
	 * Build a result tuple
	 *----------------------------
	 */
	{
		TupleDesc	tupleDesc;
		int			j;
		char	   *values[10];
		HeapTuple	tuple;

		/* Build a tuple descriptor for our result type */
		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		j = 0;
		values[j++] = psprintf("%d", indexStat.version);
		values[j++] = psprintf("%d", indexStat.level);
		values[j++] = psprintf(INT64_FORMAT,
							   (1 +		/* include the metapage in index_size */
								indexStat.leaf_pages +
								indexStat.internal_pages +
								indexStat.deleted_pages +
								indexStat.empty_pages) * BLCKSZ);
		values[j++] = psprintf("%u", indexStat.root_blkno);
		values[j++] = psprintf(INT64_FORMAT, indexStat.internal_pages);
		values[j++] = psprintf(INT64_FORMAT, indexStat.leaf_pages);
		values[j++] = psprintf(INT64_FORMAT, indexStat.empty_pages);
		values[j++] = psprintf(INT64_FORMAT, indexStat.deleted_pages);
		if (indexStat.max_avail > 0)
			values[j++] = psprintf("%.2f",
								   100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0);
		else
			values[j++] = pstrdup("NaN");
		if (indexStat.leaf_pages > 0)
			values[j++] = psprintf("%.2f",
								   (double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0);
		else
			values[j++] = pstrdup("NaN");

		tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
									   values);

		result = HeapTupleGetDatum(tuple);
	}

	return result;
}
Example #21
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    BlockNumber oblkno;
    BlockNumber nblkno;
    Buffer		obuf;
    Buffer		nbuf;
    Page		opage;
    Page		npage;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;

    /*
     * It should be okay to simultaneously write-lock pages from each bucket,
     * since no one else can be trying to acquire buffer lock on pages of
     * either bucket.
     */
    oblkno = start_oblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE);
    opage = BufferGetPage(obuf);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    nblkno = start_nblkno;
    nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM);
    npage = BufferGetPage(nbuf);

    /* initialize the new bucket's primary page */
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_page_id = HASHO_PAGE_ID;

    /*
     * Partition the tuples in the old bucket between the old bucket and the
     * new bucket, advancing along the old bucket's overflow bucket chain and
     * adding overflow pages to the new bucket as needed.  Outer loop iterates
     * once per page in old bucket.
     */
    for (;;)
    {
        OffsetNumber ooffnum;
        OffsetNumber omaxoffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        int			ndeletable = 0;

        /* Scan each tuple in old page */
        omaxoffnum = PageGetMaxOffsetNumber(opage);
        for (ooffnum = FirstOffsetNumber;
                ooffnum <= omaxoffnum;
                ooffnum = OffsetNumberNext(ooffnum))
        {
            IndexTuple	itup;
            Size		itemsz;
            Bucket		bucket;

            /*
             * Fetch the item's hash key (conveniently stored in the item) and
             * determine which bucket it now belongs in.
             */
            itup = (IndexTuple) PageGetItem(opage,
                                            PageGetItemId(opage, ooffnum));
            bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                          maxbucket, highmask, lowmask);

            if (bucket == nbucket)
            {
                /*
                 * insert the tuple into the new bucket.  if it doesn't fit on
                 * the current page in the new bucket, we must allocate a new
                 * overflow page and place the tuple on that page instead.
                 */
                itemsz = IndexTupleDSize(*itup);
                itemsz = MAXALIGN(itemsz);

                if (PageGetFreeSpace(npage) < itemsz)
                {
                    /* write out nbuf and drop lock, but keep pin */
                    _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                    /* chain to a new overflow page */
                    nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                    npage = BufferGetPage(nbuf);
                    /* we don't need nblkno or nopaque within the loop */
                }

                /*
                 * Insert tuple on new page, using _hash_pgaddtup to ensure
                 * correct ordering by hashkey.  This is a tad inefficient
                 * since we may have to shuffle itempointers repeatedly.
                 * Possible future improvement: accumulate all the items for
                 * the new page and qsort them before insertion.
                 */
                (void) _hash_pgaddtup(rel, nbuf, itemsz, itup);

                /*
                 * Mark tuple for deletion from old page.
                 */
                deletable[ndeletable++] = ooffnum;
            }
            else
            {
                /*
                 * the tuple stays on this page, so nothing to do.
                 */
                Assert(bucket == obucket);
            }
        }

        oblkno = oopaque->hasho_nextblkno;

        /*
         * Done scanning this old page.  If we moved any tuples, delete them
         * from the old page.
         */
        if (ndeletable > 0)
        {
            PageIndexMultiDelete(opage, deletable, ndeletable);
            _hash_wrtbuf(rel, obuf);
        }
        else
            _hash_relbuf(rel, obuf);

        /* Exit loop if no more overflow pages in old bucket */
        if (!BlockNumberIsValid(oblkno))
            break;

        /* Else, advance to next old page */
        obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
        opage = BufferGetPage(obuf);
        oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.	Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages) are
     * packed as tightly as possible.  The new bucket is already tight.
     */
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}
Example #22
/* ------------------------------------------------------
 * pgstatindex()
 *
 * Usage: SELECT * FROM pgstatindex('t1_pkey');
 * ------------------------------------------------------
 */
Datum
pgstatindex(PG_FUNCTION_ARGS)
{
	text	   *relname = PG_GETARG_TEXT_P(0);
	Relation	rel;
	RangeVar   *relrv;
	Datum		result;
	BlockNumber nblocks;
	BlockNumber blkno;
	BTIndexStat indexStat;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to use pgstattuple functions"))));

	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
	rel = relation_openrv(relrv, AccessShareLock);

	if (!IS_INDEX(rel) || !IS_BTREE(rel))
		elog(ERROR, "relation \"%s\" is not a btree index",
			 RelationGetRelationName(rel));

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));

	/*
	 * Read metapage
	 */
	{
		Buffer		buffer = ReadBuffer(rel, 0);
		Page		page = BufferGetPage(buffer);
		BTMetaPageData *metad = BTPageGetMeta(page);

		indexStat.version = metad->btm_version;
		indexStat.level = metad->btm_level;
		indexStat.root_blkno = metad->btm_root;

		ReleaseBuffer(buffer);
	}

	/* -- init counters -- */
	indexStat.root_pages = 0;
	indexStat.internal_pages = 0;
	indexStat.leaf_pages = 0;
	indexStat.empty_pages = 0;
	indexStat.deleted_pages = 0;

	indexStat.max_avail = 0;
	indexStat.free_space = 0;

	indexStat.fragments = 0;

	/*
	 * Scan all blocks except the metapage
	 */
	nblocks = RelationGetNumberOfBlocks(rel);

	for (blkno = 1; blkno < nblocks; blkno++)
	{
		Buffer		buffer;
		Page		page;
		BTPageOpaque opaque;

		/* Read and lock buffer */
		buffer = ReadBuffer(rel, blkno);
		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* Determine page type, and update totals */

		if (P_ISLEAF(opaque))
		{
			int			max_avail;

			max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData);
			indexStat.max_avail += max_avail;
			indexStat.free_space += PageGetFreeSpace(page);

			indexStat.leaf_pages++;

			/*
			 * If the next leaf is on an earlier block, it means a
			 * fragmentation.
			 */
			if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
				indexStat.fragments++;
		}
		else if (P_ISDELETED(opaque))
			indexStat.deleted_pages++;
		else if (P_IGNORE(opaque))
			indexStat.empty_pages++;
		else if (P_ISROOT(opaque))
			indexStat.root_pages++;
		else
			indexStat.internal_pages++;

		/* Unlock and release buffer */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buffer);
	}

	relation_close(rel, AccessShareLock);

	/*----------------------------
	 * Build a result tuple
	 *----------------------------
	 */
	{
		TupleDesc	tupleDesc;
		int			j;
		char	   *values[10];
		HeapTuple	tuple;

		/* Build a tuple descriptor for our result type */
		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		j = 0;
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", indexStat.version);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", indexStat.level);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT,
				 (indexStat.root_pages +
				  indexStat.leaf_pages +
				  indexStat.internal_pages +
				  indexStat.deleted_pages +
				  indexStat.empty_pages) * BLCKSZ);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%u", indexStat.root_blkno);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.internal_pages);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.leaf_pages);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.empty_pages);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.deleted_pages);
		values[j] = palloc(32);
		if (indexStat.max_avail > 0)
			snprintf(values[j++], 32, "%.2f",
					 100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0);
		else
			snprintf(values[j++], 32, "NaN");
		values[j] = palloc(32);
		if (indexStat.leaf_pages > 0)
			snprintf(values[j++], 32, "%.2f",
					 (double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0);
		else
			snprintf(values[j++], 32, "NaN");

		tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
									   values);

		result = HeapTupleGetDatum(tuple);
	}

	PG_RETURN_DATUM(result);
}
Example #23
/*
 * pgstattuple_real
 *
 * The real work occurs here
 */
static Datum
pgstattuple_real(Relation rel)
{
	HeapScanDesc scan;
	HeapTuple	tuple;
	BlockNumber nblocks;
	BlockNumber block = 0;		/* next block to count free space in */
	BlockNumber tupblock;
	Buffer		buffer;
	uint64		table_len;
	uint64		tuple_len = 0;
	uint64		dead_tuple_len = 0;
	uint64		tuple_count = 0;
	uint64		dead_tuple_count = 0;
	double		tuple_percent;
	double		dead_tuple_percent;
	uint64		free_space = 0; /* free/reusable space in bytes */
	double		free_percent;	/* free/reusable space in % */
	TupleDesc	tupdesc;
	TupleTableSlot *slot;
	AttInMetadata *attinmeta;
	char	  **values;
	int			i;
	Datum		result;

	/*
	 * Build a tuple description for a pgstattuple_type tuple
	 */
	tupdesc = RelationNameGetTupleDesc(DUMMY_TUPLE);

	/* allocate a slot for a tuple with this tupdesc */
	slot = TupleDescGetSlot(tupdesc);

	/*
	 * Generate attribute metadata needed later to produce tuples from raw
	 * C strings
	 */
	attinmeta = TupleDescGetAttInMetadata(tupdesc);

	nblocks = RelationGetNumberOfBlocks(rel);
	scan = heap_beginscan(rel, SnapshotAny, 0, NULL);

	/* scan the relation */
	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		uint16		sv_infomask;

		sv_infomask = tuple->t_data->t_infomask;
		if (HeapTupleSatisfiesNow(tuple->t_data))
		{
			tuple_len += tuple->t_len;
			tuple_count++;
		}
		else
		{
			dead_tuple_len += tuple->t_len;
			dead_tuple_count++;
		}
		if (sv_infomask != tuple->t_data->t_infomask)
			SetBufferCommitInfoNeedsSave(scan->rs_cbuf);

		/*
		 * To avoid physically reading the table twice, try to do the
		 * free-space scan in parallel with the heap scan.	However,
		 * heap_getnext may find no tuples on a given page, so we cannot
		 * simply examine the pages returned by the heap scan.
		 */
		tupblock = BlockIdGetBlockNumber(&tuple->t_self.ip_blkid);

		while (block <= tupblock)
		{
			buffer = ReadBuffer(rel, block);
			free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
			ReleaseBuffer(buffer);
			block++;
		}
	}
	heap_endscan(scan);

	while (block < nblocks)
	{
		buffer = ReadBuffer(rel, block);
		free_space += PageGetFreeSpace((Page) BufferGetPage(buffer));
		ReleaseBuffer(buffer);
		block++;
	}

	heap_close(rel, AccessShareLock);

	table_len = (uint64) nblocks * BLCKSZ;

	if (nblocks == 0)
	{
		tuple_percent = 0.0;
		dead_tuple_percent = 0.0;
		free_percent = 0.0;
	}
	else
	{
		tuple_percent = (double) tuple_len * 100.0 / table_len;
		dead_tuple_percent = (double) dead_tuple_len * 100.0 / table_len;
		free_percent = (double) free_space * 100.0 / table_len;
	}

	/*
	 * Prepare a values array for storage in our slot. This should be an
	 * array of C strings which will be processed later by the appropriate
	 * "in" functions.
	 */
	values = (char **) palloc(NCOLUMNS * sizeof(char *));
	for (i = 0; i < NCOLUMNS; i++)
		values[i] = (char *) palloc(NCHARS * sizeof(char));
	i = 0;
	snprintf(values[i++], NCHARS, INT64_FORMAT, table_len);
	snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_count);
	snprintf(values[i++], NCHARS, INT64_FORMAT, tuple_len);
	snprintf(values[i++], NCHARS, "%.2f", tuple_percent);
	snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_count);
	snprintf(values[i++], NCHARS, INT64_FORMAT, dead_tuple_len);
	snprintf(values[i++], NCHARS, "%.2f", dead_tuple_percent);
	snprintf(values[i++], NCHARS, INT64_FORMAT, free_space);
	snprintf(values[i++], NCHARS, "%.2f", free_percent);

	/* build a tuple */
	tuple = BuildTupleFromCStrings(attinmeta, values);

	/* make the tuple into a datum */
	result = TupleGetDatum(slot, tuple);

	/* Clean up */
	for (i = 0; i < NCOLUMNS; i++)
		pfree(values[i]);
	pfree(values);

	return (result);
}
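The percentages computed after the scan are byte ratios against the full table size (nblocks * BLCKSZ). A small standalone sketch of just that arithmetic, with made-up counter values in place of the real heap scan:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Illustrative counters; in pgstattuple_real they come from the heap scan. */
	uint64_t	nblocks = 128;
	uint64_t	blcksz = 8192;
	uint64_t	tuple_len = 600000;
	uint64_t	dead_tuple_len = 40000;
	uint64_t	free_space = 250000;

	uint64_t	table_len = nblocks * blcksz;

	double		tuple_percent = table_len ? (double) tuple_len * 100.0 / table_len : 0.0;
	double		dead_tuple_percent = table_len ? (double) dead_tuple_len * 100.0 / table_len : 0.0;
	double		free_percent = table_len ? (double) free_space * 100.0 / table_len : 0.0;

	printf("table_len=%llu tuple%%=%.2f dead%%=%.2f free%%=%.2f\n",
		   (unsigned long long) table_len,
		   tuple_percent, dead_tuple_percent, free_percent);
	return 0;
}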
Example #24
/*
 *	_hash_doinsert() -- Handle insertion of a single index tuple.
 *
 *		This routine is called by the public interface routines, hashbuild
 *		and hashinsert.  By here, itup is completely filled in.
 */
void
_hash_doinsert(Relation rel, IndexTuple itup)
{
	Buffer		buf;
	Buffer		metabuf;
	HashMetaPage metap;
	BlockNumber blkno;
	Page		page;
	HashPageOpaque pageopaque;
	Size		itemsz;
	bool		do_expand;
	uint32		hashkey;
	Bucket		bucket;

	/*
	 * Get the hash key for the item (it's stored in the index tuple itself).
	 */
	hashkey = _hash_get_indextuple_hashkey(itup);

	/* compute item size too */
	itemsz = IndexTupleDSize(*itup);
	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
								 * need to be consistent */

	/*
	 * Acquire shared split lock so we can compute the target bucket safely
	 * (see README).
	 */
	_hash_getlock(rel, 0, HASH_SHARE);

	/* Read the metapage */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	/*
	 * Check whether the item can fit on a hash page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.)  Note that at this point,
	 * itemsz doesn't include the ItemId.
	 *
	 * XXX this is useless code if we are only storing hash keys.
	 */
	if (itemsz > HashMaxItemSize((Page) metap))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %lu exceeds hash maximum %lu",
						(unsigned long) itemsz,
						(unsigned long) HashMaxItemSize((Page) metap)),
			errhint("Values larger than a buffer page cannot be indexed.")));

	/*
	 * Compute the target bucket number, and convert to block number.
	 */
	bucket = _hash_hashkey2bucket(hashkey,
								  metap->hashm_maxbucket,
								  metap->hashm_highmask,
								  metap->hashm_lowmask);

	blkno = BUCKET_TO_BLKNO(metap, bucket);

	/* release lock on metapage, but keep pin since we'll need it again */
	_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

	/*
	 * Acquire share lock on target bucket; then we can release split lock.
	 */
	_hash_getlock(rel, blkno, HASH_SHARE);

	_hash_droplock(rel, 0, HASH_SHARE);

	/* Fetch the primary bucket page for the bucket */
	buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
	page = BufferGetPage(buf);
	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
	Assert(pageopaque->hasho_bucket == bucket);

	/* Do the insertion */
	while (PageGetFreeSpace(page) < itemsz)
	{
		/*
		 * no space on this page; check for an overflow page
		 */
		BlockNumber nextblkno = pageopaque->hasho_nextblkno;

		if (BlockNumberIsValid(nextblkno))
		{
			/*
			 * ovfl page exists; go get it.  if it doesn't have room, we'll
			 * find out next pass through the loop test above.
			 */
			_hash_relbuf(rel, buf);
			buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
			page = BufferGetPage(buf);
		}
		else
		{
			/*
			 * we're at the end of the bucket chain and we haven't found a
			 * page with enough room.  allocate a new overflow page.
			 */

			/* release our write lock without modifying buffer */
			_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);

			/* chain to a new overflow page */
			buf = _hash_addovflpage(rel, metabuf, buf);
			page = BufferGetPage(buf);

			/* should fit now, given test above */
			Assert(PageGetFreeSpace(page) >= itemsz);
		}
		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
		Assert(pageopaque->hasho_flag == LH_OVERFLOW_PAGE);
		Assert(pageopaque->hasho_bucket == bucket);
	}

	/* found page with enough space, so add the item here */
	(void) _hash_pgaddtup(rel, buf, itemsz, itup);

	/* write and release the modified page */
	_hash_wrtbuf(rel, buf);

	/* We can drop the bucket lock now */
	_hash_droplock(rel, blkno, HASH_SHARE);

	/*
	 * Write-lock the metapage so we can increment the tuple count. After
	 * incrementing it, check to see if it's time for a split.
	 */
	_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

	metap->hashm_ntuples += 1;

	/* Make sure this stays in sync with _hash_expandtable() */
	do_expand = metap->hashm_ntuples >
		(double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1);

	/* Write out the metapage and drop lock, but keep pin */
	_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

	/* Attempt to split if a split is needed */
	if (do_expand)
		_hash_expandtable(rel, metabuf);

	/* Finally drop our pin on the metapage */
	_hash_dropbuf(rel, metabuf);
}
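The insertion loop walks the bucket's overflow chain until some page has room, and appends a new overflow page only after falling off the end of the chain. The toy model below reproduces that control flow over an array of free-space counters; everything here (toy_doinsert, freespace, MAX_PAGES) is illustrative and not part of the hash access method.

#include <stdio.h>
#include <stdlib.h>

#define MAX_PAGES 16

/* Toy bucket chain: freespace[i] is the free bytes left on chain page i. */
static size_t freespace[MAX_PAGES] = {40, 12, 96};
static int	npages = 3;

/*
 * Return the chain position that received the item, extending the chain
 * with a fresh "overflow page" when no existing page has room.
 */
static int
toy_doinsert(size_t itemsz, size_t pagesz)
{
	int			i = 0;

	while (i < npages && freespace[i] < itemsz)
		i++;					/* conceptually: follow hasho_nextblkno */

	if (i == npages)			/* end of chain: chain a new overflow page */
	{
		if (npages == MAX_PAGES)
			abort();
		freespace[npages++] = pagesz;
	}

	freespace[i] -= itemsz;		/* the real code calls _hash_pgaddtup here */
	return i;
}

int
main(void)
{
	printf("inserted on chain page %d\n", toy_doinsert(64, 8192));
	printf("inserted on chain page %d\n", toy_doinsert(64, 8192));
	return 0;
}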
Example #25
Datum
spgstat(PG_FUNCTION_ARGS)
{
    text       *name = PG_GETARG_TEXT_P(0);
    char       *relname = text_to_cstring(name);
    RangeVar   	*relvar;
    Relation    index;
    List       	*relname_list;
    Oid			relOid;
    BlockNumber	blkno = SPGIST_HEAD_BLKNO;
    BlockNumber	totalPages = 0,
                innerPages = 0,
                emptyPages = 0;
    double		usedSpace = 0.0;
    char		res[1024];
    int			bufferSize = -1;
    int64		innerTuples = 0,
                leafTuples = 0;


    relname_list = stringToQualifiedNameList(relname);
    relvar = makeRangeVarFromNameList(relname_list);
    relOid = RangeVarGetRelid(relvar, false);
    index = index_open(relOid, AccessExclusiveLock);

    if ( index->rd_am == NULL )
        elog(ERROR, "Relation %s.%s is not an index",
             get_namespace_name(RelationGetNamespace(index)),
             RelationGetRelationName(index) );
    totalPages = RelationGetNumberOfBlocks(index);

    for (blkno = SPGIST_HEAD_BLKNO; blkno < totalPages; blkno++)
    {
        Buffer	buffer;
        Page	page;

        buffer = ReadBuffer(index, blkno);
        LockBuffer(buffer, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buffer);

        if (SpGistPageIsLeaf(page))
        {
            leafTuples += SpGistPageGetMaxOffset(page);
        }
        else
        {
            innerPages++;
            innerTuples += SpGistPageGetMaxOffset(page);
        }

        if (bufferSize < 0)
            bufferSize = BufferGetPageSize(buffer) - MAXALIGN(sizeof(SpGistPageOpaqueData)) -
                         SizeOfPageHeaderData;

        usedSpace += bufferSize - (PageGetFreeSpace(page) + sizeof(ItemIdData));

        if (PageGetFreeSpace(page) + sizeof(ItemIdData) == bufferSize)
            emptyPages++;

        UnlockReleaseBuffer(buffer);
    }

    index_close(index, AccessExclusiveLock);

    totalPages--;	/* exclude the metapage */

    snprintf(res, sizeof(res),
             "totalPages:  %u\n"
             "innerPages:  %u\n"
             "leafPages:   %u\n"
             "emptyPages:  %u\n"
             "usedSpace:   %.2f kbytes\n"
             "freeSpace:   %.2f kbytes\n"
             "fillRatio:   %.2f%%\n"
             "leafTuples:  %lld\n"
             "innerTuples: %lld",
             totalPages, innerPages, totalPages - innerPages, emptyPages,
             usedSpace / 1024.0,
             (((double) bufferSize) * ((double) totalPages) - usedSpace) / 1024.0,
             100.0 * (usedSpace / (((double) bufferSize) * ((double) totalPages))),
             (long long) leafTuples, (long long) innerTuples
            );

    PG_RETURN_TEXT_P(CStringGetTextDatum(res));
}
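The report string is derived from three quantities: bytes used, bytes free, and their ratio over all non-meta pages. A minimal sketch of that arithmetic with made-up numbers; bufferSize stands for the usable bytes per page after the header and special space, as computed in the loop above.

#include <stdio.h>

int
main(void)
{
	double		bufferSize = 8192.0 - 24.0 - 16.0;	/* assumed usable bytes per page */
	double		totalPages = 100.0;		/* pages excluding the metapage */
	double		usedSpace = 512345.0;	/* bytes occupied by tuples */

	double		capacity = bufferSize * totalPages;
	double		freeSpace = capacity - usedSpace;
	double		fillRatio = 100.0 * usedSpace / capacity;

	printf("usedSpace: %.2f kbytes\n", usedSpace / 1024.0);
	printf("freeSpace: %.2f kbytes\n", freeSpace / 1024.0);
	printf("fillRatio: %.2f%%\n", fillRatio);
	return 0;
}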
Example #26
/*
 *	lazy_scan_heap() -- scan an open heap relation
 *
 *		This routine sets commit status bits, builds lists of dead tuples
 *		and pages with free space, and calculates statistics on the number
 *		of live tuples in the heap.  When done, or when we run low on space
 *		for dead-tuple TIDs, invoke vacuuming of indexes and heap.
 *
 *		If there are no indexes then we just vacuum each dirty page as we
 *		process it, since there's no point in gathering many tuples.
 */
static void
lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
			   Relation *Irel, int nindexes, List *updated_stats, List *all_extra_oids)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber nblocks,
				blkno;
	HeapTupleData tuple;
	char	   *relname;
	BlockNumber empty_pages,
				vacuumed_pages;
	double		num_tuples,
				tups_vacuumed,
				nkeep,
				nunused;
	IndexBulkDeleteResult **indstats;
	int			i;
	int			reindex_count = 1;
	PGRUsage	ru0;

	/* Fetch gp_persistent_relation_node information that will be added to XLOG record. */
	RelationFetchGpRelationNodeForXLog(onerel);

	pg_rusage_init(&ru0);

	relname = RelationGetRelationName(onerel);
	ereport(elevel,
			(errmsg("vacuuming \"%s.%s\"",
					get_namespace_name(RelationGetNamespace(onerel)),
					relname)));

	empty_pages = vacuumed_pages = 0;
	num_tuples = tups_vacuumed = nkeep = nunused = 0;

	indstats = (IndexBulkDeleteResult **)
		palloc0(nindexes * sizeof(IndexBulkDeleteResult *));

	nblocks = RelationGetNumberOfBlocks(onerel);
	vacrelstats->rel_pages = nblocks;
	vacrelstats->nonempty_pages = 0;

	lazy_space_alloc(vacrelstats, nblocks);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		bool		tupgone,
					hastup;
		int			prev_dead_count;
		OffsetNumber frozen[MaxOffsetNumber];
		int			nfrozen;

		vacuum_delay_point();

		/*
		 * If we are close to overrunning the available space for dead-tuple
		 * TIDs, pause and do a cycle of vacuuming before we tackle this page.
		 */
		if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Remove index entries */
			for (i = 0; i < nindexes; i++)
			{
				List *extra_oids = get_oids_for_bitmap(all_extra_oids, Irel[i],
													   onerel, reindex_count);

				lazy_vacuum_index(Irel[i],
								  &indstats[i],
								  vacrelstats,
								  extra_oids);
				list_free(extra_oids);
			}
			reindex_count++;

			/* Remove tuples from heap */
			lazy_vacuum_heap(onerel, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
		}

		/* -------- MirroredLock ---------- */
		MIRROREDLOCK_BUFMGR_LOCK;

		buf = ReadBuffer(onerel, blkno);

		/* Initially, we only need shared access to the buffer */
		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		if (PageIsNew(page))
		{
			/*
			 * An all-zeroes page could be left over if a backend extends the
			 * relation but crashes before initializing the page. Reclaim such
			 * pages for use.
			 *
			 * We have to be careful here because we could be looking at a
			 * page that someone has just added to the relation and not yet
			 * been able to initialize (see RelationGetBufferForTuple). To
			 * protect against that, release the buffer lock, grab the
			 * relation extension lock momentarily, and re-lock the buffer. If
			 * the page is still uninitialized by then, it must be left over
			 * from a crashed backend, and we can initialize it.
			 *
			 * We don't really need the relation lock when this is a new or
			 * temp relation, but it's probably not worth the code space to
			 * check that, since this surely isn't a critical path.
			 *
			 * Note: the comparable code in vacuum.c need not worry because
			 * it's got exclusive lock on the whole relation.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			MIRROREDLOCK_BUFMGR_UNLOCK;
			/* -------- MirroredLock ---------- */

			LockRelationForExtension(onerel, ExclusiveLock);
			UnlockRelationForExtension(onerel, ExclusiveLock);

			/* -------- MirroredLock ---------- */
			MIRROREDLOCK_BUFMGR_LOCK;

			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);  /* LockBufferForCleanup(buf)? */
			if (PageIsNew(page))
			{
				ereport(WARNING,
				(errmsg("relation \"%s\" page %u is uninitialized --- fixing",
						relname, blkno)));
				PageInit(page, BufferGetPageSize(buf), 0);

				/* must record in xlog so that changetracking will know about this change */
				log_heap_newpage(onerel, page, blkno);

				empty_pages++;
				lazy_record_free_space(vacrelstats, blkno,
									   PageGetFreeSpace(page));
			}
			MarkBufferDirty(buf);
			UnlockReleaseBuffer(buf);

			MIRROREDLOCK_BUFMGR_UNLOCK;
			/* -------- MirroredLock ---------- */

			continue;
		}

		if (PageIsEmpty(page))
		{
			empty_pages++;
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetFreeSpace(page));
			UnlockReleaseBuffer(buf);

			MIRROREDLOCK_BUFMGR_UNLOCK;
			/* -------- MirroredLock ---------- */

			continue;
		}

		nfrozen = 0;
		hastup = false;
		prev_dead_count = vacrelstats->num_dead_tuples;
		maxoff = PageGetMaxOffsetNumber(page);
		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;

			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid))
			{
				nunused += 1;
				continue;
			}

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tupgone = false;

			switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf, false))
			{
				case HEAPTUPLE_DEAD:
					tupgone = true;		/* we can delete the tuple */
					break;
				case HEAPTUPLE_LIVE:
					/* Tuple is good --- but let's do some validity checks */
					if (onerel->rd_rel->relhasoids &&
						!OidIsValid(HeapTupleGetOid(&tuple)))
						elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
							 relname, blkno, offnum);
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must not remove it
					 * from relation.
					 */
					nkeep += 1;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					/* This is an expected case during concurrent vacuum */
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}

			if (tupgone)
			{
				lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
				tups_vacuumed += 1;
			}
			else
			{
				num_tuples += 1;
				hastup = true;

				/*
				 * Each non-removable tuple must be checked to see if it
				 * needs freezing.  If we already froze anything, then
				 * we've already switched the buffer lock to exclusive.
				 */
				if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
									  (nfrozen > 0) ? InvalidBuffer : buf))
					frozen[nfrozen++] = offnum;
			}
		}						/* scan along page */

		/*
		 * If we froze any tuples, mark the buffer dirty, and write a WAL
		 * record recording the changes.  We must log the changes to be
		 * crash-safe against future truncation of CLOG.
		 */
		if (nfrozen > 0)
		{
			MarkBufferDirty(buf);
			/* no XLOG for temp tables, though */
			if (!onerel->rd_istemp)
			{
				XLogRecPtr	recptr;

				recptr = log_heap_freeze(onerel, buf, FreezeLimit,
										 frozen, nfrozen);
				PageSetLSN(page, recptr);
				PageSetTLI(page, ThisTimeLineID);
			}
		}

		/*
		 * If there are no indexes then we can vacuum the page right now
		 * instead of doing a second scan.
		 */
		if (nindexes == 0 &&
			vacrelstats->num_dead_tuples > 0)
		{
			/* Trade in buffer share lock for super-exclusive lock */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			LockBufferForCleanup(buf);

			/* Remove tuples from heap */
			lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats);
			/* Forget the now-vacuumed tuples, and press on */
			vacrelstats->num_dead_tuples = 0;
			vacuumed_pages++;
		}

		/*
		 * If we remembered any tuples for deletion, then the page will be
		 * visited again by lazy_vacuum_heap, which will compute and record
		 * its post-compaction free space.	If not, then we're done with this
		 * page, so remember its free space as-is.	(This path will always be
		 * taken if there are no indexes.)
		 */
		if (vacrelstats->num_dead_tuples == prev_dead_count)
		{
			lazy_record_free_space(vacrelstats, blkno,
								   PageGetFreeSpace(page));
		}

		/* Remember the location of the last page with nonremovable tuples */
		if (hastup)
			vacrelstats->nonempty_pages = blkno + 1;

		UnlockReleaseBuffer(buf);

		MIRROREDLOCK_BUFMGR_UNLOCK;
		/* -------- MirroredLock ---------- */

	}

	/* save stats for use later */
	vacrelstats->rel_tuples = num_tuples;
	vacrelstats->tuples_deleted = tups_vacuumed;

	/* If any tuples need to be deleted, perform final vacuum cycle */
	/* XXX put a threshold on min number of tuples here? */
	if (vacrelstats->num_dead_tuples > 0)
	{
		/* Remove index entries */
		for (i = 0; i < nindexes; i++)
		{
			List *extra_oids = get_oids_for_bitmap(all_extra_oids, Irel[i],
												   onerel, reindex_count);

			lazy_vacuum_index(Irel[i],
							  &indstats[i],
							  vacrelstats,
							  extra_oids);
			list_free(extra_oids);
		}
		reindex_count++;

		/* Remove tuples from heap */
		lazy_vacuum_heap(onerel, vacrelstats);
	}

	/* Do post-vacuum cleanup and statistics update for each index */
	for (i = 0; i < nindexes; i++)
		lazy_cleanup_index(Irel[i], indstats[i], vacrelstats, updated_stats);

	/* If no indexes, make log report that lazy_vacuum_heap would've made */
	if (vacuumed_pages)
		ereport(elevel,
				(errmsg("\"%s\": removed %.0f row versions in %u pages",
						RelationGetRelationName(onerel),
						tups_vacuumed, vacuumed_pages)));

	ereport(elevel,
			(errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
					RelationGetRelationName(onerel),
					tups_vacuumed, num_tuples, nblocks),
			 errdetail("%.0f dead row versions cannot be removed yet.\n"
					   "There were %.0f unused item pointers.\n"
					   "%u pages contain useful free space.\n"
					   "%u pages are entirely empty.\n"
					   "%s.",
					   nkeep,
					   nunused,
					   vacrelstats->tot_free_pages,
					   empty_pages,
					   pg_rusage_show(&ru0))));

	if (vacrelstats->tot_free_pages > MaxFSMPages)
		ereport(WARNING,
				(errmsg("relation \"%s.%s\" contains more than \"max_fsm_pages\" pages with useful free space",
						get_namespace_name(RelationGetNamespace(onerel)),
						relname),
				 errhint("Consider using VACUUM FULL on this relation or increasing the configuration parameter \"max_fsm_pages\".")));
}
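The pause-and-vacuum check near the top of the block loop fires once the remaining room in the dead-TID array could no longer hold a full page's worth of tuples. A tiny standalone illustration of that guard; the constant and function name are hypothetical, and the real limit depends on the block size.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for MaxHeapTuplesPerPage; the real value depends on BLCKSZ. */
#define TOY_MAX_TUPLES_PER_PAGE 291

static bool
need_index_vacuum_cycle(long max_dead_tuples, long num_dead_tuples)
{
	return (max_dead_tuples - num_dead_tuples) < TOY_MAX_TUPLES_PER_PAGE &&
		num_dead_tuples > 0;
}

int
main(void)
{
	printf("%d\n", need_index_vacuum_cycle(100000, 99800));	/* 1: vacuum indexes now */
	printf("%d\n", need_index_vacuum_cycle(100000, 50000));	/* 0: keep scanning */
	return 0;
}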
Example #27
/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *	 stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |									  |
 * +-----------+--------------------------------------+
 * |	 ^ last										  |
 * |												  |
 * +-------------+------------------------------------+
 * |			 | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |		  ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
	Page		npage;
	BlockNumber nblkno;
	OffsetNumber last_off;
	Size		pgspc;
	Size		itupsz;

	/*
	 * This is a handy place to check for cancel interrupts during the btree
	 * load phase of index creation.
	 */
	CHECK_FOR_INTERRUPTS();

	npage = state->btps_page;
	nblkno = state->btps_blkno;
	last_off = state->btps_lastoff;

	pgspc = PageGetFreeSpace(npage);
	itupsz = IndexTupleDSize(*itup);
	itupsz = MAXALIGN(itupsz);

	/*
	 * Check whether the item can fit on a btree page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.) We actually need to be
	 * able to fit three items on every page, so restrict any one item to 1/3
	 * the per-page available space. Note that at this point, itupsz doesn't
	 * include the ItemId.
	 *
	 * NOTE: similar code appears in _bt_insertonpg() to defend against
	 * oversize items being inserted into an already-existing index. But
	 * during creation of an index, we don't go through there.
	 */
	if (itupsz > BTMaxItemSize(npage))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
			errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
				   itupsz, BTMaxItemSize(npage),
				   RelationGetRelationName(wstate->index)),
		errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
				"Consider a function index of an MD5 hash of the value, "
				"or use full text indexing."),
				 errtableconstraint(wstate->heap,
									RelationGetRelationName(wstate->index))));

	/*
	 * Check to see if page is "full".  It's definitely full if the item won't
	 * fit.  Otherwise, compare to the target freespace derived from the
	 * fillfactor.  However, we must put at least two items on each page, so
	 * disregard fillfactor if we don't have that many.
	 */
	if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
	{
		/*
		 * Finish off the page and write it out.
		 */
		Page		opage = npage;
		BlockNumber oblkno = nblkno;
		ItemId		ii;
		ItemId		hii;
		IndexTuple	oitup;

		/* Create new page of same level */
		npage = _bt_blnewpage(state->btps_level);

		/* and assign it a page position */
		nblkno = wstate->btws_pages_alloced++;

		/*
		 * We copy the last item on the page into the new page, and then
		 * rearrange the old page so that the 'last item' becomes its high key
		 * rather than a true data item.  There had better be at least two
		 * items on the page already, else the page would be empty of useful
		 * data.
		 */
		Assert(last_off > P_FIRSTKEY);
		ii = PageGetItemId(opage, last_off);
		oitup = (IndexTuple) PageGetItem(opage, ii);
		_bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

		/*
		 * Move 'last' into the high key position on opage
		 */
		hii = PageGetItemId(opage, P_HIKEY);
		*hii = *ii;
		ItemIdSetUnused(ii);	/* redundant */
		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

		/*
		 * Link the old page into its parent, using its minimum key. If we
		 * don't have a parent, we have to create one; this adds a new btree
		 * level.
		 */
		if (state->btps_next == NULL)
			state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

		Assert(state->btps_minkey != NULL);
		ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
		_bt_buildadd(wstate, state->btps_next, state->btps_minkey);
		pfree(state->btps_minkey);

		/*
		 * Save a copy of the minimum key for the new page.  We have to copy
		 * it off the old page, not the new one, in case we are not at leaf
		 * level.
		 */
		state->btps_minkey = CopyIndexTuple(oitup);

		/*
		 * Set the sibling links for both pages.
		 */
		{
			BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
			BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

			oopaque->btpo_next = nblkno;
			nopaque->btpo_prev = oblkno;
			nopaque->btpo_next = P_NONE;		/* redundant */
		}

		/*
		 * Write out the old page.  We never need to touch it again, so we can
		 * free the opage workspace too.
		 */
		_bt_blwritepage(wstate, opage, oblkno);

		/*
		 * Reset last_off to point to new page
		 */
		last_off = P_FIRSTKEY;
	}

	/*
	 * If the new item is the first for its page, stash a copy for later. Note
	 * this will only happen for the first item on a level; on later pages,
	 * the first item for a page is copied from the prior page in the code
	 * above.
	 */
	if (last_off == P_HIKEY)
	{
		Assert(state->btps_minkey == NULL);
		state->btps_minkey = CopyIndexTuple(itup);
	}

	/*
	 * Add the new item into the current page.
	 */
	last_off = OffsetNumberNext(last_off);
	_bt_sortaddtup(npage, itupsz, itup, last_off);

	state->btps_page = npage;
	state->btps_blkno = nblkno;
	state->btps_lastoff = last_off;
}
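The split test treats the current page as finished either when the incoming item cannot fit at all, or when free space has dropped below the fillfactor target and the page already holds at least two items. A compact sketch of that predicate with hypothetical sizes:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Decide whether to finish the current page during a bottom-up build.
 * pgspc: current free space; itupsz: size of the incoming item;
 * target_free: free space the fillfactor wants left; nitems: items on page.
 */
static bool
page_is_full(size_t pgspc, size_t itupsz, size_t target_free, int nitems)
{
	return pgspc < itupsz || (pgspc < target_free && nitems >= 2);
}

int
main(void)
{
	printf("%d\n", page_is_full(100, 200, 800, 50));	/* 1: item does not fit */
	printf("%d\n", page_is_full(500, 200, 800, 50));	/* 1: below fill target */
	printf("%d\n", page_is_full(500, 200, 800, 1));		/* 0: too few items yet */
	return 0;
}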
Example #28
static int
nospace(Page p, IndexTuple it)
{
	return PageGetFreeSpace(p) < IndexTupleSize(it);
}
/*
 *	_hash_squeezebucket(rel, bucket)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages. When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last bucket in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this routine clean them up.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 */
void
_hash_squeezebucket(Relation rel,
                    Bucket bucket,
                    BlockNumber bucket_blkno,
                    BufferAccessStrategy bstrategy)
{
    BlockNumber wblkno;
    BlockNumber rblkno;
    Buffer		wbuf;
    Buffer		rbuf;
    Page		wpage;
    Page		rpage;
    HashPageOpaque wopaque;
    HashPageOpaque ropaque;
    bool		wbuf_dirty;

    /*
     * start squeezing into the base bucket page.
     */
    wblkno = bucket_blkno;
    wbuf = _hash_getbuf_with_strategy(rel,
                                      wblkno,
                                      HASH_WRITE,
                                      LH_BUCKET_PAGE,
                                      bstrategy);
    wpage = BufferGetPage(wbuf);
    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

    /*
     * if there aren't any overflow pages, there's nothing to squeeze.
     */
    if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
    {
        _hash_relbuf(rel, wbuf);
        return;
    }

    /*
     * Find the last page in the bucket chain by starting at the base bucket
     * page and working forward.  Note: we assume that a hash bucket chain is
     * usually smaller than the buffer ring being used by VACUUM, else using
     * the access strategy here would be counterproductive.
     */
    rbuf = InvalidBuffer;
    ropaque = wopaque;
    do
    {
        rblkno = ropaque->hasho_nextblkno;
        if (rbuf != InvalidBuffer)
            _hash_relbuf(rel, rbuf);
        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));

    /*
     * squeeze the tuples.
     */
    wbuf_dirty = false;
    for (;;)
    {
        OffsetNumber roffnum;
        OffsetNumber maxroffnum;
        OffsetNumber deletable[MaxOffsetNumber];
        int			ndeletable = 0;

        /* Scan each tuple in "read" page */
        maxroffnum = PageGetMaxOffsetNumber(rpage);
        for (roffnum = FirstOffsetNumber;
                roffnum <= maxroffnum;
                roffnum = OffsetNumberNext(roffnum))
        {
            IndexTuple	itup;
            Size		itemsz;

            itup = (IndexTuple) PageGetItem(rpage,
                                            PageGetItemId(rpage, roffnum));
            itemsz = IndexTupleDSize(*itup);
            itemsz = MAXALIGN(itemsz);

            /*
             * Walk up the bucket chain, looking for a page big enough for
             * this item.  Exit if we reach the read page.
             */
            while (PageGetFreeSpace(wpage) < itemsz)
            {
                Assert(!PageIsEmpty(wpage));

                wblkno = wopaque->hasho_nextblkno;
                Assert(BlockNumberIsValid(wblkno));

                if (wbuf_dirty)
                    _hash_wrtbuf(rel, wbuf);
                else
                    _hash_relbuf(rel, wbuf);

                /* nothing more to do if we reached the read page */
                if (rblkno == wblkno)
                {
                    if (ndeletable > 0)
                    {
                        /* Delete tuples we already moved off read page */
                        PageIndexMultiDelete(rpage, deletable, ndeletable);
                        _hash_wrtbuf(rel, rbuf);
                    }
                    else
                        _hash_relbuf(rel, rbuf);
                    return;
                }

                wbuf = _hash_getbuf_with_strategy(rel,
                                                  wblkno,
                                                  HASH_WRITE,
                                                  LH_OVERFLOW_PAGE,
                                                  bstrategy);
                wpage = BufferGetPage(wbuf);
                wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
                Assert(wopaque->hasho_bucket == bucket);
                wbuf_dirty = false;
            }

            /*
             * we have found room so insert on the "write" page, being careful
             * to preserve hashkey ordering.  (If we insert many tuples into
             * the same "write" page it would be worth qsort'ing instead of
             * doing repeated _hash_pgaddtup.)
             */
            (void) _hash_pgaddtup(rel, wbuf, itemsz, itup);
            wbuf_dirty = true;

            /* remember tuple for deletion from "read" page */
            deletable[ndeletable++] = roffnum;
        }

        /*
         * If we reach here, there are no live tuples on the "read" page ---
         * it was empty when we got to it, or we moved them all.  So we can
         * just free the page without bothering with deleting tuples
         * individually.  Then advance to the previous "read" page.
         *
         * Tricky point here: if our read and write pages are adjacent in the
         * bucket chain, our write lock on wbuf will conflict with
         * _hash_freeovflpage's attempt to update the sibling links of the
         * removed page.  However, in that case we are done anyway, so we can
         * simply drop the write lock before calling _hash_freeovflpage.
         */
        rblkno = ropaque->hasho_prevblkno;
        Assert(BlockNumberIsValid(rblkno));

        /* are we freeing the page adjacent to wbuf? */
        if (rblkno == wblkno)
        {
            /* yes, so release wbuf lock first */
            if (wbuf_dirty)
                _hash_wrtbuf(rel, wbuf);
            else
                _hash_relbuf(rel, wbuf);
            /* free this overflow page (releases rbuf) */
            _hash_freeovflpage(rel, rbuf, bstrategy);
            /* done */
            return;
        }

        /* free this overflow page, then get the previous one */
        _hash_freeovflpage(rel, rbuf, bstrategy);

        rbuf = _hash_getbuf_with_strategy(rel,
                                          rblkno,
                                          HASH_WRITE,
                                          LH_OVERFLOW_PAGE,
                                          bstrategy);
        rpage = BufferGetPage(rbuf);
        ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
        Assert(ropaque->hasho_bucket == bucket);
    }

    /* NOTREACHED */
}
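The squeeze moves two positions toward each other: tuples are read from the tail of the overflow chain and packed into pages from the head, and the loop ends when the read and write positions meet. The sketch below models that with per-page item counts instead of buffers and locks; it is illustrative only.

#include <stdio.h>

#define NPAGES 5
#define CAP    4				/* items a toy page can hold */

int
main(void)
{
	/* items[i] = number of items currently on page i of the bucket chain */
	int			items[NPAGES] = {4, 1, 0, 2, 3};
	int			w = 0;			/* "write" page: front of the chain */
	int			r = NPAGES - 1;	/* "read" page: tail of the chain */
	int			i;

	while (r > w)
	{
		while (items[r] > 0)
		{
			while (items[w] >= CAP)		/* write page full: advance it */
			{
				w++;
				if (w == r)		/* met the read page: squeezing is done */
					goto done;
			}
			items[w]++;			/* move one item toward the front */
			items[r]--;
		}
		r--;					/* read page emptied: "free" it, step back */
	}
done:
	for (i = 0; i < NPAGES; i++)
		printf("page %d: %d items%s\n", i, items[i], i > r ? " (freed)" : "");
	return 0;
}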
Example #30
static void
bitmap_xlog_insert_lovitem(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_lovitem	*xlrec = (xl_bm_lovitem*) XLogRecGetData(record);
	Relation reln;

	reln = XLogOpenRelation(xlrec->bm_node);
	if (!RelationIsValid(reln))
		return;

	if (redo)
	{
		Buffer			lovBuffer;
		Page			lovPage;

#ifdef BM_DEBUG
		ereport(LOG,
				(errmsg("call bitmap_xlog_insert_lovitem: redo=%d, blkno=%d",
						redo, xlrec->bm_lov_blkno)));
#endif

		lovBuffer = XLogReadBuffer(false, reln, xlrec->bm_lov_blkno);
		if (!BufferIsValid(lovBuffer))
			elog(PANIC, "bm_insert_redo: block unfound: %d",
				 xlrec->bm_lov_blkno);

		lovPage = BufferGetPage(lovBuffer);

		if (XLByteLT(PageGetLSN(lovPage), lsn))
		{
			if (xlrec->bm_isNewItem)
			{
				OffsetNumber	newOffset, itemSize;

				newOffset = OffsetNumberNext(PageGetMaxOffsetNumber(lovPage));
				if (newOffset != xlrec->bm_lov_offset)
					elog(PANIC,
						 "bm_insert_redo: LOV item not inserted at position %d (requested %d)",
						 newOffset, xlrec->bm_lov_offset);

				itemSize = sizeof(BMLOVItemData);
				if (itemSize > PageGetFreeSpace(lovPage))
					elog(PANIC, 
						 "bm_insert_redo: not enough space in LOV page %d",
						 xlrec->bm_lov_blkno);
		
				if (PageAddItem(lovPage, (Item)&(xlrec->bm_lovItem), itemSize, 
								newOffset, LP_USED) == InvalidOffsetNumber)
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							errmsg("failed to add LOV item to \"%s\"",
							RelationGetRelationName(reln))));
			}
			else
			{
				BMLOVItem	oldLovItem;

				oldLovItem = (BMLOVItem)
					PageGetItem(lovPage, 
								PageGetItemId(lovPage, xlrec->bm_lov_offset));

				memcpy(oldLovItem, &(xlrec->bm_lovItem), sizeof(BMLOVItemData));
			}

			PageSetLSN(lovPage, lsn);
			PageSetTLI(lovPage, ThisTimeLineID);
			_bitmap_wrtbuf(lovBuffer);
		}
		else
		{
			_bitmap_relbuf(lovBuffer);
		}
	}

	else
		elog(PANIC, "bm_insert_undo: not implemented.");
}
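The redo path applies the logged change only when the page LSN is older than the WAL record's LSN; otherwise the page already contains the change and is simply released. A minimal sketch of that idempotence guard, with a plain integer standing in for an LSN and a counter standing in for the page contents:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy page: an LSN plus a counter standing in for the page contents. */
struct toy_page
{
	uint64_t	lsn;
	int			nitems;
};

/* Re-apply a logged "insert" only if the page has not already seen it. */
static bool
redo_insert(struct toy_page *page, uint64_t record_lsn)
{
	if (page->lsn < record_lsn)	/* the real code tests XLByteLT(PageGetLSN(page), lsn) */
	{
		page->nitems++;			/* redo the change */
		page->lsn = record_lsn;	/* stamp the page so a second replay is a no-op */
		return true;
	}
	return false;				/* page already newer: just release the buffer */
}

int
main(void)
{
	struct toy_page p = {100, 5};
	bool		applied;

	applied = redo_insert(&p, 120);
	printf("applied=%d nitems=%d\n", applied, p.nitems);
	applied = redo_insert(&p, 120);
	printf("applied=%d nitems=%d\n", applied, p.nitems);
	return 0;
}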