Example 1
File: gistproc.c  Project: 50wu/gpdb
/*
 * The GiST PickSplit method
 *
 * New linear algorithm, see 'New Linear Node Splitting Algorithm for R-tree',
 * C.H.Ang and T.C.Tan
 */
Datum
gist_box_picksplit(PG_FUNCTION_ARGS)
{
	GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
	GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
	OffsetNumber i;
	OffsetNumber *listL,
			   *listR,
			   *listB,
			   *listT;
	BOX		   *unionL,
			   *unionR,
			   *unionB,
			   *unionT;
	int			posL,
				posR,
				posB,
				posT;
	BOX			pageunion;
	BOX		   *cur;
	char		direction = ' ';
	bool		allisequal = true;
	OffsetNumber maxoff;
	int			nbytes;

	posL = posR = posB = posT = 0;
	maxoff = entryvec->n - 1;

	cur = DatumGetBoxP(entryvec->vector[FirstOffsetNumber].key);
	memcpy((void *) &pageunion, (void *) cur, sizeof(BOX));

	/* find MBR */
	for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
	{
		cur = DatumGetBoxP(entryvec->vector[i].key);
		if (allisequal == true && (
								   pageunion.high.x != cur->high.x ||
								   pageunion.high.y != cur->high.y ||
								   pageunion.low.x != cur->low.x ||
								   pageunion.low.y != cur->low.y
								   ))
			allisequal = false;

		adjustBox(&pageunion, cur);
	}

	if (allisequal)
	{
		/*
		 * All entries are the same
		 */
		fallbackSplit(entryvec, v);
		PG_RETURN_POINTER(v);
	}

	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
	listL = (OffsetNumber *) palloc(nbytes);
	listR = (OffsetNumber *) palloc(nbytes);
	listB = (OffsetNumber *) palloc(nbytes);
	listT = (OffsetNumber *) palloc(nbytes);
	unionL = (BOX *) palloc(sizeof(BOX));
	unionR = (BOX *) palloc(sizeof(BOX));
	unionB = (BOX *) palloc(sizeof(BOX));
	unionT = (BOX *) palloc(sizeof(BOX));

#define ADDLIST( list, unionD, pos, num ) do { \
	if ( pos ) { \
		if ( (unionD)->high.x < cur->high.x ) (unionD)->high.x	= cur->high.x; \
		if ( (unionD)->low.x  > cur->low.x	) (unionD)->low.x	= cur->low.x; \
		if ( (unionD)->high.y < cur->high.y ) (unionD)->high.y	= cur->high.y; \
		if ( (unionD)->low.y  > cur->low.y	) (unionD)->low.y	= cur->low.y; \
	} else { \
			memcpy( (void*)(unionD), (void*) cur, sizeof( BOX ) );	\
	} \
	(list)[pos] = num; \
	(pos)++; \
} while(0)

	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
	{
		cur = DatumGetBoxP(entryvec->vector[i].key);
		if (cur->low.x - pageunion.low.x < pageunion.high.x - cur->high.x)
			ADDLIST(listL, unionL, posL, i);
		else
			ADDLIST(listR, unionR, posR, i);
		if (cur->low.y - pageunion.low.y < pageunion.high.y - cur->high.y)
			ADDLIST(listB, unionB, posB, i);
		else
			ADDLIST(listT, unionT, posT, i);
	}

#define LIMIT_RATIO 0.1
#define _IS_BADRATIO(x,y)	( (y) == 0 || (float)(x)/(float)(y) < LIMIT_RATIO )
#define IS_BADRATIO(x,y) ( _IS_BADRATIO((x),(y)) || _IS_BADRATIO((y),(x)) )
	/* bad disposition, try to split by centers of boxes  */
	if (IS_BADRATIO(posR, posL) && IS_BADRATIO(posT, posB))
	{
		double		avgCenterX = 0.0,
					avgCenterY = 0.0;
		double		CenterX,
					CenterY;

		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
		{
			cur = DatumGetBoxP(entryvec->vector[i].key);
			avgCenterX += ((double) cur->high.x + (double) cur->low.x) / 2.0;
			avgCenterY += ((double) cur->high.y + (double) cur->low.y) / 2.0;
		}

		avgCenterX /= maxoff;
		avgCenterY /= maxoff;

		posL = posR = posB = posT = 0;
		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
		{
			cur = DatumGetBoxP(entryvec->vector[i].key);

			CenterX = ((double) cur->high.x + (double) cur->low.x) / 2.0;
			CenterY = ((double) cur->high.y + (double) cur->low.y) / 2.0;

			if (CenterX < avgCenterX)
				ADDLIST(listL, unionL, posL, i);
			else if (CenterX == avgCenterX)
			{
				if (posL > posR)
					ADDLIST(listR, unionR, posR, i);
				else
					ADDLIST(listL, unionL, posL, i);
			}
			else
				ADDLIST(listR, unionR, posR, i);

			if (CenterY < avgCenterY)
				ADDLIST(listB, unionB, posB, i);
			else if (CenterY == avgCenterY)
			{
				if (posB > posT)
					ADDLIST(listT, unionT, posT, i);
				else
					ADDLIST(listB, unionB, posB, i);
			}
			else
				ADDLIST(listT, unionT, posT, i);
		}

		if (IS_BADRATIO(posR, posL) && IS_BADRATIO(posT, posB))
		{
			fallbackSplit(entryvec, v);
			PG_RETURN_POINTER(v);
		}
	}

	/* which split more optimal? */
	if (Max(posL, posR) < Max(posB, posT))
		direction = 'x';
	else if (Max(posL, posR) > Max(posB, posT))
		direction = 'y';
	else
	{
		Datum		interLR = DirectFunctionCall2(rt_box_inter,
												  BoxPGetDatum(unionL),
												  BoxPGetDatum(unionR));
		Datum		interBT = DirectFunctionCall2(rt_box_inter,
												  BoxPGetDatum(unionB),
												  BoxPGetDatum(unionT));
		double		sizeLR,
					sizeBT;

		sizeLR = size_box(interLR);
		sizeBT = size_box(interBT);

		if (sizeLR < sizeBT)
			direction = 'x';
		else
			direction = 'y';
	}

	if (direction == 'x')
		chooseLR(v,
				 listL, posL, unionL,
				 listR, posR, unionR);
	else
		chooseLR(v,
				 listB, posB, unionB,
				 listT, posT, unionT);

	PG_RETURN_POINTER(v);
}
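
A minimal standalone sketch of the Ang & Tan decision rule used above may help: each entry goes to the left or right list depending on whether it lies closer to the left or the right edge of the page's MBR (and likewise bottom/top on the y axis). The SBox type and the sample boxes below are illustrative only, not PostgreSQL's BOX or GiST structures.

#include <stdio.h>

typedef struct { double xlo, ylo, xhi, yhi; } SBox;	/* toy box, not BOX */

int main(void)
{
	SBox	page = {0, 0, 10, 10};	/* MBR of the whole page */
	SBox	boxes[] = { {1, 1, 2, 2}, {8, 8, 9, 9}, {4, 4, 6, 6} };
	int		n = sizeof(boxes) / sizeof(boxes[0]);

	for (int i = 0; i < n; i++)
	{
		SBox	   *b = &boxes[i];

		/* closer to the left edge than to the right edge? (same test as above) */
		const char *x_side = (b->xlo - page.xlo < page.xhi - b->xhi) ? "left" : "right";
		const char *y_side = (b->ylo - page.ylo < page.yhi - b->yhi) ? "bottom" : "top";

		printf("box %d -> %s / %s\n", i, x_side, y_side);
	}
	return 0;
}
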
Example 2
/*
** The GiST PickSplit method for segments
** We use Guttman's poly time split algorithm
*/
GIST_SPLITVEC *
gseg_picksplit(GistEntryVector *entryvec,
			   GIST_SPLITVEC *v)
{
	OffsetNumber i,
				j;
	SEG		   *datum_alpha,
			   *datum_beta;
	SEG		   *datum_l,
			   *datum_r;
	SEG		   *union_d,
			   *union_dl,
			   *union_dr;
	SEG		   *inter_d;
	bool		firsttime;
	float		size_alpha,
				size_beta,
				size_union,
				size_inter;
	float		size_waste,
				waste;
	float		size_l,
				size_r;
	int			nbytes;
	OffsetNumber seed_1 = 1,
				seed_2 = 2;
	OffsetNumber *left,
			   *right;
	OffsetNumber maxoff;

#ifdef GIST_DEBUG
	fprintf(stderr, "picksplit\n");
#endif

	maxoff = entryvec->n - 2;
	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
	v->spl_left = (OffsetNumber *) palloc(nbytes);
	v->spl_right = (OffsetNumber *) palloc(nbytes);

	firsttime = true;
	waste = 0.0;

	for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
	{
		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
		for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
		{
			datum_beta = (SEG *) DatumGetPointer(entryvec->vector[j].key);

			/* compute the wasted space by unioning these guys */
			/* size_waste = size_union - size_inter; */
			union_d = seg_union(datum_alpha, datum_beta);
			rt_seg_size(union_d, &size_union);
			inter_d = seg_inter(datum_alpha, datum_beta);
			rt_seg_size(inter_d, &size_inter);
			size_waste = size_union - size_inter;

			/*
			 * are these a more promising split than what we've already seen?
			 */
			if (size_waste > waste || firsttime)
			{
				waste = size_waste;
				seed_1 = i;
				seed_2 = j;
				firsttime = false;
			}
		}
	}

	left = v->spl_left;
	v->spl_nleft = 0;
	right = v->spl_right;
	v->spl_nright = 0;

	datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[seed_1].key);
	datum_l = seg_union(datum_alpha, datum_alpha);
	rt_seg_size(datum_l, &size_l);
	datum_beta = (SEG *) DatumGetPointer(entryvec->vector[seed_2].key);
	datum_r = seg_union(datum_beta, datum_beta);
	rt_seg_size(datum_r, &size_r);

	/*
	 * Now split up the regions between the two seeds.	An important property
	 * of this split algorithm is that the split vector v has the indices of
	 * items to be split in order in its left and right vectors.  We exploit
	 * this property by doing a merge in the code that actually splits the
	 * page.
	 *
	 * For efficiency, we also place the new index tuple in this loop. This is
	 * handled at the very end, when we have placed all the existing tuples
	 * and i == maxoff + 1.
	 */

	maxoff = OffsetNumberNext(maxoff);
	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
	{
		/*
		 * If we've already decided where to place this item, just put it on
		 * the right list.	Otherwise, we need to figure out which page needs
		 * the least enlargement in order to store the item.
		 */

		if (i == seed_1)
		{
			*left++ = i;
			v->spl_nleft++;
			continue;
		}
		else if (i == seed_2)
		{
			*right++ = i;
			v->spl_nright++;
			continue;
		}

		/* okay, which page needs least enlargement? */
		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
		union_dl = seg_union(datum_l, datum_alpha);
		union_dr = seg_union(datum_r, datum_alpha);
		rt_seg_size(union_dl, &size_alpha);
		rt_seg_size(union_dr, &size_beta);

		/* pick which page to add it to */
		if (size_alpha - size_l < size_beta - size_r)
		{
			datum_l = union_dl;
			size_l = size_alpha;
			*left++ = i;
			v->spl_nleft++;
		}
		else
		{
			datum_r = union_dr;
			size_r = size_beta;
			*right++ = i;
			v->spl_nright++;
		}
	}
	*left = *right = FirstOffsetNumber; /* sentinel value, see dosplit() */

	v->spl_ldatum = PointerGetDatum(datum_l);
	v->spl_rdatum = PointerGetDatum(datum_r);

	return v;
}
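
The seed selection above is Guttman's quadratic ("poly time") method: for every pair of entries, waste = size(union) - size(intersection), and the pair with the largest waste becomes the two split seeds. A self-contained sketch on toy 1-D segments (Seg1D and the sample values are invented, not the seg module's types):

#include <stdio.h>

typedef struct { float lower, upper; } Seg1D;	/* toy segment, not SEG */

static float
interval_size(float lower, float upper)
{
	return (upper > lower) ? upper - lower : 0.0f;
}

int main(void)
{
	Seg1D	segs[] = { {0, 2}, {1, 3}, {10, 12} };
	int		n = sizeof(segs) / sizeof(segs[0]);
	int		seed_1 = 0, seed_2 = 1;
	float	waste = -1.0f;

	for (int i = 0; i < n; i++)
		for (int j = i + 1; j < n; j++)
		{
			/* union and intersection of the pair, expressed as sizes */
			float	uni = interval_size(segs[i].lower < segs[j].lower ? segs[i].lower : segs[j].lower,
										segs[i].upper > segs[j].upper ? segs[i].upper : segs[j].upper);
			float	inter = interval_size(segs[i].lower > segs[j].lower ? segs[i].lower : segs[j].lower,
										  segs[i].upper < segs[j].upper ? segs[i].upper : segs[j].upper);
			float	w = uni - inter;	/* wasted space if the pair shares a page */

			if (w > waste)
			{
				waste = w;
				seed_1 = i;
				seed_2 = j;
			}
		}

	printf("seeds: %d and %d (waste %.1f)\n", seed_1, seed_2, waste);
	return 0;
}
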
Example 3
/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * It would be very bad if this function cleaned a page while some other
 * backend was in the midst of scanning it, because hashgettuple assumes
 * that the next valid TID will be greater than or equal to the current
 * valid TID.  There can't be any concurrent scans in progress when we first
 * enter this function because of the cleanup lock we hold on the primary
 * bucket page, but as soon as we release that lock, there might be.  We
 * handle that by conspiring to prevent those scans from passing our cleanup
 * scan.  To do that, we lock the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
				  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
				  uint32 maxbucket, uint32 highmask, uint32 lowmask,
				  double *tuples_removed, double *num_index_tuples,
				  bool split_cleanup,
				  IndexBulkDeleteCallback callback, void *callback_state)
{
	BlockNumber blkno;
	Buffer		buf;
	Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
	bool		bucket_dirty = false;

	blkno = bucket_blkno;
	buf = bucket_buf;

	if (split_cleanup)
		new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
														lowmask, maxbucket);

	/* Scan each page in bucket */
	for (;;)
	{
		HashPageOpaque opaque;
		OffsetNumber offno;
		OffsetNumber maxoffno;
		Buffer		next_buf;
		Page		page;
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable = 0;
		bool		retain_pin = false;
		bool		clear_dead_marking = false;

		vacuum_delay_point();

		page = BufferGetPage(buf);
		opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		/* Scan each tuple in page */
		maxoffno = PageGetMaxOffsetNumber(page);
		for (offno = FirstOffsetNumber;
			 offno <= maxoffno;
			 offno = OffsetNumberNext(offno))
		{
			ItemPointer htup;
			IndexTuple	itup;
			Bucket		bucket;
			bool		kill_tuple = false;

			itup = (IndexTuple) PageGetItem(page,
											PageGetItemId(page, offno));
			htup = &(itup->t_tid);

			/*
			 * To remove the dead tuples, we strictly rely on the results of
			 * the callback function; see btvacuumpage for the detailed reason.
			 */
			if (callback && callback(htup, callback_state))
			{
				kill_tuple = true;
				if (tuples_removed)
					*tuples_removed += 1;
			}
			else if (split_cleanup)
			{
				/* delete the tuples that are moved by split. */
				bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
											  maxbucket,
											  highmask,
											  lowmask);
				/* mark the item for deletion */
				if (bucket != cur_bucket)
				{
					/*
					 * We expect tuples to either belong to the current bucket
					 * or new_bucket.  This is ensured because we don't allow
					 * further splits from a bucket that contains garbage. See
					 * comments in _hash_expandtable.
					 */
					Assert(bucket == new_bucket);
					kill_tuple = true;
				}
			}

			if (kill_tuple)
			{
				/* mark the item for deletion */
				deletable[ndeletable++] = offno;
			}
			else
			{
				/* we're keeping it, so count it */
				if (num_index_tuples)
					*num_index_tuples += 1;
			}
		}

		/* retain the pin on primary bucket page till end of bucket scan */
		if (blkno == bucket_blkno)
			retain_pin = true;
		else
			retain_pin = false;

		blkno = opaque->hasho_nextblkno;

		/*
		 * Apply deletions, advance to next page and write page if needed.
		 */
		if (ndeletable > 0)
		{
			/* No ereport(ERROR) until changes are logged */
			START_CRIT_SECTION();

			PageIndexMultiDelete(page, deletable, ndeletable);
			bucket_dirty = true;

			/*
			 * Let us mark the page as clean if vacuum removes the DEAD tuples
			 * from an index page. We do this by clearing LH_PAGE_HAS_DEAD_TUPLES
			 * flag.
			 */
			if (tuples_removed && *tuples_removed > 0 &&
				opaque->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES)
			{
				opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
				clear_dead_marking = true;
			}

			MarkBufferDirty(buf);

			/* XLOG stuff */
			if (RelationNeedsWAL(rel))
			{
				xl_hash_delete xlrec;
				XLogRecPtr	recptr;

				xlrec.clear_dead_marking = clear_dead_marking;
				xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false;

				XLogBeginInsert();
				XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

				/*
				 * bucket buffer needs to be registered to ensure that we can
				 * acquire a cleanup lock on it during replay.
				 */
				if (!xlrec.is_primary_bucket_page)
					XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

				XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
				XLogRegisterBufData(1, (char *) deletable,
									ndeletable * sizeof(OffsetNumber));

				recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
				PageSetLSN(BufferGetPage(buf), recptr);
			}

			END_CRIT_SECTION();
		}

		/* bail out if there are no more pages to scan. */
		if (!BlockNumberIsValid(blkno))
			break;

		next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
											  LH_OVERFLOW_PAGE,
											  bstrategy);

		/*
		 * release the lock on previous page after acquiring the lock on next
		 * page
		 */
		if (retain_pin)
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		else
			_hash_relbuf(rel, buf);

		buf = next_buf;
	}

	/*
	 * Lock the bucket page to clear the garbage flag and squeeze the bucket.
	 * If the current buffer is the same as the bucket buffer, then we already
	 * hold a lock on the bucket page.
	 */
	if (buf != bucket_buf)
	{
		_hash_relbuf(rel, buf);
		LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
	}

	/*
	 * Clear the garbage flag from the bucket after deleting the tuples that
	 * were moved by the split.  We purposely clear the flag before squeezing
	 * the bucket, so that after a restart vacuum won't try again to delete
	 * the moved-by-split tuples.
	 */
	if (split_cleanup)
	{
		HashPageOpaque bucket_opaque;
		Page		page;

		page = BufferGetPage(bucket_buf);
		bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);

		/* No ereport(ERROR) until changes are logged */
		START_CRIT_SECTION();

		bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
		MarkBufferDirty(bucket_buf);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			XLogRecPtr	recptr;

			XLogBeginInsert();
			XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
			PageSetLSN(page, recptr);
		}

		END_CRIT_SECTION();
	}

	/*
	 * If we have deleted anything, try to compact free space.  For squeezing
	 * the bucket, we must have a cleanup lock, else it can impact the
	 * ordering of tuples for a scan that has started before it.
	 */
	if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
		_hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
							bstrategy);
	else
		LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}
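
The header comment describes lock chaining: acquire the lock on the next page in the bucket chain before releasing the lock on the previous one, so a concurrent scan can never overtake the cleanup scan. A generic hand-over-hand locking sketch with pthreads and a toy linked list (not PostgreSQL buffers) illustrates the ordering:

#include <pthread.h>
#include <stdio.h>

typedef struct Node
{
	pthread_mutex_t lock;
	int			value;
	struct Node *next;
} Node;

/* Walk the chain while always holding at least one lock along the way. */
static void
walk_chain(Node *head)
{
	pthread_mutex_lock(&head->lock);
	for (Node *cur = head; cur != NULL;)
	{
		Node	   *next = cur->next;

		printf("visiting %d\n", cur->value);

		if (next)
			pthread_mutex_lock(&next->lock);	/* lock the successor first... */
		pthread_mutex_unlock(&cur->lock);		/* ...then release the current node */
		cur = next;
	}
}

int main(void)
{
	Node	c = {PTHREAD_MUTEX_INITIALIZER, 3, NULL};
	Node	b = {PTHREAD_MUTEX_INITIALIZER, 2, &c};
	Node	a = {PTHREAD_MUTEX_INITIALIZER, 1, &b};

	walk_chain(&a);
	return 0;
}
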
Example 4
/*
 * find entry with lowest penalty
 */
OffsetNumber
gistchoose(Relation r, Page p, IndexTuple it,	/* it has compressed entry */
		   GISTSTATE *giststate)
{
	OffsetNumber maxoff;
	OffsetNumber i;
	OffsetNumber which;
	float		sum_grow,
				which_grow[INDEX_MAX_KEYS];
	GISTENTRY	entry,
				identry[INDEX_MAX_KEYS];
	bool		isnull[INDEX_MAX_KEYS];

	maxoff = PageGetMaxOffsetNumber(p);
	*which_grow = -1.0;
	which = InvalidOffsetNumber;
	sum_grow = 1;
	gistDeCompressAtt(giststate, r,
					  it, NULL, (OffsetNumber) 0,
					  identry, isnull);

	Assert(maxoff >= FirstOffsetNumber);
	Assert(!GistPageIsLeaf(p));

	for (i = FirstOffsetNumber; i <= maxoff && sum_grow; i = OffsetNumberNext(i))
	{
		int			j;
		IndexTuple	itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i));

		if (!GistPageIsLeaf(p) && GistTupleIsInvalid(itup))
		{
			ereport(LOG,
					(errmsg("index \"%s\" needs VACUUM or REINDEX to finish crash recovery",
							RelationGetRelationName(r))));
			continue;
		}

		sum_grow = 0;
		for (j = 0; j < r->rd_att->natts; j++)
		{
			Datum		datum;
			float		usize;
			bool		IsNull;

			datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull);
			gistdentryinit(giststate, j, &entry, datum, r, p, i,
						   FALSE, IsNull);
			usize = gistpenalty(giststate, j, &entry, IsNull,
								&identry[j], isnull[j]);

			if (which_grow[j] < 0 || usize < which_grow[j])
			{
				which = i;
				which_grow[j] = usize;
				if (j < r->rd_att->natts - 1 && i == FirstOffsetNumber)
					which_grow[j + 1] = -1;
				sum_grow += which_grow[j];
			}
			else if (which_grow[j] == usize)
				sum_grow += usize;
			else
			{
				sum_grow = 1;
				break;
			}
		}
	}

	if (which == InvalidOffsetNumber)
		which = FirstOffsetNumber;

	return which;
}
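
The loop above picks the downlink whose key would need the least enlargement ("penalty") to absorb the new entry. A toy sketch of that idea on plain 1-D intervals (Interval and penalty() here are illustrative stand-ins, not GiST's per-column penalty machinery):

#include <stdio.h>

typedef struct { double lo, hi; } Interval;	/* toy key, not a GiST entry */

/* how much must the key grow to cover the new value? */
static double
penalty(Interval key, double value)
{
	double	newlo = value < key.lo ? value : key.lo;
	double	newhi = value > key.hi ? value : key.hi;

	return (newhi - newlo) - (key.hi - key.lo);
}

int main(void)
{
	Interval	keys[] = { {0, 10}, {12, 20}, {40, 50} };	/* downlink keys */
	double		value = 22.0;								/* new entry */
	int			best = 0;

	for (int i = 1; i < 3; i++)
		if (penalty(keys[i], value) < penalty(keys[best], value))
			best = i;

	printf("insert %.1f under downlink %d (penalty %.1f)\n",
		   value, best, penalty(keys[best], value));
	return 0;
}
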
Example 5
/*
 * Updates the stack so that child->parent is the correct parent of the
 * child. child->parent must be exclusively locked on entry, and will
 * remain so at exit, but it might not be the same page anymore.
 */
static void
gistFindCorrectParent(Relation r, GISTInsertStack *child)
{
	GISTInsertStack *parent = child->parent;

	gistcheckpage(r, parent->buffer);
	parent->page = (Page) BufferGetPage(parent->buffer);

	/* here we don't need to distinguish between split and page update */
	if (child->downlinkoffnum == InvalidOffsetNumber ||
		parent->lsn != PageGetLSN(parent->page))
	{
		/* parent has changed, look for the child in the right links until found */
		OffsetNumber i,
					maxoff;
		ItemId		iid;
		IndexTuple	idxtuple;
		GISTInsertStack *ptr;

		while (true)
		{
			maxoff = PageGetMaxOffsetNumber(parent->page);
			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
			{
				iid = PageGetItemId(parent->page, i);
				idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
				if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
				{
					/* yes!!, found */
					child->downlinkoffnum = i;
					return;
				}
			}

			parent->blkno = GistPageGetOpaque(parent->page)->rightlink;
			UnlockReleaseBuffer(parent->buffer);
			if (parent->blkno == InvalidBlockNumber)
			{
				/*
				 * End of chain and we still didn't find the parent. It's a
				 * very rare situation, occurring when the root has split.
				 */
				break;
			}
			parent->buffer = ReadBuffer(r, parent->blkno);
			LockBuffer(parent->buffer, GIST_EXCLUSIVE);
			gistcheckpage(r, parent->buffer);
			parent->page = (Page) BufferGetPage(parent->buffer);
		}

		/*
		 * awful!!, we need to search the tree to find the parent ..., but
		 * before that we should release all the old parent buffers
		 */

		ptr = child->parent->parent;	/* child->parent already released
										 * above */
		while (ptr)
		{
			ReleaseBuffer(ptr->buffer);
			ptr = ptr->parent;
		}

		/* ok, find new path */
		ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum);

		/* read all buffers as expected by caller */
		/* note we don't lock them or gistcheckpage them here! */
		while (ptr)
		{
			ptr->buffer = ReadBuffer(r, ptr->blkno);
			ptr->page = (Page) BufferGetPage(ptr->buffer);
			ptr = ptr->parent;
		}

		/* install new chain of parents to stack */
		child->parent = parent;

		/* make recursive call to normal processing */
		LockBuffer(child->parent->buffer, GIST_EXCLUSIVE);
		gistFindCorrectParent(r, child);
	}

	return;
}
Example 6
/*
** The GiST PickSplit method for integer arrays
** We use Guttman's poly time split algorithm
*/
Datum
g_int_picksplit(PG_FUNCTION_ARGS)
{
    GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
    GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
    OffsetNumber i,
                 j;
    ArrayType  *datum_alpha,
               *datum_beta;
    ArrayType  *datum_l,
               *datum_r;
    ArrayType  *union_d,
               *union_dl,
               *union_dr;
    ArrayType  *inter_d;
    bool		firsttime;
    float		size_alpha,
                size_beta,
                size_union,
                size_inter;
    float		size_waste,
                waste;
    float		size_l,
                size_r;
    int			nbytes;
    OffsetNumber seed_1 = 0,
                 seed_2 = 0;
    OffsetNumber *left,
                 *right;
    OffsetNumber maxoff;
    SPLITCOST  *costvector;

#ifdef GIST_DEBUG
    elog(DEBUG3, "--------picksplit %d", entryvec->n);
#endif

    maxoff = entryvec->n - 2;
    nbytes = (maxoff + 2) * sizeof(OffsetNumber);
    v->spl_left = (OffsetNumber *) palloc(nbytes);
    v->spl_right = (OffsetNumber *) palloc(nbytes);

    firsttime = true;
    waste = 0.0;
    for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
    {
        datum_alpha = GETENTRY(entryvec, i);
        for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
        {
            datum_beta = GETENTRY(entryvec, j);

            /* compute the wasted space by unioning these guys */
            /* size_waste = size_union - size_inter; */
            union_d = inner_int_union(datum_alpha, datum_beta);
            rt__int_size(union_d, &size_union);
            inter_d = inner_int_inter(datum_alpha, datum_beta);
            rt__int_size(inter_d, &size_inter);
            size_waste = size_union - size_inter;

            pfree(union_d);

            if (inter_d != (ArrayType *) NULL)
                pfree(inter_d);

            /*
             * are these a more promising split than what we've already seen?
             */

            if (size_waste > waste || firsttime)
            {
                waste = size_waste;
                seed_1 = i;
                seed_2 = j;
                firsttime = false;
            }
        }
    }

    left = v->spl_left;
    v->spl_nleft = 0;
    right = v->spl_right;
    v->spl_nright = 0;
    if (seed_1 == 0 || seed_2 == 0)
    {
        seed_1 = 1;
        seed_2 = 2;
    }

    datum_alpha = GETENTRY(entryvec, seed_1);
    datum_l = copy_intArrayType(datum_alpha);
    rt__int_size(datum_l, &size_l);
    datum_beta = GETENTRY(entryvec, seed_2);
    datum_r = copy_intArrayType(datum_beta);
    rt__int_size(datum_r, &size_r);

    maxoff = OffsetNumberNext(maxoff);

    /*
     * sort entries
     */
    costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff);
    for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
    {
        costvector[i - 1].pos = i;
        datum_alpha = GETENTRY(entryvec, i);
        union_d = inner_int_union(datum_l, datum_alpha);
        rt__int_size(union_d, &size_alpha);
        pfree(union_d);
        union_d = inner_int_union(datum_r, datum_alpha);
        rt__int_size(union_d, &size_beta);
        pfree(union_d);
        costvector[i - 1].cost = Abs((size_alpha - size_l) - (size_beta - size_r));
    }
    qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost);

    /*
     * Now split up the regions between the two seeds.	An important property
     * of this split algorithm is that the split vector v has the indices of
     * items to be split in order in its left and right vectors.  We exploit
     * this property by doing a merge in the code that actually splits the
     * page.
     *
     * For efficiency, we also place the new index tuple in this loop. This is
     * handled at the very end, when we have placed all the existing tuples
     * and i == maxoff + 1.
     */


    for (j = 0; j < maxoff; j++)
    {
        i = costvector[j].pos;

        /*
         * If we've already decided where to place this item, just put it on
         * the right list.	Otherwise, we need to figure out which page needs
         * the least enlargement in order to store the item.
         */

        if (i == seed_1)
        {
            *left++ = i;
            v->spl_nleft++;
            continue;
        }
        else if (i == seed_2)
        {
            *right++ = i;
            v->spl_nright++;
            continue;
        }

        /* okay, which page needs least enlargement? */
        datum_alpha = GETENTRY(entryvec, i);
        union_dl = inner_int_union(datum_l, datum_alpha);
        union_dr = inner_int_union(datum_r, datum_alpha);
        rt__int_size(union_dl, &size_alpha);
        rt__int_size(union_dr, &size_beta);

        /* pick which page to add it to */
        if (size_alpha - size_l < size_beta - size_r + WISH_F(v->spl_nleft, v->spl_nright, 0.01))
        {
            if (datum_l)
                pfree(datum_l);
            if (union_dr)
                pfree(union_dr);
            datum_l = union_dl;
            size_l = size_alpha;
            *left++ = i;
            v->spl_nleft++;
        }
        else
        {
            if (datum_r)
                pfree(datum_r);
            if (union_dl)
                pfree(union_dl);
            datum_r = union_dr;
            size_r = size_beta;
            *right++ = i;
            v->spl_nright++;
        }
    }
    pfree(costvector);
    *right = *left = FirstOffsetNumber;

    datum_l->flags &= ~LEAFKEY;
    datum_r->flags &= ~LEAFKEY;
    v->spl_ldatum = PointerGetDatum(datum_l);
    v->spl_rdatum = PointerGetDatum(datum_r);

    PG_RETURN_POINTER(v);
}
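
Before distributing the remaining entries, g_int_picksplit orders them by how strongly each one prefers one seed over the other, i.e. by |enlargement toward the left seed - enlargement toward the right seed|. The sketch below only shows that cost computation on invented numbers; the real ordering is determined by comparecost(), which is not part of this listing, so the ascending sort here is just a toy choice.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { int pos; float cost; } SplitCost;	/* toy stand-in for SPLITCOST */

static int
compare_cost(const void *a, const void *b)
{
	float	ca = ((const SplitCost *) a)->cost;
	float	cb = ((const SplitCost *) b)->cost;

	return (ca > cb) - (ca < cb);	/* ascending by cost (toy choice) */
}

int main(void)
{
	/* invented enlargements toward the left and right seed for four entries */
	float		dl[] = {0.1f, 5.0f, 2.0f, 0.0f};
	float		dr[] = {4.0f, 0.2f, 2.1f, 6.0f};
	SplitCost	cv[4];

	for (int i = 0; i < 4; i++)
	{
		cv[i].pos = i;
		cv[i].cost = fabsf(dl[i] - dr[i]);	/* |delta_left - delta_right| */
	}
	qsort(cv, 4, sizeof(SplitCost), compare_cost);

	for (int i = 0; i < 4; i++)
		printf("entry %d, cost %.1f\n", cv[i].pos, cv[i].cost);
	return 0;
}
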
Example 7
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * This routine is used to partition the tuples between old and new bucket and
 * is also used to finish incomplete split operations.  To finish a previously
 * interrupted split operation, the caller needs to fill htab.  If htab is set,
 * then we skip moving the tuples that exist in htab; otherwise a NULL htab
 * indicates that all tuples belonging to the new bucket are to be moved.
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket.
 *
 * The caller must hold cleanup locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 *
 * Split needs to retain pin on primary bucket pages of both old and new
 * buckets till end of operation.  This is to prevent vacuum from starting
 * while a split is in progress.
 *
 * In addition, the caller must have created the new bucket's base page,
 * which is passed in buffer nbuf, pinned and write-locked.  The lock will be
 * released here and pin must be released by the caller.  (The API is set up
 * this way because we must do _hash_getnewbuf() before releasing the metapage
 * write lock.  So instead of passing the new bucket's start block number, we
 * pass an actual buffer.)
 */
static void
_hash_splitbucket(Relation rel,
				  Buffer metabuf,
				  Bucket obucket,
				  Bucket nbucket,
				  Buffer obuf,
				  Buffer nbuf,
				  HTAB *htab,
				  uint32 maxbucket,
				  uint32 highmask,
				  uint32 lowmask)
{
	Buffer		bucket_obuf;
	Buffer		bucket_nbuf;
	Page		opage;
	Page		npage;
	HashPageOpaque oopaque;
	HashPageOpaque nopaque;
	OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
	IndexTuple	itups[MaxIndexTuplesPerPage];
	Size		all_tups_size = 0;
	int			i;
	uint16		nitups = 0;

	bucket_obuf = obuf;
	opage = BufferGetPage(obuf);
	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

	bucket_nbuf = nbuf;
	npage = BufferGetPage(nbuf);
	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);

	/*
	 * Partition the tuples in the old bucket between the old bucket and the
	 * new bucket, advancing along the old bucket's overflow bucket chain and
	 * adding overflow pages to the new bucket as needed.  Outer loop iterates
	 * once per page in old bucket.
	 */
	for (;;)
	{
		BlockNumber oblkno;
		OffsetNumber ooffnum;
		OffsetNumber omaxoffnum;

		/* Scan each tuple in old page */
		omaxoffnum = PageGetMaxOffsetNumber(opage);
		for (ooffnum = FirstOffsetNumber;
			 ooffnum <= omaxoffnum;
			 ooffnum = OffsetNumberNext(ooffnum))
		{
			IndexTuple	itup;
			Size		itemsz;
			Bucket		bucket;
			bool		found = false;

			/* skip dead tuples */
			if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
				continue;

			/*
			 * Before inserting a tuple, probe the hash table containing TIDs
			 * of tuples belonging to the new bucket.  If we find a match,
			 * skip that tuple; otherwise fetch the item's hash key
			 * (conveniently stored in the item) and determine which bucket it
			 * now belongs in.
			 */
			itup = (IndexTuple) PageGetItem(opage,
											PageGetItemId(opage, ooffnum));

			if (htab)
				(void) hash_search(htab, &itup->t_tid, HASH_FIND, &found);

			if (found)
				continue;

			bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
										  maxbucket, highmask, lowmask);

			if (bucket == nbucket)
			{
				IndexTuple	new_itup;

				/*
				 * make a copy of index tuple as we have to scribble on it.
				 */
				new_itup = CopyIndexTuple(itup);

				/*
				 * mark the index tuple as moved by split, such tuples are
				 * skipped by scan if there is split in progress for a bucket.
				 */
				new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK;

				/*
				 * insert the tuple into the new bucket.  if it doesn't fit on
				 * the current page in the new bucket, we must allocate a new
				 * overflow page and place the tuple on that page instead.
				 */
				itemsz = IndexTupleDSize(*new_itup);
				itemsz = MAXALIGN(itemsz);

				if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz))
				{
					/*
					 * Change the shared buffer state in critical section,
					 * otherwise any error could make it unrecoverable.
					 */
					START_CRIT_SECTION();

					_hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
					MarkBufferDirty(nbuf);
					/* log the split operation before releasing the lock */
					log_split_page(rel, nbuf);

					END_CRIT_SECTION();

					/* drop lock, but keep pin */
					LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);

					/* be tidy */
					for (i = 0; i < nitups; i++)
						pfree(itups[i]);
					nitups = 0;
					all_tups_size = 0;

					/* chain to a new overflow page */
					nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? true : false);
					npage = BufferGetPage(nbuf);
					nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
				}

				itups[nitups++] = new_itup;
				all_tups_size += itemsz;
			}
			else
			{
				/*
				 * the tuple stays on this page, so nothing to do.
				 */
				Assert(bucket == obucket);
			}
		}

		oblkno = oopaque->hasho_nextblkno;

		/* retain the pin on the old primary bucket */
		if (obuf == bucket_obuf)
			LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
		else
			_hash_relbuf(rel, obuf);

		/* Exit loop if no more overflow pages in old bucket */
		if (!BlockNumberIsValid(oblkno))
		{
			/*
			 * Change the shared buffer state in critical section, otherwise
			 * any error could make it unrecoverable.
			 */
			START_CRIT_SECTION();

			_hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups);
			MarkBufferDirty(nbuf);
			/* log the split operation before releasing the lock */
			log_split_page(rel, nbuf);

			END_CRIT_SECTION();

			if (nbuf == bucket_nbuf)
				LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
			else
				_hash_relbuf(rel, nbuf);

			/* be tidy */
			for (i = 0; i < nitups; i++)
				pfree(itups[i]);
			break;
		}

		/* Else, advance to next old page */
		obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE);
		opage = BufferGetPage(obuf);
		oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
	}

	/*
	 * We're at the end of the old bucket chain, so we're done partitioning
	 * the tuples.  Mark the old and new buckets to indicate split is
	 * finished.
	 *
	 * To avoid deadlocks due to locking order of buckets, first lock the old
	 * bucket and then the new bucket.
	 */
	LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE);
	opage = BufferGetPage(bucket_obuf);
	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

	LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE);
	npage = BufferGetPage(bucket_nbuf);
	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);

	START_CRIT_SECTION();

	oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT;
	nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED;

	/*
	 * After the split is finished, mark the old bucket to indicate that it
	 * contains deletable tuples.  We will clear split-cleanup flag after
	 * deleting such tuples either at the end of split or at the next split
	 * from old bucket or at the time of vacuum.
	 */
	oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP;

	/*
	 * Now write the buffers.  We don't release the locks here, as the caller
	 * is responsible for releasing them.
	 */
	MarkBufferDirty(bucket_obuf);
	MarkBufferDirty(bucket_nbuf);

	if (RelationNeedsWAL(rel))
	{
		XLogRecPtr	recptr;
		xl_hash_split_complete xlrec;

		xlrec.old_bucket_flag = oopaque->hasho_flag;
		xlrec.new_bucket_flag = nopaque->hasho_flag;

		XLogBeginInsert();

		XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete);

		XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD);
		XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD);

		recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE);

		PageSetLSN(BufferGetPage(bucket_obuf), recptr);
		PageSetLSN(BufferGetPage(bucket_nbuf), recptr);
	}

	END_CRIT_SECTION();

	/*
	 * If possible, clean up the old bucket.  We might not be able to do this
	 * if someone else has a pin on it, but if not then we can go ahead.  This
	 * isn't absolutely necessary, but it reduces bloat; if we don't do it
	 * now, VACUUM will do it eventually, but maybe not until new overflow
	 * pages have been allocated.  Note that there's no need to clean up the
	 * new bucket.
	 */
	if (IsBufferCleanupOK(bucket_obuf))
	{
		LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
		hashbucketcleanup(rel, obucket, bucket_obuf,
						  BufferGetBlockNumber(bucket_obuf), NULL,
						  maxbucket, highmask, lowmask, NULL, NULL, true,
						  NULL, NULL);
	}
	else
	{
		LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK);
	}
}
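
Whether a tuple stays in obucket or moves to nbucket is decided by masking its hash key, mirroring _hash_hashkey2bucket() (hashutil.c). A standalone sketch of that masking logic, with made-up mask values:

#include <stdint.h>
#include <stdio.h>

/* mirrors the masking logic of _hash_hashkey2bucket() */
static uint32_t
hashkey2bucket(uint32_t hashkey, uint32_t maxbucket,
			   uint32_t highmask, uint32_t lowmask)
{
	uint32_t	bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket &= lowmask;		/* that bucket doesn't exist yet: fold back down */
	return bucket;
}

int main(void)
{
	/* made-up state: buckets 0..4 exist, so lowmask = 3 and highmask = 7 */
	uint32_t	maxbucket = 4, highmask = 7, lowmask = 3;
	uint32_t	keys[] = {0x1, 0x5, 0x6, 0xd};

	for (int i = 0; i < 4; i++)
		printf("hashkey 0x%x -> bucket %u\n",
			   keys[i], hashkey2bucket(keys[i], maxbucket, highmask, lowmask));
	return 0;
}
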
Example 8
Datum
readindex(PG_FUNCTION_ARGS)
{
	FuncCallContext	   *funcctx;
	readindexinfo	   *info;

	MIRROREDLOCK_BUFMGR_DECLARE;

	if (SRF_IS_FIRSTCALL())
	{
		Oid		irelid = PG_GETARG_OID(0);
		TupleDesc	tupdesc;
		MemoryContext oldcontext;
		AttrNumber		outattnum;
		Relation	irel;
		TupleDesc	itupdesc;
		int			i;
		AttrNumber	attno;

		irel = index_open(irelid, AccessShareLock);
		itupdesc = RelationGetDescr(irel);
		outattnum = FIXED_COLUMN + itupdesc->natts;

		funcctx = SRF_FIRSTCALL_INIT();
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		tupdesc = CreateTemplateTupleDesc(outattnum, false);
		attno = 1;
		TupleDescInitEntry(tupdesc, attno++, "ictid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hctid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "aotid", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "istatus", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hstatus", TEXTOID, -1, 0);

		for (i = 0; i < itupdesc->natts; i++)
		{
			Form_pg_attribute attr = itupdesc->attrs[i];
			TupleDescInitEntry(tupdesc, attno++, NameStr(attr->attname), attr->atttypid, attr->atttypmod, 0);
		}

		funcctx->tuple_desc = BlessTupleDesc(tupdesc);
		info = (readindexinfo *) palloc(sizeof(readindexinfo));
		funcctx->user_fctx = (void *) info;

		info->outattnum = outattnum;
		info->irel = irel;
		info->hrel = relation_open(irel->rd_index->indrelid, AccessShareLock);
		if (info->hrel->rd_rel != NULL &&
				(info->hrel->rd_rel->relstorage == 'a' ||
				 info->hrel->rd_rel->relstorage == 'c'))
		{
			relation_close(info->hrel, AccessShareLock);
			info->hrel = NULL;
		}
		info->num_pages = RelationGetNumberOfBlocks(irel);
		info->blkno = BTREE_METAPAGE + 1;
		info->page = NULL;

		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();
	info = (readindexinfo *) funcctx->user_fctx;

	while (info->blkno < info->num_pages)
	{
		Datum		values[255];
		bool		nulls[255];
		ItemPointerData		itid;
		HeapTuple	tuple;
		Datum		result;

		if (info->page == NULL)
		{
			MIRROREDLOCK_BUFMGR_LOCK;
			info->buf = ReadBuffer(info->irel, info->blkno);
			info->page = BufferGetPage(info->buf);
			info->opaque = (BTPageOpaque) PageGetSpecialPointer(info->page);
			info->minoff = P_FIRSTDATAKEY(info->opaque);
			info->maxoff = PageGetMaxOffsetNumber(info->page);
			info->offnum = info->minoff;
			MIRROREDLOCK_BUFMGR_UNLOCK;
		}
		if (!P_ISLEAF(info->opaque) || info->offnum > info->maxoff)
		{
			ReleaseBuffer(info->buf);
			info->page = NULL;
			info->blkno++;
			continue;
		}

		MemSet(nulls, false, info->outattnum * sizeof(bool));

		ItemPointerSet(&itid, info->blkno, info->offnum);
		values[0] = ItemPointerGetDatum(&itid);
		readindextuple(info, values, nulls);

		info->offnum = OffsetNumberNext(info->offnum);

		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
		result = HeapTupleGetDatum(tuple);
		SRF_RETURN_NEXT(funcctx, result);
	}

	if (info->hrel != NULL)
		relation_close(info->hrel, AccessShareLock);
	index_close(info->irel, AccessShareLock);
	SRF_RETURN_DONE(funcctx);
}
Example 9
/*
 * Write the index tuples contained in *collector into the index's
 * pending list.
 *
 * Function guarantees that all these tuples will be inserted consecutively,
 * preserving order
 */
void
ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
{
	Relation	index = ginstate->index;
	Buffer		metabuffer;
	Page		metapage;
	GinMetaPageData *metadata = NULL;
	Buffer		buffer = InvalidBuffer;
	Page		page = NULL;
	ginxlogUpdateMeta data;
	bool		separateList = false;
	bool		needCleanup = false;
	int			cleanupSize;
	bool		needWal;

	if (collector->ntuples == 0)
		return;

	needWal = RelationNeedsWAL(index);

	data.node = index->rd_node;
	data.ntuples = 0;
	data.newRightlink = data.prevTail = InvalidBlockNumber;

	metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO);
	metapage = BufferGetPage(metabuffer);

	if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize)
	{
		/*
		 * Total size is greater than one page => make sublist
		 */
		separateList = true;
	}
	else
	{
		LockBuffer(metabuffer, GIN_EXCLUSIVE);
		metadata = GinPageGetMeta(metapage);

		if (metadata->head == InvalidBlockNumber ||
			collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize)
		{
			/*
			 * Pending list is empty or total size is greater than freespace
			 * on tail page => make sublist
			 *
			 * We unlock metabuffer to keep high concurrency
			 */
			separateList = true;
			LockBuffer(metabuffer, GIN_UNLOCK);
		}
	}

	if (separateList)
	{
		/*
		 * We should make sublist separately and append it to the tail
		 */
		GinMetaPageData sublist;

		memset(&sublist, 0, sizeof(GinMetaPageData));
		makeSublist(index, collector->tuples, collector->ntuples, &sublist);

		if (needWal)
			XLogBeginInsert();

		/*
		 * metapage was unlocked, see above
		 */
		LockBuffer(metabuffer, GIN_EXCLUSIVE);
		metadata = GinPageGetMeta(metapage);

		if (metadata->head == InvalidBlockNumber)
		{
			/*
			 * Main list is empty, so just insert sublist as main list
			 */
			START_CRIT_SECTION();

			metadata->head = sublist.head;
			metadata->tail = sublist.tail;
			metadata->tailFreeSize = sublist.tailFreeSize;

			metadata->nPendingPages = sublist.nPendingPages;
			metadata->nPendingHeapTuples = sublist.nPendingHeapTuples;
		}
		else
		{
			/*
			 * Merge lists
			 */
			data.prevTail = metadata->tail;
			data.newRightlink = sublist.head;

			buffer = ReadBuffer(index, metadata->tail);
			LockBuffer(buffer, GIN_EXCLUSIVE);
			page = BufferGetPage(buffer);

			Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber);

			START_CRIT_SECTION();

			GinPageGetOpaque(page)->rightlink = sublist.head;

			MarkBufferDirty(buffer);

			metadata->tail = sublist.tail;
			metadata->tailFreeSize = sublist.tailFreeSize;

			metadata->nPendingPages += sublist.nPendingPages;
			metadata->nPendingHeapTuples += sublist.nPendingHeapTuples;

			if (needWal)
				XLogRegisterBuffer(1, buffer, REGBUF_STANDARD);
		}
	}
	else
	{
		/*
		 * Insert into tail page.  Metapage is already locked
		 */
		OffsetNumber l,
					off;
		int			i,
					tupsize;
		char	   *ptr;
		char	   *collectordata;

		buffer = ReadBuffer(index, metadata->tail);
		LockBuffer(buffer, GIN_EXCLUSIVE);
		page = BufferGetPage(buffer);

		off = (PageIsEmpty(page)) ? FirstOffsetNumber :
			OffsetNumberNext(PageGetMaxOffsetNumber(page));

		collectordata = ptr = (char *) palloc(collector->sumsize);

		data.ntuples = collector->ntuples;

		if (needWal)
			XLogBeginInsert();

		START_CRIT_SECTION();

		/*
		 * Increase counter of heap tuples
		 */
		Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples);
		GinPageGetOpaque(page)->maxoff++;
		metadata->nPendingHeapTuples++;

		for (i = 0; i < collector->ntuples; i++)
		{
			tupsize = IndexTupleSize(collector->tuples[i]);
			l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false);

			if (l == InvalidOffsetNumber)
				elog(ERROR, "failed to add item to index page in \"%s\"",
					 RelationGetRelationName(index));

			memcpy(ptr, collector->tuples[i], tupsize);
			ptr += tupsize;

			off++;
		}

		Assert((ptr - collectordata) <= collector->sumsize);
		if (needWal)
		{
			XLogRegisterBuffer(1, buffer, REGBUF_STANDARD);
			XLogRegisterBufData(1, collectordata, collector->sumsize);
		}

		metadata->tailFreeSize = PageGetExactFreeSpace(page);

		MarkBufferDirty(buffer);
	}

	/*
	 * Write metabuffer, make xlog entry
	 */
	MarkBufferDirty(metabuffer);

	if (needWal)
	{
		XLogRecPtr	recptr;

		memcpy(&data.metadata, metadata, sizeof(GinMetaPageData));

		XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT);
		XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta));

		recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE);
		PageSetLSN(metapage, recptr);

		if (buffer != InvalidBuffer)
		{
			PageSetLSN(page, recptr);
		}
	}

	if (buffer != InvalidBuffer)
		UnlockReleaseBuffer(buffer);

	/*
	 * Force pending list cleanup when it becomes too long.  And, because
	 * ginInsertCleanup could take a significant amount of time, we prefer to
	 * call it when it can do all the work in a single collection cycle. In
	 * non-vacuum mode, it shouldn't require maintenance_work_mem, so fire it
	 * while pending list is still small enough to fit into
	 * gin_pending_list_limit.
	 *
	 * ginInsertCleanup() should not be called inside our CRIT_SECTION.
	 */
	cleanupSize = GinGetPendingListCleanupSize(index);
	if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * 1024L)
		needCleanup = true;

	UnlockReleaseBuffer(metabuffer);

	END_CRIT_SECTION();

	if (needCleanup)
		ginInsertCleanup(ginstate, true, NULL);
}
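
Two size checks drive this function: the new tuples are appended to the existing tail page only if they fit in its free space (otherwise a separate sublist is built and linked in), and a cleanup pass is requested once the pending list exceeds the configured limit in kilobytes. A toy sketch of that arithmetic with invented numbers (only gin_pending_list_limit is a real setting):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

int main(void)
{
	size_t	tail_free_size = 512;		/* bytes free on the current tail page */
	size_t	batch_size = 900;			/* bytes needed by the collected tuples */
	long	pending_pages = 600;		/* pages currently in the pending list */
	long	page_free_size = 8000;		/* usable bytes per pending page (invented) */
	long	cleanup_limit_kb = 4096;	/* like gin_pending_list_limit, in kB */

	bool	make_sublist = batch_size > tail_free_size;
	bool	need_cleanup = pending_pages * page_free_size > cleanup_limit_kb * 1024L;

	printf("make sublist: %s\n", make_sublist ? "yes" : "no");
	printf("need cleanup: %s\n", need_cleanup ? "yes" : "no");
	return 0;
}
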
Example 10
File: hash.c  Project: GisKook/Gis
/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */
Datum
hashgettuple(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	Buffer		buf;
	Page		page;
	OffsetNumber offnum;
	ItemPointer current;
	bool		res;

	/* Hash indexes are always lossy since we store only the hash code */
	scan->xs_recheck = true;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine to
	 * get the first item in the scan.
	 */
	current = &(so->hashso_curpos);
	if (ItemPointerIsValid(current))
	{
		/*
		 * An insertion into the current index page could have happened while
		 * we didn't have read lock on it.  Re-find our position by looking
		 * for the TID we previously returned.	(Because we hold share lock on
		 * the bucket, no deletions or splits could have occurred; therefore
		 * we can expect that the TID still exists in the current index page,
		 * at an offset >= where we were.)
		 */
		OffsetNumber maxoffnum;

		buf = so->hashso_curbuf;
		Assert(BufferIsValid(buf));
		page = BufferGetPage(buf);
		maxoffnum = PageGetMaxOffsetNumber(page);
		for (offnum = ItemPointerGetOffsetNumber(current);
			 offnum <= maxoffnum;
			 offnum = OffsetNumberNext(offnum))
		{
			IndexTuple	itup;

			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
			if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
				break;
		}
		if (offnum > maxoffnum)
			elog(ERROR, "failed to re-find scan position within index \"%s\"",
				 RelationGetRelationName(rel));
		ItemPointerSetOffsetNumber(current, offnum);

		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so mark it by setting the LP_DEAD state in the item flags.
			 */
			ItemIdMarkDead(PageGetItemId(page, offnum));

			/*
			 * Since this can be redone later if needed, it's treated the same
			 * as a commit-hint-bit status update for heap tuples: we mark the
			 * buffer dirty but don't make a WAL log entry.
			 */
			SetBufferCommitInfoNeedsSave(buf);
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}
	else
		res = _hash_first(scan, dir);

	/*
	 * Skip killed tuples if asked to.
	 */
	if (scan->ignore_killed_tuples)
	{
		while (res)
		{
			offnum = ItemPointerGetOffsetNumber(current);
			page = BufferGetPage(so->hashso_curbuf);
			if (!ItemIdIsDead(PageGetItemId(page, offnum)))
				break;
			res = _hash_next(scan, dir);
		}
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

	/* Return current heap TID on success */
	scan->xs_ctup.t_self = so->hashso_heappos;

	PG_RETURN_BOOL(res);
}
Example 11
File: hash.c  Project: GisKook/Gis
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashbulkdelete(PG_FUNCTION_ARGS)
{
	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
	void	   *callback_state = (void *) PG_GETARG_POINTER(3);
	Relation	rel = info->index;
	double		tuples_removed;
	double		num_index_tuples;
	double		orig_ntuples;
	Bucket		orig_maxbucket;
	Bucket		cur_maxbucket;
	Bucket		cur_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPageData local_metapage;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * Read the metapage to fetch original bucket and tuple counts.  Also, we
	 * keep a copy of the last-seen metapage so that we can use its
	 * hashm_spares[] values to compute bucket page addresses.	This is a bit
	 * hokey but perfectly safe, since the interesting entries in the spares
	 * array cannot change under us; and it beats rereading the metapage for
	 * each bucket.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));
	orig_maxbucket = metap->hashm_maxbucket;
	orig_ntuples = metap->hashm_ntuples;
	memcpy(&local_metapage, metap, sizeof(local_metapage));
	_hash_relbuf(rel, metabuf);

	/* Scan the buckets that we know exist */
	cur_bucket = 0;
	cur_maxbucket = orig_maxbucket;

loop_top:
	while (cur_bucket <= cur_maxbucket)
	{
		BlockNumber bucket_blkno;
		BlockNumber blkno;
		bool		bucket_dirty = false;

		/* Get address of bucket's start page */
		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

		/* Exclusive-lock the bucket so we can shrink it */
		_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Shouldn't have any active scans locally, either */
		if (_hash_has_active_scan(rel, cur_bucket))
			elog(ERROR, "hash index has active scan during VACUUM");

		/* Scan each page in bucket */
		blkno = bucket_blkno;
		while (BlockNumberIsValid(blkno))
		{
			Buffer		buf;
			Page		page;
			HashPageOpaque opaque;
			OffsetNumber offno;
			OffsetNumber maxoffno;
			OffsetNumber deletable[MaxOffsetNumber];
			int			ndeletable = 0;

			vacuum_delay_point();

			buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
										   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
											 info->strategy);
			page = BufferGetPage(buf);
			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
			Assert(opaque->hasho_bucket == cur_bucket);

			/* Scan each tuple in page */
			maxoffno = PageGetMaxOffsetNumber(page);
			for (offno = FirstOffsetNumber;
				 offno <= maxoffno;
				 offno = OffsetNumberNext(offno))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offno));
				htup = &(itup->t_tid);
				if (callback(htup, callback_state))
				{
					/* mark the item for deletion */
					deletable[ndeletable++] = offno;
					tuples_removed += 1;
				}
				else
					num_index_tuples += 1;
			}

			/*
			 * Apply deletions and write page if needed, advance to next page.
			 */
			blkno = opaque->hasho_nextblkno;

			if (ndeletable > 0)
			{
				PageIndexMultiDelete(page, deletable, ndeletable);
				_hash_wrtbuf(rel, buf);
				bucket_dirty = true;
			}
			else
				_hash_relbuf(rel, buf);
		}

		/* If we deleted anything, try to compact free space */
		if (bucket_dirty)
			_hash_squeezebucket(rel, cur_bucket, bucket_blkno,
								info->strategy);

		/* Release bucket lock */
		_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Advance to next bucket */
		cur_bucket++;
	}

	/* Write-lock metapage and check for split since we started */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	if (cur_maxbucket != metap->hashm_maxbucket)
	{
		/* There's been a split, so process the additional bucket(s) */
		cur_maxbucket = metap->hashm_maxbucket;
		memcpy(&local_metapage, metap, sizeof(local_metapage));
		_hash_relbuf(rel, metabuf);
		goto loop_top;
	}

	/* Okay, we're really done.  Update tuple count in metapage. */

	if (orig_maxbucket == metap->hashm_maxbucket &&
		orig_ntuples == metap->hashm_ntuples)
	{
		/*
		 * No one has split or inserted anything since start of scan, so
		 * believe our count as gospel.
		 */
		metap->hashm_ntuples = num_index_tuples;
	}
	else
	{
		/*
		 * Otherwise, our count is untrustworthy since we may have
		 * double-scanned tuples in split buckets.	Proceed by dead-reckoning.
		 * (Note: we still return estimated_count = false, because using this
		 * count is better than not updating reltuples at all.)
		 */
		if (metap->hashm_ntuples > tuples_removed)
			metap->hashm_ntuples -= tuples_removed;
		else
			metap->hashm_ntuples = 0;
		num_index_tuples = metap->hashm_ntuples;
	}

	_hash_wrtbuf(rel, metabuf);

	/* return statistics */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	stats->estimated_count = false;
	stats->num_index_tuples = num_index_tuples;
	stats->tuples_removed += tuples_removed;
	/* hashvacuumcleanup will fill in num_pages */

	PG_RETURN_POINTER(stats);
}
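
The deletion itself is driven by the callback: for each index page, the scan collects the offsets of the entries whose heap TID the callback reports as dead, then removes them in a single multi-delete pass. A minimal sketch with an invented callback and plain ints standing in for TIDs:

#include <stdbool.h>
#include <stdio.h>

typedef bool (*DeleteCallback) (int heap_tid, void *state);

/* invented callback: pretend every third heap row has been deleted */
static bool
is_dead(int heap_tid, void *state)
{
	(void) state;
	return heap_tid % 3 == 0;
}

int main(void)
{
	int		page_tids[] = {4, 6, 7, 9, 10};	/* heap TIDs referenced by one index page */
	int		deletable[5];
	int		ndeletable = 0;
	DeleteCallback callback = is_dead;

	for (int off = 0; off < 5; off++)
		if (callback(page_tids[off], NULL))
			deletable[ndeletable++] = off;	/* mark the item for deletion */

	printf("deleting %d of 5 entries\n", ndeletable);
	for (int i = 0; i < ndeletable; i++)
		printf("  offset %d (heap tid %d)\n", deletable[i], page_tids[deletable[i]]);
	return 0;
}
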
Example 12
Datum geography_gist_picksplit(PG_FUNCTION_ARGS)
{

	GistEntryVector	*entryvec = (GistEntryVector*) PG_GETARG_POINTER(0);

	GIST_SPLITVEC *v = (GIST_SPLITVEC*) PG_GETARG_POINTER(1);
	OffsetNumber i;
	/* One union box for each half of the space. */
	GIDX **box_union;
	/* One offset number list for each half of the space. */
	OffsetNumber **list;
	/* One position index for each half of the space. */
	int *pos;
	GIDX *box_pageunion;
	GIDX *box_current;
	int direction = -1;
	bool all_entries_equal = true;
	OffsetNumber max_offset;
	int nbytes, ndims_pageunion, d;
	int posmax = -1;

	POSTGIS_DEBUG(4, "[GIST] 'picksplit' function called");

	/*
	** First calculate the bounding box and maximum number of dimensions in this page.
	*/

	max_offset = entryvec->n - 1;
	box_current = (GIDX*) DatumGetPointer(entryvec->vector[FirstOffsetNumber].key);
	box_pageunion = gidx_copy(box_current);

	/* Calculate the containing box (box_pageunion) for the whole page we are going to split. */
	for ( i = OffsetNumberNext(FirstOffsetNumber); i <= max_offset; i = OffsetNumberNext(i) )
	{
		box_current = (GIDX*) DatumGetPointer(entryvec->vector[i].key);

		if ( all_entries_equal == true && ! gidx_equals (box_pageunion, box_current) )
			all_entries_equal = false;

		gidx_merge( &box_pageunion, box_current );
	}

	POSTGIS_DEBUGF(3, "[GIST] box_pageunion: %s", gidx_to_string(box_pageunion));

	/* Every box in the page is the same! So, we split and just put half the boxes in each child. */
	if ( all_entries_equal )
	{
		POSTGIS_DEBUG(4, "[GIST] picksplit finds all entries equal!");
		geography_gist_picksplit_fallback(entryvec, v);
		PG_RETURN_POINTER(v);
	}

	/* Initialize memory structures. */
	nbytes = (max_offset + 2) * sizeof(OffsetNumber);
	ndims_pageunion = GIDX_NDIMS(box_pageunion);
	POSTGIS_DEBUGF(4, "[GIST] ndims_pageunion == %d", ndims_pageunion);
	pos = palloc(2*ndims_pageunion * sizeof(int));
	list = palloc(2*ndims_pageunion * sizeof(OffsetNumber*));
	box_union = palloc(2*ndims_pageunion * sizeof(GIDX*));
	for ( d = 0; d < ndims_pageunion; d++ )
	{
		list[BELOW(d)] = (OffsetNumber*) palloc(nbytes);
		list[ABOVE(d)] = (OffsetNumber*) palloc(nbytes);
		box_union[BELOW(d)] = gidx_new(ndims_pageunion);
		box_union[ABOVE(d)] = gidx_new(ndims_pageunion);
		pos[BELOW(d)] = 0;
		pos[ABOVE(d)] = 0;
	}

	/*
	** Assign each entry in the node to the volume partitions it belongs to,
	** such as "above the x/y plane, left of the y/z plane, below the x/z plane".
	** Each entry thereby ends up in three of the six partitions.
	*/
	POSTGIS_DEBUG(4, "[GIST] 'picksplit' calculating best split axis");
	for ( i = FirstOffsetNumber; i <= max_offset; i = OffsetNumberNext(i) )
	{
		box_current = (GIDX*) DatumGetPointer(entryvec->vector[i].key);

		for ( d = 0; d < ndims_pageunion; d++ )
		{
			if ( GIDX_GET_MIN(box_current,d)-GIDX_GET_MIN(box_pageunion,d) < GIDX_GET_MAX(box_pageunion,d)-GIDX_GET_MAX(box_current,d) )
			{
				geography_gist_picksplit_addlist(list[BELOW(d)], &(box_union[BELOW(d)]), box_current, &(pos[BELOW(d)]), i);
			}
			else
			{
				geography_gist_picksplit_addlist(list[ABOVE(d)], &(box_union[ABOVE(d)]), box_current, &(pos[ABOVE(d)]), i);
			}

		}

	}

	/*
	** "Bad disposition", too many entries fell into one octant of the space, so no matter which
	** plane we choose to split on, we're going to end up with a mostly full node. Where the
	** data is pretty homogeneous (lots of duplicates) entries that are equidistant from the
	** sides of the page union box can occasionally all end up in one place, leading
	** to this condition.
	*/
	if ( geography_gist_picksplit_badratios(pos,ndims_pageunion) == TRUE )
	{
		/*
		** Instead we split on center points and see if we do better.
		** First calculate the average center point for each axis.
		*/
		double *avgCenter = palloc(ndims_pageunion * sizeof(double));

		for ( d = 0; d < ndims_pageunion; d++ )
		{
			avgCenter[d] = 0.0;
		}

		POSTGIS_DEBUG(4, "[GIST] picksplit can't find good split axis, trying center point method");

		for ( i = FirstOffsetNumber; i <= max_offset; i = OffsetNumberNext(i) )
		{
			box_current = (GIDX*) DatumGetPointer(entryvec->vector[i].key);
			for ( d = 0; d < ndims_pageunion; d++ )
			{
				avgCenter[d] += (GIDX_GET_MAX(box_current,d) + GIDX_GET_MIN(box_current,d)) / 2.0;
			}
		}
		for ( d = 0; d < ndims_pageunion; d++ )
		{
			avgCenter[d] /= max_offset;
			pos[BELOW(d)] = pos[ABOVE(d)] = 0; /* Re-initialize our counters. */
			POSTGIS_DEBUGF(4, "[GIST] picksplit average center point[%d] = %.12g", d, avgCenter[d]);
		}

		/* For each of our entries... */
		for ( i = FirstOffsetNumber; i <= max_offset; i = OffsetNumberNext(i) )
		{
			double center;
			box_current = (GIDX*) DatumGetPointer(entryvec->vector[i].key);

			for ( d = 0; d < ndims_pageunion; d++ )
			{
				center = (GIDX_GET_MIN(box_current,d)+GIDX_GET_MAX(box_current,d))/2.0;
				if ( center < avgCenter[d] )
				{
					geography_gist_picksplit_addlist(list[BELOW(d)], &(box_union[BELOW(d)]), box_current, &(pos[BELOW(d)]), i);
				}
				else if ( FPeq(center, avgCenter[d]) )
				{
					if ( pos[BELOW(d)] > pos[ABOVE(d)] )
						geography_gist_picksplit_addlist(list[ABOVE(d)], &(box_union[ABOVE(d)]), box_current, &(pos[ABOVE(d)]), i);
					else
						geography_gist_picksplit_addlist(list[BELOW(d)], &(box_union[BELOW(d)]), box_current, &(pos[BELOW(d)]), i);
				}
				else
				{
					geography_gist_picksplit_addlist(list[ABOVE(d)], &(box_union[ABOVE(d)]), box_current, &(pos[ABOVE(d)]), i);
				}
			}

		}

		/* Do we have a good disposition now? If not, screw it, just cut the node in half. */
		if ( geography_gist_picksplit_badratios(pos,ndims_pageunion) == TRUE )
		{
			POSTGIS_DEBUG(4, "[GIST] picksplit still cannot find a good split! just cutting the node in half");
			geography_gist_picksplit_fallback(entryvec, v);
			PG_RETURN_POINTER(v);
		}

	}

	/*
	** Now, what splitting plane gives us the most even ratio of
	** entries in our child pages? Since each split region has been apportioned entries
	** against the same number of total entries, the axis that has the smallest maximum
	** number of entries in its regions is the most evenly distributed.
	** TODO: what if the distributions are equal in two or more axes?
	*/
	for ( d = 0; d < ndims_pageunion; d++ )
	{
		int posd = Max(pos[ABOVE(d)],pos[BELOW(d)]);
		if ( posmax == -1 || posd < posmax )
		{
			direction = d;
			posmax = posd;
		}
	}
	if ( direction == -1 || posmax == -1 )
	{
		/* ERROR OUT HERE */
		elog(ERROR, "Error in building split, unable to determine split direction.");
	}

	POSTGIS_DEBUGF(3, "[GIST] 'picksplit' splitting on axis %d", direction);

	geography_gist_picksplit_constructsplit(v, list[BELOW(direction)],
	                                        pos[BELOW(direction)],
	                                        &(box_union[BELOW(direction)]),
	                                        list[ABOVE(direction)],
	                                        pos[ABOVE(direction)],
	                                        &(box_union[ABOVE(direction)]) );

	POSTGIS_DEBUGF(4, "[GIST] spl_ldatum: %s", gidx_to_string((GIDX*)v->spl_ldatum));
	POSTGIS_DEBUGF(4, "[GIST] spl_rdatum: %s", gidx_to_string((GIDX*)v->spl_rdatum));

	POSTGIS_DEBUGF(4, "[GIST] axis %d: parent range (%.12g, %.12g) left range (%.12g, %.12g), right range (%.12g, %.12g)",
	               direction,
	               GIDX_GET_MIN(box_pageunion, direction), GIDX_GET_MAX(box_pageunion, direction),
	               GIDX_GET_MIN((GIDX*)v->spl_ldatum, direction), GIDX_GET_MAX((GIDX*)v->spl_ldatum, direction),
	               GIDX_GET_MIN((GIDX*)v->spl_rdatum, direction), GIDX_GET_MAX((GIDX*)v->spl_rdatum, direction) );

	PG_RETURN_POINTER(v);

}
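
The partition assignment above reduces to a single comparison per axis: an entry lands in the BELOW(d) partition when its lower edge is nearer to the page union's lower edge than its upper edge is to the union's upper edge. A minimal standalone sketch of that rule, using a hypothetical plain-float SimpleBox type instead of the real GIDX accessors:

#include <stdbool.h>

/* Hypothetical simplified box: one min/max pair per dimension (up to 4 dims). */
typedef struct SimpleBox
{
	float	min[4];
	float	max[4];
} SimpleBox;

/*
 * True when the entry belongs in the BELOW(d) partition: its lower edge is
 * closer to the page union's lower edge than its upper edge is to the page
 * union's upper edge (the same comparison made with GIDX_GET_MIN/MAX above).
 */
static bool
assign_below(const SimpleBox *entry, const SimpleBox *pageunion, int d)
{
	return (entry->min[d] - pageunion->min[d]) <
		   (pageunion->max[d] - entry->max[d]);
}

Evaluating assign_below() once per dimension for every entry reproduces the BELOW/ABOVE counting that the calls to geography_gist_picksplit_addlist perform in the loop above.
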
Example no. 13
/*
 *	_bt_readpage() -- Load data from current index page into so->currPos
 *
 * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
 * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
 * they are updated as appropriate.  All other fields of so->currPos are
 * initialized from scratch here.
 *
 * We scan the current page starting at offnum and moving in the indicated
 * direction.  All items matching the scan keys are loaded into currPos.items.
 * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
 * that there can be no more matching tuples in the current scan direction.
 *
 * Returns true if any matching items found on the page, false if none.
 */
static bool
_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	int			itemIndex;
	bool		continuescan;

	/* we must have the buffer pinned and locked */
	Assert(BufferIsValid(so->currPos.buf));

	page = BufferGetPage(so->currPos.buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	/*
	 * we must save the page's right-link while scanning it; this tells us
	 * where to step right to after we're done with these items.  There is no
	 * corresponding need for the left-link, since splits always go right.
	 */
	so->currPos.nextPage = opaque->btpo_next;

	if (ScanDirectionIsForward(dir))
	{
		/* load items[] in ascending order */
		itemIndex = 0;

		offnum = Max(offnum, minoff);

		while (offnum <= maxoff)
		{
			if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
			{
				/* tuple passes all scan key conditions, so remember it */
				/* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
				so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
				so->currPos.items[itemIndex].indexOffset = offnum;
				itemIndex++;
			}
			if (!continuescan)
			{
				/* there can't be any more matches, so stop */
				so->currPos.moreRight = false;
				break;
			}

			offnum = OffsetNumberNext(offnum);
		}

		Assert(itemIndex <= MaxIndexTuplesPerPage);
		so->currPos.firstItem = 0;
		so->currPos.lastItem = itemIndex - 1;
		so->currPos.itemIndex = 0;
	}
	else
	{
		/* load items[] in descending order */
		itemIndex = MaxIndexTuplesPerPage;

		offnum = Min(offnum, maxoff);

		while (offnum >= minoff)
		{
			if (_bt_checkkeys(scan, page, offnum, dir, &continuescan))
			{
				/* tuple passes all scan key conditions, so remember it */
				/* _bt_checkkeys put the heap ptr into scan->xs_ctup.t_self */
				itemIndex--;
				so->currPos.items[itemIndex].heapTid = scan->xs_ctup.t_self;
				so->currPos.items[itemIndex].indexOffset = offnum;
			}
			if (!continuescan)
			{
				/* there can't be any more matches, so stop */
				so->currPos.moreLeft = false;
				break;
			}

			offnum = OffsetNumberPrev(offnum);
		}

		Assert(itemIndex >= 0);
		so->currPos.firstItem = itemIndex;
		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
	}

	return (so->currPos.firstItem <= so->currPos.lastItem);
}
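
The two branches above fill so->currPos.items[] from opposite ends of a fixed-size array, which is why firstItem/lastItem always describe one contiguous run regardless of scan direction. A self-contained sketch of that indexing scheme, with a hypothetical capacity and an even-number test standing in for _bt_checkkeys:

#include <stdbool.h>

#define SKETCH_MAX_ITEMS 8

/*
 * Fill matching values front-to-back (forward scan) or back-to-front
 * (backward scan) into out[SKETCH_MAX_ITEMS], and report the [first, last]
 * range that was used.  Returns true if anything matched.
 */
static bool
collect_matches(const int *values, int nvalues, bool forward,
				int *out, int *first, int *last)
{
	int		i;

	if (forward)
	{
		int		idx = 0;

		for (i = 0; i < nvalues && idx < SKETCH_MAX_ITEMS; i++)
			if (values[i] % 2 == 0)		/* stand-in for _bt_checkkeys */
				out[idx++] = values[i];
		*first = 0;
		*last = idx - 1;
	}
	else
	{
		int		idx = SKETCH_MAX_ITEMS;

		for (i = nvalues - 1; i >= 0 && idx > 0; i--)
			if (values[i] % 2 == 0)
				out[--idx] = values[i];
		*first = idx;
		*last = SKETCH_MAX_ITEMS - 1;
	}
	return (*first <= *last);
}
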
Example no. 14
/*
 *	_hash_squeezebucket(rel, bucket)
 *
 *	Try to squeeze the tuples onto pages occurring earlier in the
 *	bucket chain in an attempt to free overflow pages. When we start
 *	the "squeezing", the page from which we start taking tuples (the
 *	"read" page) is the last page in the bucket chain and the page
 *	onto which we start squeezing tuples (the "write" page) is the
 *	first page in the bucket chain.  The read page works backward and
 *	the write page works forward; the procedure terminates when the
 *	read page and write page are the same page.
 *
 *	At completion of this procedure, it is guaranteed that all pages in
 *	the bucket are nonempty, unless the bucket is totally empty (in
 *	which case all overflow pages will be freed).  The original implementation
 *	required that to be true on entry as well, but it's a lot easier for
 *	callers to leave empty overflow pages and let this guy clean it up.
 *
 *	Caller must hold exclusive lock on the target bucket.  This allows
 *	us to safely lock multiple pages in the bucket.
 *
 *	Since this function is invoked in VACUUM, we provide an access strategy
 *	parameter that controls fetches of the bucket pages.
 */
void
_hash_squeezebucket(Relation rel,
					Bucket bucket,
					BlockNumber bucket_blkno,
					BufferAccessStrategy bstrategy)
{
	BlockNumber wblkno;
	BlockNumber rblkno;
	Buffer		wbuf;
	Buffer		rbuf;
	Page		wpage;
	Page		rpage;
	HashPageOpaque wopaque;
	HashPageOpaque ropaque;
	bool		wbuf_dirty;

	/*
	 * start squeezing into the base bucket page.
	 */
	wblkno = bucket_blkno;
	wbuf = _hash_getbuf_with_strategy(rel,
									  wblkno,
									  HASH_WRITE,
									  LH_BUCKET_PAGE,
									  bstrategy);
	wpage = BufferGetPage(wbuf);
	wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);

	/*
	 * if there aren't any overflow pages, there's nothing to squeeze.
	 */
	if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
	{
		_hash_relbuf(rel, wbuf);
		return;
	}

	/*
	 * Find the last page in the bucket chain by starting at the base bucket
	 * page and working forward.  Note: we assume that a hash bucket chain is
	 * usually smaller than the buffer ring being used by VACUUM, else using
	 * the access strategy here would be counterproductive.
	 */
	rbuf = InvalidBuffer;
	ropaque = wopaque;
	do
	{
		rblkno = ropaque->hasho_nextblkno;
		if (rbuf != InvalidBuffer)
			_hash_relbuf(rel, rbuf);
		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	} while (BlockNumberIsValid(ropaque->hasho_nextblkno));

	/*
	 * squeeze the tuples.
	 */
	wbuf_dirty = false;
	for (;;)
	{
		OffsetNumber roffnum;
		OffsetNumber maxroffnum;
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable = 0;

		/* Scan each tuple in "read" page */
		maxroffnum = PageGetMaxOffsetNumber(rpage);
		for (roffnum = FirstOffsetNumber;
			 roffnum <= maxroffnum;
			 roffnum = OffsetNumberNext(roffnum))
		{
			IndexTuple	itup;
			Size		itemsz;

			itup = (IndexTuple) PageGetItem(rpage,
											PageGetItemId(rpage, roffnum));
			itemsz = IndexTupleDSize(*itup);
			itemsz = MAXALIGN(itemsz);

			/*
			 * Walk up the bucket chain, looking for a page big enough for
			 * this item.  Exit if we reach the read page.
			 */
			while (PageGetFreeSpace(wpage) < itemsz)
			{
				Assert(!PageIsEmpty(wpage));

				wblkno = wopaque->hasho_nextblkno;
				Assert(BlockNumberIsValid(wblkno));

				if (wbuf_dirty)
					_hash_wrtbuf(rel, wbuf);
				else
					_hash_relbuf(rel, wbuf);

				/* nothing more to do if we reached the read page */
				if (rblkno == wblkno)
				{
					if (ndeletable > 0)
					{
						/* Delete tuples we already moved off read page */
						PageIndexMultiDelete(rpage, deletable, ndeletable);
						_hash_wrtbuf(rel, rbuf);
					}
					else
						_hash_relbuf(rel, rbuf);
					return;
				}

				wbuf = _hash_getbuf_with_strategy(rel,
												  wblkno,
												  HASH_WRITE,
												  LH_OVERFLOW_PAGE,
												  bstrategy);
				wpage = BufferGetPage(wbuf);
				wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
				Assert(wopaque->hasho_bucket == bucket);
				wbuf_dirty = false;
			}

			/*
			 * we have found room so insert on the "write" page, being careful
			 * to preserve hashkey ordering.  (If we insert many tuples into
			 * the same "write" page it would be worth qsort'ing instead of
			 * doing repeated _hash_pgaddtup.)
			 */
			(void) _hash_pgaddtup(rel, wbuf, itemsz, itup);
			wbuf_dirty = true;

			/* remember tuple for deletion from "read" page */
			deletable[ndeletable++] = roffnum;
		}

		/*
		 * If we reach here, there are no live tuples on the "read" page ---
		 * it was empty when we got to it, or we moved them all.  So we can
		 * just free the page without bothering with deleting tuples
		 * individually.  Then advance to the previous "read" page.
		 *
		 * Tricky point here: if our read and write pages are adjacent in the
		 * bucket chain, our write lock on wbuf will conflict with
		 * _hash_freeovflpage's attempt to update the sibling links of the
		 * removed page.  However, in that case we are done anyway, so we can
		 * simply drop the write lock before calling _hash_freeovflpage.
		 */
		rblkno = ropaque->hasho_prevblkno;
		Assert(BlockNumberIsValid(rblkno));

		/* are we freeing the page adjacent to wbuf? */
		if (rblkno == wblkno)
		{
			/* yes, so release wbuf lock first */
			if (wbuf_dirty)
				_hash_wrtbuf(rel, wbuf);
			else
				_hash_relbuf(rel, wbuf);
			/* free this overflow page (releases rbuf) */
			_hash_freeovflpage(rel, rbuf, bstrategy);
			/* done */
			return;
		}

		/* free this overflow page, then get the previous one */
		_hash_freeovflpage(rel, rbuf, bstrategy);

		rbuf = _hash_getbuf_with_strategy(rel,
										  rblkno,
										  HASH_WRITE,
										  LH_OVERFLOW_PAGE,
										  bstrategy);
		rpage = BufferGetPage(rbuf);
		ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
		Assert(ropaque->hasho_bucket == bucket);
	}

	/* NOTREACHED */
}
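
The squeeze itself is a two-cursor walk: the "write" cursor starts at the front of the bucket chain and only moves forward, the "read" cursor starts at the back and only moves backward, and the loop ends when they meet. A simplified sketch over plain item counts (hypothetical page capacities; no buffers, locks, or overflow-page freeing):

/*
 * Simplified squeeze: pages[] holds the number of items on each page of a
 * bucket chain, cap is the per-page capacity.  Move items from the last
 * page toward the first until the two cursors meet.
 */
static void
squeeze_chain(int *pages, int npages, int cap)
{
	int		w = 0;				/* "write" page, front of chain */
	int		r = npages - 1;		/* "read" page, back of chain */

	while (w < r)
	{
		if (pages[w] >= cap)
		{
			w++;				/* write page full, step forward */
			continue;
		}
		if (pages[r] == 0)
		{
			r--;				/* read page drained ("freed"), step back */
			continue;
		}
		pages[r]--;				/* move one item forward */
		pages[w]++;
	}
}
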
Example no. 15
/*
 * Extract all item values from a BRIN index page
 *
 * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass);
 */
Datum
brin_page_items(PG_FUNCTION_ARGS)
{
	brin_page_state *state;
	FuncCallContext *fctx;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to use raw page functions"))));

	if (SRF_IS_FIRSTCALL())
	{
		bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
		Oid			indexRelid = PG_GETARG_OID(1);
		Page		page;
		TupleDesc	tupdesc;
		MemoryContext mctx;
		Relation	indexRel;
		AttrNumber	attno;

		/* minimally verify the page we got */
		page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular");

		/* create a function context for cross-call persistence */
		fctx = SRF_FIRSTCALL_INIT();

		/* switch to memory context appropriate for multiple function calls */
		mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);

		/* Build a tuple descriptor for our result type */
		if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		indexRel = index_open(indexRelid, AccessShareLock);

		state = palloc(offsetof(brin_page_state, columns) +
					   sizeof(brin_column_state) * RelationGetDescr(indexRel)->natts);

		state->bdesc = brin_build_desc(indexRel);
		state->page = page;
		state->offset = FirstOffsetNumber;
		state->unusedItem = false;
		state->done = false;
		state->dtup = NULL;

		/*
		 * Initialize output functions for all indexed datatypes; simplifies
		 * calling them later.
		 */
		for (attno = 1; attno <= state->bdesc->bd_tupdesc->natts; attno++)
		{
			Oid		output;
			bool	isVarlena;
			BrinOpcInfo *opcinfo;
			int		i;
			brin_column_state *column;

			opcinfo = state->bdesc->bd_info[attno - 1];
			column = palloc(offsetof(brin_column_state, outputFn) +
							sizeof(FmgrInfo) * opcinfo->oi_nstored);

			column->nstored = opcinfo->oi_nstored;
			for (i = 0; i < opcinfo->oi_nstored; i++)
			{
				getTypeOutputInfo(opcinfo->oi_typids[i], &output, &isVarlena);
				fmgr_info(output, &column->outputFn[i]);
			}

			state->columns[attno - 1] = column;
		}

		index_close(indexRel, AccessShareLock);

		fctx->user_fctx = state;
		fctx->tuple_desc = BlessTupleDesc(tupdesc);

		MemoryContextSwitchTo(mctx);
	}

	fctx = SRF_PERCALL_SETUP();
	state = fctx->user_fctx;

	if (!state->done)
	{
		HeapTuple	result;
		Datum		values[7];
		bool		nulls[7];

		/*
		 * This loop is called once for every attribute of every tuple in the
		 * page.  At the start of a tuple, we get a NULL dtup; that's our
		 * signal for obtaining and decoding the next one.  If that's not the
		 * case, we output the next attribute.
		 */
		if (state->dtup == NULL)
		{
			BrinTuple	   *tup;
			MemoryContext mctx;
			ItemId		itemId;

			/* deformed tuple must live across calls */
			mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);

			/* verify item status: if there's no data, we can't decode */
			itemId = PageGetItemId(state->page, state->offset);
			if (ItemIdIsUsed(itemId))
			{
				tup = (BrinTuple *) PageGetItem(state->page,
											  PageGetItemId(state->page,
															state->offset));
				state->dtup = brin_deform_tuple(state->bdesc, tup);
				state->attno = 1;
				state->unusedItem = false;
			}
			else
				state->unusedItem = true;

			MemoryContextSwitchTo(mctx);
		}
		else
			state->attno++;

		MemSet(nulls, 0, sizeof(nulls));

		if (state->unusedItem)
		{
			values[0] = UInt16GetDatum(state->offset);
			nulls[1] = true;
			nulls[2] = true;
			nulls[3] = true;
			nulls[4] = true;
			nulls[5] = true;
			nulls[6] = true;
		}
		else
		{
			int		att = state->attno - 1;

			values[0] = UInt16GetDatum(state->offset);
			values[1] = UInt32GetDatum(state->dtup->bt_blkno);
			values[2] = UInt16GetDatum(state->attno);
			values[3] = BoolGetDatum(state->dtup->bt_columns[att].bv_allnulls);
			values[4] = BoolGetDatum(state->dtup->bt_columns[att].bv_hasnulls);
			values[5] = BoolGetDatum(state->dtup->bt_placeholder);
			if (!state->dtup->bt_columns[att].bv_allnulls)
			{
				BrinValues   *bvalues = &state->dtup->bt_columns[att];
				StringInfoData	s;
				bool		first;
				int			i;

				initStringInfo(&s);
				appendStringInfoChar(&s, '{');

				first = true;
				for (i = 0; i < state->columns[att]->nstored; i++)
				{
					char   *val;

					if (!first)
						appendStringInfoString(&s, " .. ");
					first = false;
					val = OutputFunctionCall(&state->columns[att]->outputFn[i],
											 bvalues->bv_values[i]);
					appendStringInfoString(&s, val);
					pfree(val);
				}
				appendStringInfoChar(&s, '}');

				values[6] = CStringGetTextDatum(s.data);
				pfree(s.data);
			}
			else
			{
				nulls[6] = true;
			}
		}

		result = heap_form_tuple(fctx->tuple_desc, values, nulls);

		/*
		 * If the item was unused, jump straight to the next one; otherwise,
		 * the only cleanup needed here is to set our signal to go to the next
		 * tuple in the following iteration, by freeing the current one.
		 */
		if (state->unusedItem)
			state->offset = OffsetNumberNext(state->offset);
		else if (state->attno >= state->bdesc->bd_tupdesc->natts)
		{
			pfree(state->dtup);
			state->dtup = NULL;
			state->offset = OffsetNumberNext(state->offset);
		}

		/*
		 * If we're beyond the end of the page, set flag to end the function in
		 * the following iteration.
		 */
		if (state->offset > PageGetMaxOffsetNumber(state->page))
			state->done = true;

		SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result));
	}

	brin_free_desc(state->bdesc);

	SRF_RETURN_DONE(fctx);
}
Example no. 16
/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *	 stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |									  |
 * +-----------+--------------------------------------+
 * |	 ^ last										  |
 * |												  |
 * +-------------+------------------------------------+
 * |			 | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |		  ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
	Page		npage;
	BlockNumber nblkno;
	OffsetNumber last_off;
	Size		pgspc;
	Size		itupsz;

	/*
	 * This is a handy place to check for cancel interrupts during the btree
	 * load phase of index creation.
	 */
	CHECK_FOR_INTERRUPTS();

	npage = state->btps_page;
	nblkno = state->btps_blkno;
	last_off = state->btps_lastoff;

	pgspc = PageGetFreeSpace(npage);
	itupsz = IndexTupleDSize(*itup);
	itupsz = MAXALIGN(itupsz);

	/*
	 * Check whether the item can fit on a btree page at all. (Eventually, we
	 * ought to try to apply TOAST methods if not.) We actually need to be
	 * able to fit three items on every page, so restrict any one item to 1/3
	 * the per-page available space. Note that at this point, itupsz doesn't
	 * include the ItemId.
	 *
	 * NOTE: similar code appears in _bt_insertonpg() to defend against
	 * oversize items being inserted into an already-existing index. But
	 * during creation of an index, we don't go through there.
	 */
	if (itupsz > BTMaxItemSize(npage))
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
			errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
				   itupsz, BTMaxItemSize(npage),
				   RelationGetRelationName(wstate->index)),
		errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
				"Consider a function index of an MD5 hash of the value, "
				"or use full text indexing."),
				 errtableconstraint(wstate->heap,
									RelationGetRelationName(wstate->index))));

	/*
	 * Check to see if page is "full".  It's definitely full if the item won't
	 * fit.  Otherwise, compare to the target freespace derived from the
	 * fillfactor.  However, we must put at least two items on each page, so
	 * disregard fillfactor if we don't have that many.
	 */
	if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
	{
		/*
		 * Finish off the page and write it out.
		 */
		Page		opage = npage;
		BlockNumber oblkno = nblkno;
		ItemId		ii;
		ItemId		hii;
		IndexTuple	oitup;

		/* Create new page of same level */
		npage = _bt_blnewpage(state->btps_level);

		/* and assign it a page position */
		nblkno = wstate->btws_pages_alloced++;

		/*
		 * We copy the last item on the page into the new page, and then
		 * rearrange the old page so that the 'last item' becomes its high key
		 * rather than a true data item.  There had better be at least two
		 * items on the page already, else the page would be empty of useful
		 * data.
		 */
		Assert(last_off > P_FIRSTKEY);
		ii = PageGetItemId(opage, last_off);
		oitup = (IndexTuple) PageGetItem(opage, ii);
		_bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

		/*
		 * Move 'last' into the high key position on opage
		 */
		hii = PageGetItemId(opage, P_HIKEY);
		*hii = *ii;
		ItemIdSetUnused(ii);	/* redundant */
		((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

		/*
		 * Link the old page into its parent, using its minimum key. If we
		 * don't have a parent, we have to create one; this adds a new btree
		 * level.
		 */
		if (state->btps_next == NULL)
			state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

		Assert(state->btps_minkey != NULL);
		ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
		_bt_buildadd(wstate, state->btps_next, state->btps_minkey);
		pfree(state->btps_minkey);

		/*
		 * Save a copy of the minimum key for the new page.  We have to copy
		 * it off the old page, not the new one, in case we are not at leaf
		 * level.
		 */
		state->btps_minkey = CopyIndexTuple(oitup);

		/*
		 * Set the sibling links for both pages.
		 */
		{
			BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
			BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

			oopaque->btpo_next = nblkno;
			nopaque->btpo_prev = oblkno;
			nopaque->btpo_next = P_NONE;		/* redundant */
		}

		/*
		 * Write out the old page.  We never need to touch it again, so we can
		 * free the opage workspace too.
		 */
		_bt_blwritepage(wstate, opage, oblkno);

		/*
		 * Reset last_off to point to new page
		 */
		last_off = P_FIRSTKEY;
	}

	/*
	 * If the new item is the first for its page, stash a copy for later. Note
	 * this will only happen for the first item on a level; on later pages,
	 * the first item for a page is copied from the prior page in the code
	 * above.
	 */
	if (last_off == P_HIKEY)
	{
		Assert(state->btps_minkey == NULL);
		state->btps_minkey = CopyIndexTuple(itup);
	}

	/*
	 * Add the new item into the current page.
	 */
	last_off = OffsetNumberNext(last_off);
	_bt_sortaddtup(npage, itupsz, itup, last_off);

	state->btps_page = npage;
	state->btps_blkno = nblkno;
	state->btps_lastoff = last_off;
}
Example no. 17
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
				  Buffer metabuf,
				  Bucket obucket,
				  Bucket nbucket,
				  BlockNumber start_oblkno,
				  BlockNumber start_nblkno,
				  uint32 maxbucket,
				  uint32 highmask,
				  uint32 lowmask)
{
	Bucket		bucket;
	Buffer		obuf;
	Buffer		nbuf;
	BlockNumber oblkno;
	BlockNumber nblkno;
	bool		null;
	Datum		datum;
	HashPageOpaque oopaque;
	HashPageOpaque nopaque;
	IndexTuple	itup;
	Size		itemsz;
	OffsetNumber ooffnum;
	OffsetNumber noffnum;
	OffsetNumber omaxoffnum;
	Page		opage;
	Page		npage;
	TupleDesc	itupdesc = RelationGetDescr(rel);

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	/*
	 * It should be okay to simultaneously write-lock pages from each bucket,
	 * since no one else can be trying to acquire buffer lock on pages of
	 * either bucket.
	 */
	oblkno = start_oblkno;
	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
	_hash_checkpage(rel, obuf, LH_BUCKET_PAGE);
	opage = BufferGetPage(obuf);
	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

	nblkno = start_nblkno;
	nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
	npage = BufferGetPage(nbuf);

	/* initialize the new bucket's primary page */
	_hash_pageinit(npage, BufferGetPageSize(nbuf));
	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
	nopaque->hasho_prevblkno = InvalidBlockNumber;
	nopaque->hasho_nextblkno = InvalidBlockNumber;
	nopaque->hasho_bucket = nbucket;
	nopaque->hasho_flag = LH_BUCKET_PAGE;
	nopaque->hasho_filler = HASHO_FILL;

	/*
	 * Partition the tuples in the old bucket between the old bucket and the
	 * new bucket, advancing along the old bucket's overflow bucket chain and
	 * adding overflow pages to the new bucket as needed.
	 */
	ooffnum = FirstOffsetNumber;
	omaxoffnum = PageGetMaxOffsetNumber(opage);
	for (;;)
	{
		/*
		 * at each iteration through this loop, each of these variables should
		 * be up-to-date: obuf opage oopaque ooffnum omaxoffnum
		 */

		/* check if we're at the end of the page */
		if (ooffnum > omaxoffnum)
		{
			/* at end of page, but check for an(other) overflow page */
			oblkno = oopaque->hasho_nextblkno;
			if (!BlockNumberIsValid(oblkno))
				break;

			/*
			 * we ran out of tuples on this particular page, but we have more
			 * overflow pages; advance to next page.
			 */
			_hash_wrtbuf(rel, obuf);

			obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
			_hash_checkpage(rel, obuf, LH_OVERFLOW_PAGE);
			opage = BufferGetPage(obuf);
			oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
			ooffnum = FirstOffsetNumber;
			omaxoffnum = PageGetMaxOffsetNumber(opage);
			continue;
		}

		/*
		 * Re-hash the tuple to determine which bucket it now belongs in.
		 *
		 * It is annoying to call the hash function while holding locks, but
		 * releasing and relocking the page for each tuple is unappealing too.
		 */
		itup = (IndexTuple) PageGetItem(opage, PageGetItemId(opage, ooffnum));
		datum = index_getattr(itup, 1, itupdesc, &null);
		Assert(!null);

		bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
									  maxbucket, highmask, lowmask);

		if (bucket == nbucket)
		{
			/*
			 * insert the tuple into the new bucket.  if it doesn't fit on the
			 * current page in the new bucket, we must allocate a new overflow
			 * page and place the tuple on that page instead.
			 */
			itemsz = IndexTupleDSize(*itup);
			itemsz = MAXALIGN(itemsz);

			if (PageGetFreeSpace(npage) < itemsz)
			{
				/* write out nbuf and drop lock, but keep pin */
				_hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
				/* chain to a new overflow page */
				nbuf = _hash_addovflpage(rel, metabuf, nbuf);
				_hash_checkpage(rel, nbuf, LH_OVERFLOW_PAGE);
				npage = BufferGetPage(nbuf);
				/* we don't need nopaque within the loop */
			}

			noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
			if (PageAddItem(npage, (Item) itup, itemsz, noffnum, LP_USED)
				== InvalidOffsetNumber)
				elog(ERROR, "failed to add index item to \"%s\"",
					 RelationGetRelationName(rel));

			/*
			 * now delete the tuple from the old bucket.  after this section
			 * of code, 'ooffnum' will actually point to the ItemId to which
			 * we would point if we had advanced it before the deletion
			 * (PageIndexTupleDelete repacks the ItemId array).  this also
			 * means that 'omaxoffnum' is exactly one less than it used to be,
			 * so we really can just decrement it instead of calling
			 * PageGetMaxOffsetNumber.
			 */
			PageIndexTupleDelete(opage, ooffnum);
			omaxoffnum = OffsetNumberPrev(omaxoffnum);
		}
		else
		{
			/*
			 * the tuple stays on this page.  we didn't move anything, so we
			 * didn't delete anything and therefore we don't have to change
			 * 'omaxoffnum'.
			 */
			Assert(bucket == obucket);
			ooffnum = OffsetNumberNext(ooffnum);
		}
	}

	/*
	 * We're at the end of the old bucket chain, so we're done partitioning
	 * the tuples.	Before quitting, call _hash_squeezebucket to ensure the
	 * tuples remaining in the old bucket (including the overflow pages) are
	 * packed as tightly as possible.  The new bucket is already tight.
	 */
	_hash_wrtbuf(rel, obuf);
	_hash_wrtbuf(rel, nbuf);

	_hash_squeezebucket(rel, obucket, start_oblkno);
}
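
The re-hash above maps each tuple's hash value back to a bucket number using the maxbucket/highmask/lowmask triple passed in by the caller. The sketch below reflects my reading of how _hash_hashkey2bucket behaves (mask with the wider mask, fall back to the narrower one when the result exceeds the highest allocated bucket); treat it as an assumption, not a copy of the real routine:

#include <stdint.h>

/*
 * Sketch of the hash-value-to-bucket mapping used when splitting: mask with
 * highmask, and if that lands beyond the last bucket created so far, mask
 * again with lowmask (i.e. pretend the table were half its current size).
 */
static uint32_t
hashkey_to_bucket(uint32_t hashkey, uint32_t maxbucket,
				  uint32_t highmask, uint32_t lowmask)
{
	uint32_t	bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket = bucket & lowmask;

	return bucket;
}
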
Example no. 18
/*
 *	_hash_step() -- step to the next valid item in a scan in the bucket.
 *
 *		If no valid record exists in the requested direction, return
 *		false.  Else, return true and set the hashso_curpos for the
 *		scan to the right thing.
 *
 *		'bufP' points to the current buffer, which is pinned and read-locked.
 *		On success exit, we have pin and read-lock on whichever page
 *		contains the right item; on failure, we have released all buffers.
 */
bool
_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	ItemPointer current;
	Buffer		buf;
	Page		page;
	HashPageOpaque opaque;
	OffsetNumber maxoff;
	OffsetNumber offnum;
	BlockNumber blkno;
	IndexTuple	itup;

	current = &(so->hashso_curpos);

	buf = *bufP;
	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
	page = BufferGetPage(buf);
	opaque = (HashPageOpaque) PageGetSpecialPointer(page);

	/*
	 * If _hash_step is called from _hash_first, current will not be valid, so
	 * we can't dereference it.  However, in that case, we presumably want to
	 * start at the beginning/end of the page...
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	if (ItemPointerIsValid(current))
		offnum = ItemPointerGetOffsetNumber(current);
	else
		offnum = InvalidOffsetNumber;

	/*
	 * 'offnum' now points to the last tuple we examined (if any).
	 *
	 * continue to step through tuples until: 1) we get to the end of the
	 * bucket chain or 2) we find a valid tuple.
	 */
	do
	{
		switch (dir)
		{
			case ForwardScanDirection:
				if (offnum != InvalidOffsetNumber)
					offnum = OffsetNumberNext(offnum);	/* move forward */
				else
				{
					/* new page, locate starting position by binary search */
					offnum = _hash_binsearch(page, so->hashso_sk_hash);
				}

				for (;;)
				{
					/*
					 * check if we're still in the range of items with the
					 * target hash key
					 */
					if (offnum <= maxoff)
					{
						Assert(offnum >= FirstOffsetNumber);
						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
							break;		/* yes, so exit for-loop */
					}

					/*
					 * ran off the end of this page, try the next
					 */
					_hash_readnext(rel, &buf, &page, &opaque);
					if (BufferIsValid(buf))
					{
						maxoff = PageGetMaxOffsetNumber(page);
						offnum = _hash_binsearch(page, so->hashso_sk_hash);
					}
					else
					{
						/* end of bucket */
						itup = NULL;
						break;	/* exit for-loop */
					}
				}
				break;

			case BackwardScanDirection:
				if (offnum != InvalidOffsetNumber)
					offnum = OffsetNumberPrev(offnum);	/* move back */
				else
				{
					/* new page, locate starting position by binary search */
					offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
				}

				for (;;)
				{
					/*
					 * check if we're still in the range of items with the
					 * target hash key
					 */
					if (offnum >= FirstOffsetNumber)
					{
						Assert(offnum <= maxoff);
						itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
						if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup))
							break;		/* yes, so exit for-loop */
					}

					/*
					 * ran off the end of this page, try the next
					 */
					_hash_readprev(rel, &buf, &page, &opaque);
					if (BufferIsValid(buf))
					{
						maxoff = PageGetMaxOffsetNumber(page);
						offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
					}
					else
					{
						/* end of bucket */
						itup = NULL;
						break;	/* exit for-loop */
					}
				}
				break;

			default:
				/* NoMovementScanDirection */
				/* this should not be reached */
				itup = NULL;
				break;
		}

		if (itup == NULL)
		{
			/* we ran off the end of the bucket without finding a match */
			*bufP = so->hashso_curbuf = InvalidBuffer;
			ItemPointerSetInvalid(current);
			return false;
		}

		/* check the tuple quals, loop around if not met */
	} while (!_hash_checkqual(scan, itup));

	/* if we made it to here, we've found a valid tuple */
	blkno = BufferGetBlockNumber(buf);
	*bufP = so->hashso_curbuf = buf;
	ItemPointerSet(current, blkno, offnum);
	return true;
}
Example no. 19
static void
bitmap_xlog_insert_lovitem(bool redo, XLogRecPtr lsn, XLogRecord* record)
{
	xl_bm_lovitem	*xlrec = (xl_bm_lovitem*) XLogRecGetData(record);
	Relation reln;

	reln = XLogOpenRelation(xlrec->bm_node);
	if (!RelationIsValid(reln))
		return;

	if (redo)
	{
		Buffer			lovBuffer;
		Page			lovPage;

#ifdef BM_DEBUG
		ereport(LOG, (errcode(LOG), 
			errmsg("call bitmap_xlog_insert_lovitem: redo=%d, blkno=%d\n", 
					redo, xlrec->bm_lov_blkno)));
#endif

		lovBuffer = XLogReadBuffer(false, reln, xlrec->bm_lov_blkno);
		if (!BufferIsValid(lovBuffer))
			elog(PANIC, "bm_insert_redo: block unfound: %d",
				 xlrec->bm_lov_blkno);

		lovPage = BufferGetPage(lovBuffer);

		if (XLByteLT(PageGetLSN(lovPage), lsn))
		{
			if(xlrec->bm_isNewItem)
			{
				OffsetNumber	newOffset, itemSize;

				newOffset = OffsetNumberNext(PageGetMaxOffsetNumber(lovPage));
				if (newOffset != xlrec->bm_lov_offset)
					elog(PANIC,
						 "bm_insert_redo: LOV item is not inserted in pos %d (requested %d)",
						 newOffset, xlrec->bm_lov_offset);

				itemSize = sizeof(BMLOVItemData);
				if (itemSize > PageGetFreeSpace(lovPage))
					elog(PANIC, 
						 "bm_insert_redo: not enough space in LOV page %d",
						 xlrec->bm_lov_blkno);
		
				if (PageAddItem(lovPage, (Item)&(xlrec->bm_lovItem), itemSize, 
								newOffset, LP_USED) == InvalidOffsetNumber)
					ereport(ERROR,
							(errcode(ERRCODE_INTERNAL_ERROR),
							errmsg("failed to add LOV item to \"%s\"",
							RelationGetRelationName(reln))));
			}

			else
			{
				BMLOVItem oldLovItem;
				oldLovItem = (BMLOVItem)
					PageGetItem(lovPage, 
								PageGetItemId(lovPage, xlrec->bm_lov_offset));

				memcpy(oldLovItem, &(xlrec->bm_lovItem), sizeof(BMLOVItemData));
			}

			PageSetLSN(lovPage, lsn);
			PageSetTLI(lovPage, ThisTimeLineID);
			_bitmap_wrtbuf(lovBuffer);
		}

		else {
			_bitmap_relbuf(lovBuffer);
		}
	}

	else
		elog(PANIC, "bm_insert_undo: not implemented.");
}
Example no. 20
/*
 * _bt_killitems - set LP_DEAD state for items an indexscan caller has
 * told us were killed
 *
 * scan->so contains information about the current page and killed tuples
 * thereon (generally, this should only be called if so->numKilled > 0).
 *
 * The caller must have pin on so->currPos.buf, but may or may not have
 * read-lock, as indicated by haveLock.  Note that we assume read-lock
 * is sufficient for setting LP_DEAD status (which is only a hint).
 *
 * We match items by heap TID before assuming they are the right ones to
 * delete.	We cope with cases where items have moved right due to insertions.
 * If an item has moved off the current page due to a split, we'll fail to
 * find it and do nothing (this is not an error case --- we assume the item
 * will eventually get marked in a future indexscan).  Note that because we
 * hold pin on the target page continuously from initially reading the items
 * until applying this function, VACUUM cannot have deleted any items from
 * the page, and so there is no need to search left from the recorded offset.
 * (This observation also guarantees that the item is still the right one
 * to delete, which might otherwise be questionable since heap TIDs can get
 * recycled.)
 */
void
_bt_killitems(IndexScanDesc scan, bool haveLock)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber minoff;
	OffsetNumber maxoff;
	int			i;
	bool		killedsomething = false;

	Assert(BufferIsValid(so->currPos.buf));

	if (!haveLock)
		LockBuffer(so->currPos.buf, BT_READ);

	page = BufferGetPage(so->currPos.buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	minoff = P_FIRSTDATAKEY(opaque);
	maxoff = PageGetMaxOffsetNumber(page);

	for (i = 0; i < so->numKilled; i++)
	{
		int			itemIndex = so->killedItems[i];
		BTScanPosItem *kitem = &so->currPos.items[itemIndex];
		OffsetNumber offnum = kitem->indexOffset;

		Assert(itemIndex >= so->currPos.firstItem &&
			   itemIndex <= so->currPos.lastItem);
		if (offnum < minoff)
			continue;			/* pure paranoia */
		while (offnum <= maxoff)
		{
			ItemId		iid = PageGetItemId(page, offnum);
			IndexTuple	ituple = (IndexTuple) PageGetItem(page, iid);

			if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
			{
				/* found the item */
				ItemIdMarkDead(iid);
				killedsomething = true;
				break;			/* out of inner search loop */
			}
			offnum = OffsetNumberNext(offnum);
		}
	}

	/*
	 * Since this can be redone later if needed, it's treated the same as a
	 * commit-hint-bit status update for heap tuples: we mark the buffer dirty
	 * but don't make a WAL log entry.
	 *
	 * Whenever we mark anything LP_DEAD, we also set the page's
	 * BTP_HAS_GARBAGE flag, which is likewise just a hint.
	 */
	if (killedsomething)
	{
		opaque->btpo_flags |= BTP_HAS_GARBAGE;
		SetBufferCommitInfoNeedsSave(so->currPos.buf);
	}

	if (!haveLock)
		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

	/*
	 * Always reset the scan state, so we don't look for same items on other
	 * pages.
	 */
	so->numKilled = 0;
}
Example no. 21
/*
 *	_hash_finish_split() -- Finish the previously interrupted split operation
 *
 * To complete the split operation, we build a hash table of the TIDs already
 * present in the new bucket; the split operation then uses it to skip tuples
 * that were already moved before the split was interrupted.
 *
 * The caller must hold a pin, but no lock, on the metapage and old bucket's
 * primary page buffer.  The buffers are returned in the same state.  (The
 * metapage is only touched if it becomes necessary to add or remove overflow
 * pages.)
 */
void
_hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket,
				   uint32 maxbucket, uint32 highmask, uint32 lowmask)
{
	HASHCTL		hash_ctl;
	HTAB	   *tidhtab;
	Buffer		bucket_nbuf = InvalidBuffer;
	Buffer		nbuf;
	Page		npage;
	BlockNumber nblkno;
	BlockNumber bucket_nblkno;
	HashPageOpaque npageopaque;
	Bucket		nbucket;
	bool		found;

	/* Initialize the hash table used to track TIDs */
	memset(&hash_ctl, 0, sizeof(hash_ctl));
	hash_ctl.keysize = sizeof(ItemPointerData);
	hash_ctl.entrysize = sizeof(ItemPointerData);
	hash_ctl.hcxt = CurrentMemoryContext;

	tidhtab =
		hash_create("bucket ctids",
					256,		/* arbitrary initial size */
					&hash_ctl,
					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);

	bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket);

	/*
	 * Scan the new bucket and build hash table of TIDs
	 */
	for (;;)
	{
		OffsetNumber noffnum;
		OffsetNumber nmaxoffnum;

		nbuf = _hash_getbuf(rel, nblkno, HASH_READ,
							LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);

		/* remember the primary bucket buffer to acquire cleanup lock on it. */
		if (nblkno == bucket_nblkno)
			bucket_nbuf = nbuf;

		npage = BufferGetPage(nbuf);
		npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);

		/* Scan each tuple in new page */
		nmaxoffnum = PageGetMaxOffsetNumber(npage);
		for (noffnum = FirstOffsetNumber;
			 noffnum <= nmaxoffnum;
			 noffnum = OffsetNumberNext(noffnum))
		{
			IndexTuple	itup;

			/* Fetch the item's TID and insert it in hash table. */
			itup = (IndexTuple) PageGetItem(npage,
											PageGetItemId(npage, noffnum));

			(void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found);

			Assert(!found);
		}

		nblkno = npageopaque->hasho_nextblkno;

		/*
		 * release our write lock without modifying buffer and ensure to
		 * retain the pin on primary bucket.
		 */
		if (nbuf == bucket_nbuf)
			LockBuffer(nbuf, BUFFER_LOCK_UNLOCK);
		else
			_hash_relbuf(rel, nbuf);

		/* Exit loop if no more overflow pages in new bucket */
		if (!BlockNumberIsValid(nblkno))
			break;
	}

	/*
	 * Conditionally get the cleanup lock on old and new buckets to perform
	 * the split operation.  If we don't get the cleanup locks, silently give
	 * up and next insertion on old bucket will try again to complete the
	 * split.
	 */
	if (!ConditionalLockBufferForCleanup(obuf))
	{
		hash_destroy(tidhtab);
		return;
	}
	if (!ConditionalLockBufferForCleanup(bucket_nbuf))
	{
		LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
		hash_destroy(tidhtab);
		return;
	}

	npage = BufferGetPage(bucket_nbuf);
	npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
	nbucket = npageopaque->hasho_bucket;

	_hash_splitbucket(rel, metabuf, obucket,
					  nbucket, obuf, bucket_nbuf, tidhtab,
					  maxbucket, highmask, lowmask);

	_hash_dropbuf(rel, bucket_nbuf);
	hash_destroy(tidhtab);
}
Example no. 22
/*
 * Scan all items on the GiST index page identified by *pageItem, and insert
 * them into the queue (or directly to output areas)
 *
 * scan: index scan we are executing
 * pageItem: search queue item identifying an index page to scan
 * myDistances: distances array associated with pageItem, or NULL at the root
 * tbm: if not NULL, gistgetbitmap's output bitmap
 * ntids: if not NULL, gistgetbitmap's output tuple counter
 *
 * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap
 * tuples should be reported directly into the bitmap.	If they are NULL,
 * we're doing a plain or ordered indexscan.  For a plain indexscan, heap
 * tuple TIDs are returned into so->pageData[].  For an ordered indexscan,
 * heap tuple TIDs are pushed into individual search queue items.
 *
 * If we detect that the index page has split since we saw its downlink
 * in the parent, we push its new right sibling onto the queue so the
 * sibling will be processed next.
 */
static void
gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
			 TIDBitmap *tbm, int64 *ntids)
{
	GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
	Buffer		buffer;
	Page		page;
	GISTPageOpaque opaque;
	OffsetNumber maxoff;
	OffsetNumber i;
	GISTSearchTreeItem *tmpItem = so->tmpTreeItem;
	bool		isNew;
	MemoryContext oldcxt;

	Assert(!GISTSearchItemIsHeap(*pageItem));

	buffer = ReadBuffer(scan->indexRelation, pageItem->blkno);
	LockBuffer(buffer, GIST_SHARE);
	gistcheckpage(scan->indexRelation, buffer);
	page = BufferGetPage(buffer);
	opaque = GistPageGetOpaque(page);

	/*
	 * Check if we need to follow the rightlink. We need to follow it if the
	 * page was concurrently split since we visited the parent (in which case
	 * parentlsn < nsn), or if the system crashed after a page split but
	 * before the downlink was inserted into the parent.
	 */
	if (!XLogRecPtrIsInvalid(pageItem->data.parentlsn) &&
		(GistFollowRight(page) ||
		 XLByteLT(pageItem->data.parentlsn, opaque->nsn)) &&
		opaque->rightlink != InvalidBlockNumber /* sanity check */ )
	{
		/* There was a page split, follow right link to add pages */
		GISTSearchItem *item;

		/* This can't happen when starting at the root */
		Assert(myDistances != NULL);

		oldcxt = MemoryContextSwitchTo(so->queueCxt);

		/* Create new GISTSearchItem for the right sibling index page */
		item = palloc(sizeof(GISTSearchItem));
		item->next = NULL;
		item->blkno = opaque->rightlink;
		item->data.parentlsn = pageItem->data.parentlsn;

		/* Insert it into the queue using same distances as for this page */
		tmpItem->head = item;
		tmpItem->lastHeap = NULL;
		memcpy(tmpItem->distances, myDistances,
			   sizeof(double) * scan->numberOfOrderBys);

		(void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);

		MemoryContextSwitchTo(oldcxt);
	}

	so->nPageData = so->curPageData = 0;

	/*
	 * check all tuples on page
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
	{
		IndexTuple	it = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
		bool		match;
		bool		recheck;

		/*
		 * Must call gistindex_keytest in tempCxt, and clean up any leftover
		 * junk afterward.
		 */
		oldcxt = MemoryContextSwitchTo(so->giststate->tempCxt);

		match = gistindex_keytest(scan, it, page, i, &recheck);

		MemoryContextSwitchTo(oldcxt);
		MemoryContextReset(so->giststate->tempCxt);

		/* Ignore tuple if it doesn't match */
		if (!match)
			continue;

		if (tbm && GistPageIsLeaf(page))
		{
			/*
			 * getbitmap scan, so just push heap tuple TIDs into the bitmap
			 * without worrying about ordering
			 */
			tbm_add_tuples(tbm, &it->t_tid, 1, recheck);
			(*ntids)++;
		}
		else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page))
		{
			/*
			 * Non-ordered scan, so report heap tuples in so->pageData[]
			 */
			so->pageData[so->nPageData].heapPtr = it->t_tid;
			so->pageData[so->nPageData].recheck = recheck;
			so->nPageData++;
		}
		else
		{
			/*
			 * Must push item into search queue.  We get here for any lower
			 * index page, and also for heap tuples if doing an ordered
			 * search.
			 */
			GISTSearchItem *item;

			oldcxt = MemoryContextSwitchTo(so->queueCxt);

			/* Create new GISTSearchItem for this item */
			item = palloc(sizeof(GISTSearchItem));
			item->next = NULL;

			if (GistPageIsLeaf(page))
			{
				/* Creating heap-tuple GISTSearchItem */
				item->blkno = InvalidBlockNumber;
				item->data.heap.heapPtr = it->t_tid;
				item->data.heap.recheck = recheck;
			}
			else
			{
				/* Creating index-page GISTSearchItem */
				item->blkno = ItemPointerGetBlockNumber(&it->t_tid);
				/* lsn of current page is lsn of parent page for child */
				item->data.parentlsn = PageGetLSN(page);
			}

			/* Insert it into the queue using new distance data */
			tmpItem->head = item;
			tmpItem->lastHeap = GISTSearchItemIsHeap(*item) ? item : NULL;
			memcpy(tmpItem->distances, so->distances,
				   sizeof(double) * scan->numberOfOrderBys);

			(void) rb_insert(so->queue, (RBNode *) tmpItem, &isNew);

			MemoryContextSwitchTo(oldcxt);
		}
	}

	UnlockReleaseBuffer(buffer);
}
Example no. 23
/*
 * Traverse the tree to find the path from the root page to the specified
 * "child" block.
 *
 * Returns a new insertion stack, starting from the parent of "child", up
 * to the root. *downlinkoffnum is set to the offset of the downlink in the
 * direct parent of child.
 *
 * To prevent deadlocks, this should lock only one page at a time.
 */
static GISTInsertStack *
gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
{
	Page		page;
	Buffer		buffer;
	OffsetNumber i,
				maxoff;
	ItemId		iid;
	IndexTuple	idxtuple;
	List	   *fifo;
	GISTInsertStack *top,
			   *ptr;
	BlockNumber blkno;

	top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
	top->blkno = GIST_ROOT_BLKNO;
	top->downlinkoffnum = InvalidOffsetNumber;

	fifo = list_make1(top);
	while (fifo != NIL)
	{
		/* Get next page to visit */
		top = linitial(fifo);
		fifo = list_delete_first(fifo);

		buffer = ReadBuffer(r, top->blkno);
		LockBuffer(buffer, GIST_SHARE);
		gistcheckpage(r, buffer);
		page = (Page) BufferGetPage(buffer);

		if (GistPageIsLeaf(page))
		{
			/*
			 * Because we scan the index top-down, all the rest of the pages
			 * in the queue must be leaf pages as well.
			 */
			UnlockReleaseBuffer(buffer);
			break;
		}

		top->lsn = PageGetLSN(page);

		/*
		 * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a
	 * downlink. This should not normally happen.
		 */
		if (GistFollowRight(page))
			elog(ERROR, "concurrent GiST page split was incomplete");

		if (top->parent && top->parent->lsn < GistPageGetNSN(page) &&
			GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
		{
			/*
			 * Page was split while we looked elsewhere. We didn't see the
			 * downlink to the right page when we scanned the parent, so add
			 * it to the queue now.
			 *
			 * Put the right page ahead of the queue, so that we visit it
			 * next. That's important, because if this is the lowest internal
			 * level, just above leaves, we might already have queued up some
			 * leaf pages, and we assume that there can't be any non-leaf
			 * pages behind leaf pages.
			 */
			ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
			ptr->blkno = GistPageGetOpaque(page)->rightlink;
			ptr->downlinkoffnum = InvalidOffsetNumber;
			ptr->parent = top->parent;

			fifo = lcons(ptr, fifo);
		}

		maxoff = PageGetMaxOffsetNumber(page);

		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
		{
			iid = PageGetItemId(page, i);
			idxtuple = (IndexTuple) PageGetItem(page, iid);
			blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
			if (blkno == child)
			{
				/* Found it! */
				UnlockReleaseBuffer(buffer);
				*downlinkoffnum = i;
				return top;
			}
			else
			{
				/* Append this child to the list of pages to visit later */
				ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
				ptr->blkno = blkno;
				ptr->downlinkoffnum = i;
				ptr->parent = top;

				fifo = lappend(fifo, ptr);
			}
		}

		UnlockReleaseBuffer(buffer);
	}

	elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u",
		 RelationGetRelationName(r), child);
	return NULL;				/* keep compiler quiet */
}
Example no. 24
/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 *
 * If report_stats is true then we send the number of reclaimed heap-only
 * tuples to pgstats.  (This must be FALSE during vacuum, since vacuum will
 * send its own new total to pgstats, and we don't want this delta applied
 * on top of that.)
 *
 * Returns the number of tuples deleted from the page and sets
 * latestRemovedXid.
 */
int
heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
				bool report_stats, TransactionId *latestRemovedXid)
{
	int			ndeleted = 0;
	Page		page = BufferGetPage(buffer);
	OffsetNumber offnum,
				maxoff;
	PruneState	prstate;

	/*
	 * Our strategy is to scan the page and make lists of items to change,
	 * then apply the changes within a critical section.  This keeps as much
	 * logic as possible out of the critical section, and also ensures that
	 * WAL replay will work the same as the normal case.
	 *
	 * First, initialize the new pd_prune_xid value to zero (indicating no
	 * prunable tuples).  If we find any tuples which may soon become
	 * prunable, we will save the lowest relevant XID in new_prune_xid. Also
	 * initialize the rest of our working state.
	 */
	prstate.new_prune_xid = InvalidTransactionId;
	prstate.latestRemovedXid = *latestRemovedXid;
	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
	memset(prstate.marked, 0, sizeof(prstate.marked));

	/* Scan the page */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		/* Ignore items already processed as part of an earlier chain */
		if (prstate.marked[offnum])
			continue;

		/* Nothing to do if slot is empty or already dead */
		itemid = PageGetItemId(page, offnum);
		if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
			continue;

		/* Process this item or chain of items */
		ndeleted += heap_prune_chain(relation, buffer, offnum,
									 OldestXmin,
									 &prstate);
	}

	/* Any error while applying the changes is critical */
	START_CRIT_SECTION();

	/* Have we found any prunable items? */
	if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
	{
		/*
		 * Apply the planned item changes, then repair page fragmentation, and
		 * update the page's hint bit about whether it has free line pointers.
		 */
		heap_page_prune_execute(buffer,
								prstate.redirected, prstate.nredirected,
								prstate.nowdead, prstate.ndead,
								prstate.nowunused, prstate.nunused);

		/*
		 * Update the page's pd_prune_xid field to either zero, or the lowest
		 * XID of any soon-prunable tuple.
		 */
		((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

		/*
		 * Also clear the "page is full" flag, since there's no point in
		 * repeating the prune/defrag process until something else happens to
		 * the page.
		 */
		PageClearFull(page);

		MarkBufferDirty(buffer);

		/*
		 * Emit a WAL HEAP_CLEAN record showing what we did
		 */
		if (RelationNeedsWAL(relation))
		{
			XLogRecPtr	recptr;

			recptr = log_heap_clean(relation, buffer,
									prstate.redirected, prstate.nredirected,
									prstate.nowdead, prstate.ndead,
									prstate.nowunused, prstate.nunused,
									prstate.latestRemovedXid);

			PageSetLSN(BufferGetPage(buffer), recptr);
			PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
		}
	}
	else
	{
		/*
		 * If we didn't prune anything, but have found a new value for the
		 * pd_prune_xid field, update it and mark the buffer dirty. This is
		 * treated as a non-WAL-logged hint.
		 *
		 * Also clear the "page is full" flag if it is set, since there's no
		 * point in repeating the prune/defrag process until something else
		 * happens to the page.
		 */
		if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
			PageIsFull(page))
		{
			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
			PageClearFull(page);
			SetBufferCommitInfoNeedsSave(buffer);
		}
	}

	END_CRIT_SECTION();

	/*
	 * If requested, report the number of tuples reclaimed to pgstats. This is
	 * ndeleted minus ndead, because we don't want to count a now-DEAD root
	 * item as a deletion for this purpose.
	 */
	if (report_stats && ndeleted > prstate.ndead)
		pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);

	*latestRemovedXid = prstate.latestRemovedXid;

	/*
	 * XXX Should we update the FSM information of this page ?
	 *
	 * There are two schools of thought here. We may not want to update FSM
	 * information so that the page is not used for unrelated UPDATEs/INSERTs
	 * and any free space in this page will remain available for further
	 * UPDATEs in *this* page, thus improving chances for doing HOT updates.
	 *
	 * But for a large table and where a page does not receive further UPDATEs
	 * for a long time, we might waste this space by not updating the FSM
	 * information. The relation may get extended and fragmented further.
	 *
	 * One possibility is to leave "fillfactor" worth of space in this page
	 * and update FSM with the remaining space.
	 *
	 * In any case, the current FSM implementation doesn't accept
	 * one-page-at-a-time updates, so this is all academic for now.
	 */

	return ndeleted;
}
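
The function above follows a plan-then-apply pattern: scan every line pointer, record the intended changes in a prune state, and only then apply them all in one batch inside the critical section. Below is a minimal standalone sketch of that pattern. It is not PostgreSQL code: SketchPruneState, sketch_prune_slot and slot_state are invented stand-ins for the page, the line pointers and heap_prune_chain.

#include <stdbool.h>
#include <stdio.h>

#define NSLOTS 8

/* 0 = unused, 1 = live, 2 = dead (1-based slots, like OffsetNumber) */
static int slot_state[NSLOTS + 1] = {0, 1, 2, 1, 0, 2, 1, 2, 1};

typedef struct SketchPruneState
{
	bool	marked[NSLOTS + 1];		/* slots already handled this pass */
	int		nowdead[NSLOTS];		/* planned removals */
	int		ndead;
} SketchPruneState;

/* "Plan" what to do with one slot; nothing is changed on the page yet. */
static void
sketch_prune_slot(int slot, SketchPruneState *prstate)
{
	if (slot_state[slot] == 2)
	{
		prstate->nowdead[prstate->ndead++] = slot;
		prstate->marked[slot] = true;
	}
}

int
main(void)
{
	SketchPruneState prstate = {{false}};

	/* Pass 1: scan every slot and collect planned changes */
	for (int slot = 1; slot <= NSLOTS; slot++)
	{
		if (prstate.marked[slot])
			continue;				/* already processed earlier */
		if (slot_state[slot] == 0)
			continue;				/* empty slot: nothing to do */
		sketch_prune_slot(slot, &prstate);
	}

	/* Pass 2: apply all planned changes in one batch */
	for (int i = 0; i < prstate.ndead; i++)
		slot_state[prstate.nowdead[i]] = 0;

	printf("pruned %d slots\n", prstate.ndead);
	return 0;
}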
Example no. 25
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page. We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (!PageIsNew(page))
		_bt_checkpage(rel, buf);

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Remember highest leaf page number we've taken cleanup lock on; see
		 * notes in btvacuumscan
		 */
		if (blkno > vstate->lastBlockLocked)
			vstate->lastBlockLocked = blkno;

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);

				/*
				 * During Hot Standby we currently assume that
				 * XLOG_BTREE_VACUUM records do not produce conflicts. That is
				 * only true as long as the callback function depends only
				 * upon whether the index tuple refers to heap tuples removed
				 * in the initial heap scan. When vacuum starts it derives a
				 * value of OldestXmin. Backends taking later snapshots could
				 * have a RecentGlobalXmin with a later xid than the vacuum's
				 * OldestXmin, so it is possible that row versions deleted
				 * after OldestXmin could be marked as killed by other
				 * backends. The callback function *could* look at the index
				 * tuple state in isolation and decide to delete the index
				 * tuple, though currently it does not. If it ever did, we
				 * would need to reconsider whether XLOG_BTREE_VACUUM records
				 * should cause conflicts. If they did cause conflicts they
				 * would be fairly harsh conflicts, since we haven't yet
				 * worked out a way to pass a useful value for
				 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
				 * applies to *any* type of index that marks index tuples as
				 * killed.
				 */
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
		 * call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			/*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an
			 * instruction to the replay code to get cleanup lock on all pages
			 * between the previous lastBlockVacuumed and this page.  This
			 * ensures that WAL replay locks all leaf pages at some point.
			 *
			 * Since we can visit leaf pages out-of-order when recursing,
			 * replay might end up locking such pages an extra time, but it
			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
			 * that.
			 */
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
								vstate->lastBlockVacuumed);

			/*
			 * Remember highest leaf page number we've issued a
			 * XLOG_BTREE_VACUUM WAL record for.
			 */
			if (blkno > vstate->lastBlockVacuumed)
				vstate->lastBlockVacuumed = blkno;

			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
			 * care of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples. We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf);

		/* count only this page, else may double-count parent */
		if (ndel)
			stats->pages_deleted++;

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array). A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}
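
The closing comment describes turning tail recursion into a loop so the large deletable[] array is not duplicated on every recursion level. A minimal standalone sketch of that transformation, with an invented process_block() standing in for btvacuumpage():

#include <stdio.h>

#define NO_BLOCK (-1)

static void
process_block(int blkno)
{
	int		big_local[1024];	/* the analogue of deletable[] */
	int		recurse_to;

restart:
	recurse_to = NO_BLOCK;
	big_local[0] = blkno;
	printf("processing block %d\n", big_local[0]);

	/* pretend blocks 5..7 ask us to revisit the preceding block */
	if (blkno > 4 && blkno < 8)
		recurse_to = blkno - 1;

	if (recurse_to != NO_BLOCK)
	{
		blkno = recurse_to;		/* tail call turned into a jump */
		goto restart;
	}
}

int
main(void)
{
	process_block(6);			/* processes blocks 6, 5, 4 without growing the stack */
	return 0;
}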
Example no. 26
/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * We zero out all unused entries.
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer. Once the pin is released, a tuple might be
 * pruned and reused by a completely unrelated tuple.
 */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
	OffsetNumber offnum,
				maxoff;

	MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));

	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
	{
		ItemId		lp = PageGetItemId(page, offnum);
		HeapTupleHeader htup;
		OffsetNumber nextoffnum;
		TransactionId priorXmax;

		/* skip unused and dead items */
		if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
			continue;

		if (ItemIdIsNormal(lp))
		{
			htup = (HeapTupleHeader) PageGetItem(page, lp);

			/*
			 * Check if this tuple is part of a HOT-chain rooted at some other
			 * tuple. If so, skip it for now; we'll process it when we find
			 * its root.
			 */
			if (HeapTupleHeaderIsHeapOnly(htup))
				continue;

			/*
			 * This is either a plain tuple or the root of a HOT-chain.
			 * Remember it in the mapping.
			 */
			root_offsets[offnum - 1] = offnum;

			/* If it's not the start of a HOT-chain, we're done with it */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				continue;

			/* Set up to scan the HOT-chain */
			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetXmax(htup);
		}
		else
		{
			/* Must be a redirect item. We do not set its root_offsets entry */
			Assert(ItemIdIsRedirected(lp));
			/* Set up to scan the HOT-chain */
			nextoffnum = ItemIdGetRedirect(lp);
			priorXmax = InvalidTransactionId;
		}

		/*
		 * Now follow the HOT-chain and collect other tuples in the chain.
		 *
		 * Note: Even though this is a nested loop, the complexity of the
		 * function is O(N), because each tuple on the page is visited at most
		 * twice: once in the outer loop and once while chasing a HOT chain.
		 */
		for (;;)
		{
			lp = PageGetItemId(page, nextoffnum);

			/* Check for broken chains */
			if (!ItemIdIsNormal(lp))
				break;

			htup = (HeapTupleHeader) PageGetItem(page, lp);

			if (TransactionIdIsValid(priorXmax) &&
				!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
				break;

			/* Remember the root line pointer for this item */
			root_offsets[nextoffnum - 1] = offnum;

			/* Advance to next chain member, if any */
			if (!HeapTupleHeaderIsHotUpdated(htup))
				break;

			nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
			priorXmax = HeapTupleHeaderGetXmax(htup);
		}
	}
}
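
To make the O(N) note above concrete, here is a standalone sketch of the same root-mapping idea on plain arrays. It is not heapam code: next_in_chain[] and heap_only[] are hypothetical stand-ins for the t_ctid links and the heap-only flag, and the broken-chain and xmin checks are omitted.

#include <stdio.h>

#define NITEMS 6

/* next_in_chain[i] = successor offset, 0 = end of chain (1-based offsets) */
static const int next_in_chain[NITEMS + 1] = {0, 3, 0, 5, 0, 0, 0};
/* heap_only[i] = 1 means item i is an interior chain member, never a root */
static const int heap_only[NITEMS + 1] = {0, 0, 0, 1, 0, 1, 0};

int
main(void)
{
	int		root_offsets[NITEMS + 1] = {0};

	for (int off = 1; off <= NITEMS; off++)
	{
		if (heap_only[off])
			continue;			/* processed when we reach its root */

		/* this item is a root (or a plain, unchained item) */
		root_offsets[off] = off;

		/* follow the chain and point every member back at the root */
		for (int next = next_in_chain[off]; next != 0; next = next_in_chain[next])
			root_offsets[next] = off;
	}

	for (int off = 1; off <= NITEMS; off++)
		printf("item %d -> root %d\n", off, root_offsets[off]);
	return 0;
}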
Example no. 27
/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	Buffer		buf;
	Page		page;
	OffsetNumber offnum;
	ItemPointer current;
	bool		res;

	/* Hash indexes are always lossy since we store only the hash code */
	scan->xs_recheck = true;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine to
	 * get the first item in the scan.
	 */
	current = &(so->hashso_curpos);
	if (ItemPointerIsValid(current))
	{
		/*
		 * An insertion into the current index page could have happened while
		 * we didn't have read lock on it.  Re-find our position by looking
		 * for the TID we previously returned.  (Because we hold a pin on the
		 * primary bucket page, no deletions or splits could have occurred;
		 * therefore we can expect that the TID still exists in the current
		 * index page, at an offset >= where we were.)
		 */
		OffsetNumber maxoffnum;

		buf = so->hashso_curbuf;
		Assert(BufferIsValid(buf));
		page = BufferGetPage(buf);

		/*
		 * We don't need to test for an old snapshot here, as the current
		 * buffer is pinned, so vacuum can't clean the page.
		 */
		maxoffnum = PageGetMaxOffsetNumber(page);
		for (offnum = ItemPointerGetOffsetNumber(current);
			 offnum <= maxoffnum;
			 offnum = OffsetNumberNext(offnum))
		{
			IndexTuple	itup;

			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
			if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
				break;
		}
		if (offnum > maxoffnum)
			elog(ERROR, "failed to re-find scan position within index \"%s\"",
				 RelationGetRelationName(rel));
		ItemPointerSetOffsetNumber(current, offnum);

		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so remember it for later. (We'll deal with all such
			 * tuples at once right after leaving the index page or at the
			 * end of the scan.) If the caller reverses the indexscan
			 * direction, it is quite possible that the same item might
			 * get entered multiple times. But we don't detect that;
			 * instead, we just forget any excess entries.
			 */
			if (so->killedItems == NULL)
				so->killedItems = palloc(MaxIndexTuplesPerPage *
										 sizeof(HashScanPosItem));

			if (so->numKilled < MaxIndexTuplesPerPage)
			{
				so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
				so->killedItems[so->numKilled].indexOffset =
							ItemPointerGetOffsetNumber(&(so->hashso_curpos));
				so->numKilled++;
			}
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}
	else
		res = _hash_first(scan, dir);

	/*
	 * Skip killed tuples if asked to.
	 */
	if (scan->ignore_killed_tuples)
	{
		while (res)
		{
			offnum = ItemPointerGetOffsetNumber(current);
			page = BufferGetPage(so->hashso_curbuf);
			if (!ItemIdIsDead(PageGetItemId(page, offnum)))
				break;
			res = _hash_next(scan, dir);
		}
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);

	/* Return current heap TID on success */
	scan->xs_ctup.t_self = so->hashso_heappos;

	return res;
}
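
The re-find loop above relies on the fact that, while only a pin was held, entries may have been added to the page but the previously returned TID cannot have moved to a lower offset or disappeared. A standalone sketch of that forward search, with a hypothetical tid[] array and find_position() helper:

#include <stdio.h>

#define MAXOFF 8

/* heap TIDs stored at each index offset (1-based); concurrent insertions may
 * push existing entries to higher offsets but never remove them */
static const int tid[MAXOFF + 1] = {0, 11, 14, 17, 21, 25, 31, 33, 40};

static int
find_position(int start_offnum, int remembered_tid)
{
	for (int off = start_offnum; off <= MAXOFF; off++)
	{
		if (tid[off] == remembered_tid)
			return off;
	}
	return -1;					/* corresponds to the elog(ERROR) above */
}

int
main(void)
{
	/* we previously returned TID 25 from offset 4; an insertion has since
	 * pushed it to offset 5 */
	int		off = find_position(4, 25);

	printf("re-found at offset %d\n", off);
	return 0;
}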
Example no. 28
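/*
 * The GiST PickSplit method for tsquery bit signatures.  (Descriptive header
 * added for this listing; it is not part of the original source.)
 */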
Datum
gtsquery_picksplit(PG_FUNCTION_ARGS)
{
	GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
	GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
	OffsetNumber maxoff = entryvec->n - 2;
	OffsetNumber k,
				j;
	TSQuerySign datum_l,
				datum_r;
	int4		size_alpha,
				size_beta;
	int4		size_waste,
				waste = -1;
	int4		nbytes;
	OffsetNumber seed_1 = 0,
				seed_2 = 0;
	OffsetNumber *left,
			   *right;

	SPLITCOST  *costvector;

	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
	left = v->spl_left = (OffsetNumber *) palloc(nbytes);
	right = v->spl_right = (OffsetNumber *) palloc(nbytes);
	v->spl_nleft = v->spl_nright = 0;

	for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k))
		for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j))
		{
			size_waste = hemdist(GETENTRY(entryvec, j), GETENTRY(entryvec, k));
			if (size_waste > waste)
			{
				waste = size_waste;
				seed_1 = k;
				seed_2 = j;
			}
		}


	if (seed_1 == 0 || seed_2 == 0)
	{
		seed_1 = 1;
		seed_2 = 2;
	}

	datum_l = GETENTRY(entryvec, seed_1);
	datum_r = GETENTRY(entryvec, seed_2);

	maxoff = OffsetNumberNext(maxoff);
	costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff);
	for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j))
	{
		costvector[j - 1].pos = j;
		size_alpha = hemdist(GETENTRY(entryvec, seed_1), GETENTRY(entryvec, j));
		size_beta = hemdist(GETENTRY(entryvec, seed_2), GETENTRY(entryvec, j));
		costvector[j - 1].cost = abs(size_alpha - size_beta);
	}
	qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost);

	for (k = 0; k < maxoff; k++)
	{
		j = costvector[k].pos;
		if (j == seed_1)
		{
			*left++ = j;
			v->spl_nleft++;
			continue;
		}
		else if (j == seed_2)
		{
			*right++ = j;
			v->spl_nright++;
			continue;
		}
		size_alpha = hemdist(datum_l, GETENTRY(entryvec, j));
		size_beta = hemdist(datum_r, GETENTRY(entryvec, j));

		if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.05))
		{
			datum_l |= GETENTRY(entryvec, j);
			*left++ = j;
			v->spl_nleft++;
		}
		else
		{
			datum_r |= GETENTRY(entryvec, j);
			*right++ = j;
			v->spl_nright++;
		}
	}

	*right = *left = FirstOffsetNumber;
	v->spl_ldatum = TSQuerySignGetDatum(datum_l);
	v->spl_rdatum = TSQuerySignGetDatum(datum_r);

	PG_RETURN_POINTER(v);
}
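
A standalone sketch of the split strategy above: pick the two most distant bit signatures (by Hamming distance) as seeds, then assign each remaining entry to the closer seed, OR-ing it into that side's union signature. The cost-sorting pass of the real code is omitted; sketch_hemdist(), the sig[] values and the use of the GCC/Clang __builtin_popcount() are assumptions made for the example.

#include <stdio.h>

#define NSIG 6

static const unsigned int sig[NSIG] = {
	0x0000000Fu, 0x0000001Eu, 0xF0000000u,
	0x70000001u, 0x0000003Cu, 0xE0000003u
};

/* Hamming distance between two signatures (popcount of the XOR) */
static int
sketch_hemdist(unsigned int a, unsigned int b)
{
	return __builtin_popcount(a ^ b);
}

int
main(void)
{
	int		seed_1 = 0, seed_2 = 1, waste = -1;

	/* choose the most distant pair as seeds */
	for (int k = 0; k < NSIG; k++)
		for (int j = k + 1; j < NSIG; j++)
		{
			int		d = sketch_hemdist(sig[k], sig[j]);

			if (d > waste)
			{
				waste = d;
				seed_1 = k;
				seed_2 = j;
			}
		}

	unsigned int datum_l = sig[seed_1];
	unsigned int datum_r = sig[seed_2];

	/* assign remaining entries to the closer side */
	for (int j = 0; j < NSIG; j++)
	{
		if (j == seed_1 || j == seed_2)
			continue;
		if (sketch_hemdist(datum_l, sig[j]) < sketch_hemdist(datum_r, sig[j]))
			datum_l |= sig[j];
		else
			datum_r |= sig[j];
	}

	printf("left union %08x, right union %08x\n", datum_l, datum_r);
	return 0;
}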
Example no. 29
/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 *
 * In addition, the caller must have created the new bucket's base page,
 * which is passed in buffer nbuf, pinned and write-locked.  That lock and
 * pin are released here.  (The API is set up this way because we must do
 * _hash_getnewbuf() before releasing the metapage write lock.  So instead of
 * passing the new bucket's start block number, we pass an actual buffer.)
 */
static void
_hash_splitbucket(Relation rel,
				  Buffer metabuf,
				  Bucket obucket,
				  Bucket nbucket,
				  BlockNumber start_oblkno,
				  Buffer nbuf,
				  uint32 maxbucket,
				  uint32 highmask,
				  uint32 lowmask)
{
	Buffer		obuf;
	Page		opage;
	Page		npage;
	HashPageOpaque oopaque;
	HashPageOpaque nopaque;

	/*
	 * It should be okay to simultaneously write-lock pages from each bucket,
	 * since no one else can be trying to acquire buffer lock on pages of
	 * either bucket.
	 */
	obuf = _hash_getbuf(rel, start_oblkno, HASH_WRITE, LH_BUCKET_PAGE);
	opage = BufferGetPage(obuf);
	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

	npage = BufferGetPage(nbuf);

	/* initialize the new bucket's primary page */
	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
	nopaque->hasho_prevblkno = InvalidBlockNumber;
	nopaque->hasho_nextblkno = InvalidBlockNumber;
	nopaque->hasho_bucket = nbucket;
	nopaque->hasho_flag = LH_BUCKET_PAGE;
	nopaque->hasho_page_id = HASHO_PAGE_ID;

	/*
	 * Partition the tuples in the old bucket between the old bucket and the
	 * new bucket, advancing along the old bucket's overflow bucket chain and
	 * adding overflow pages to the new bucket as needed.  Outer loop iterates
	 * once per page in old bucket.
	 */
	for (;;)
	{
		BlockNumber oblkno;
		OffsetNumber ooffnum;
		OffsetNumber omaxoffnum;
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable = 0;

		/* Scan each tuple in old page */
		omaxoffnum = PageGetMaxOffsetNumber(opage);
		for (ooffnum = FirstOffsetNumber;
			 ooffnum <= omaxoffnum;
			 ooffnum = OffsetNumberNext(ooffnum))
		{
			IndexTuple	itup;
			Size		itemsz;
			Bucket		bucket;

			/* skip dead tuples */
			if (ItemIdIsDead(PageGetItemId(opage, ooffnum)))
				continue;

			/*
			 * Fetch the item's hash key (conveniently stored in the item) and
			 * determine which bucket it now belongs in.
			 */
			itup = (IndexTuple) PageGetItem(opage,
											PageGetItemId(opage, ooffnum));
			bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
										  maxbucket, highmask, lowmask);

			if (bucket == nbucket)
			{
				/*
				 * insert the tuple into the new bucket.  if it doesn't fit on
				 * the current page in the new bucket, we must allocate a new
				 * overflow page and place the tuple on that page instead.
				 *
				 * XXX we have a problem here if we fail to get space for a
				 * new overflow page: we'll error out leaving the bucket split
				 * only partially complete, meaning the index is corrupt,
				 * since searches may fail to find entries they should find.
				 */
				itemsz = IndexTupleDSize(*itup);
				itemsz = MAXALIGN(itemsz);

				if (PageGetFreeSpace(npage) < itemsz)
				{
					/* write out nbuf and drop lock, but keep pin */
					_hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
					/* chain to a new overflow page */
					nbuf = _hash_addovflpage(rel, metabuf, nbuf);
					npage = BufferGetPage(nbuf);
					/* we don't need nopaque within the loop */
				}

				/*
				 * Insert tuple on new page, using _hash_pgaddtup to ensure
				 * correct ordering by hashkey.  This is a tad inefficient
				 * since we may have to shuffle itempointers repeatedly.
				 * Possible future improvement: accumulate all the items for
				 * the new page and qsort them before insertion.
				 */
				(void) _hash_pgaddtup(rel, nbuf, itemsz, itup);

				/*
				 * Mark tuple for deletion from old page.
				 */
				deletable[ndeletable++] = ooffnum;
			}
			else
			{
				/*
				 * the tuple stays on this page, so nothing to do.
				 */
				Assert(bucket == obucket);
			}
		}

		oblkno = oopaque->hasho_nextblkno;

		/*
		 * Done scanning this old page.  If we moved any tuples, delete them
		 * from the old page.
		 */
		if (ndeletable > 0)
		{
			PageIndexMultiDelete(opage, deletable, ndeletable);
			_hash_wrtbuf(rel, obuf);
		}
		else
			_hash_relbuf(rel, obuf);

		/* Exit loop if no more overflow pages in old bucket */
		if (!BlockNumberIsValid(oblkno))
			break;

		/* Else, advance to next old page */
		obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
		opage = BufferGetPage(obuf);
		oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
	}

	/*
	 * We're at the end of the old bucket chain, so we're done partitioning
	 * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
	 * tuples remaining in the old bucket (including the overflow pages) are
	 * packed as tightly as possible.  The new bucket is already tight.
	 */
	_hash_wrtbuf(rel, nbuf);

	_hash_squeezebucket(rel, obucket, start_oblkno, NULL);
}
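
The per-tuple decision above amounts to recomputing the key's bucket number with the enlarged masks and moving the tuple only if the result is the new bucket. A standalone sketch, assuming bucket number 6 has just been added so that old bucket 2 is being split into buckets 2 and 6 (maxbucket = 6, highmask = 7, lowmask = 3; illustrative values, not taken from the source):

#include <stdio.h>

static unsigned int
sketch_hashkey2bucket(unsigned int hashkey, unsigned int maxbucket,
					  unsigned int highmask, unsigned int lowmask)
{
	unsigned int bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket &= lowmask;		/* that bucket doesn't exist yet: fold back */
	return bucket;
}

int
main(void)
{
	const unsigned int maxbucket = 6, highmask = 7, lowmask = 3;
	const unsigned int nbucket = 6;		/* new bucket split off from bucket 2 */
	const unsigned int keys[] = {2, 6, 10, 14, 18, 22};	/* all mapped to 2 before */

	for (int i = 0; i < 6; i++)
	{
		unsigned int b = sketch_hashkey2bucket(keys[i], maxbucket,
											   highmask, lowmask);

		printf("key %2u -> bucket %u (%s)\n", keys[i], b,
			   b == nbucket ? "moves to new bucket" : "stays in old bucket");
	}
	return 0;
}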
/*
 * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and locked (either read or write
 * lock is OK).  This lock and pin will be dropped before exiting.
 *
 * The "stack" argument can be a search stack leading (approximately) to the
 * target page, or NULL --- outside callers typically pass NULL since they
 * have not done such a search, but internal recursion cases pass the stack
 * to avoid duplicated search effort.
 *
 * Returns the number of pages successfully deleted (zero if page cannot
 * be deleted now; could be more than one if parent pages were deleted too).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf, BTStack stack)
{
	int			result;
	BlockNumber target,
				leftsib,
				rightsib,
				parent;
	OffsetNumber poffset,
				maxoff;
	uint32		targetlevel,
				ilevel;
	ItemId		itemid;
	IndexTuple	targetkey,
				itup;
	ScanKey		itup_scankey;
	Buffer		lbuf,
				rbuf,
				pbuf;
	bool		parent_half_dead;
	bool		parent_one_child;
	bool		rightsib_empty;
	Buffer		metabuf = InvalidBuffer;
	Page		metapg = NULL;
	BTMetaPageData *metad = NULL;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * We can never delete rightmost pages nor root pages.	While at it, check
	 * that page is not already deleted and is empty.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		/* Should never fail to delete a half-dead page */
		Assert(!P_ISHALFDEAD(opaque));

		_bt_relbuf(rel, buf);
		return 0;
	}

	/*
	 * Save info about page, including a copy of its high key (it must have
	 * one, being non-rightmost).
	 */
	target = BufferGetBlockNumber(buf);
	targetlevel = opaque->btpo.level;
	leftsib = opaque->btpo_prev;
	itemid = PageGetItemId(page, P_HIKEY);
	targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

	/*
	 * To avoid deadlocks, we'd better drop the target page lock before going
	 * further.
	 */
	_bt_relbuf(rel, buf);

	/*
	 * We need an approximate pointer to the page's parent page.  We use the
	 * standard search mechanism to search for the page's high key; this will
	 * give us a link to either the current parent or someplace to its left
	 * (if there are multiple equal high keys).  In recursion cases, the
	 * caller already generated a search stack and we can just re-use that
	 * work.
	 */
	if (stack == NULL)
	{
		if (!InRecovery)
		{
			/* we need an insertion scan key to do our search, so build one */
			itup_scankey = _bt_mkscankey(rel, targetkey);
			/* find the leftmost leaf page containing this key */
			stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
							   &lbuf, BT_READ);
			/* don't need a pin on that either */
			_bt_relbuf(rel, lbuf);

			/*
			 * If we are trying to delete an interior page, _bt_search did
			 * more than we needed.  Locate the stack item pointing to our
			 * parent level.
			 */
			ilevel = 0;
			for (;;)
			{
				if (stack == NULL)
					elog(ERROR, "not enough stack items");
				if (ilevel == targetlevel)
					break;
				stack = stack->bts_parent;
				ilevel++;
			}
		}
		else
		{
			/*
			 * During WAL recovery, we can't use _bt_search (for one reason,
			 * it might invoke user-defined comparison functions that expect
			 * facilities not available in recovery mode).	Instead, just set
			 * up a dummy stack pointing to the left end of the parent tree
			 * level, from which _bt_getstackbuf will walk right to the parent
			 * page.  Painful, but we don't care too much about performance in
			 * this scenario.
			 */
			pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
			stack = (BTStack) palloc(sizeof(BTStackData));
			stack->bts_blkno = BufferGetBlockNumber(pbuf);
			stack->bts_offset = InvalidOffsetNumber;
			/* bts_btentry will be initialized below */
			stack->bts_parent = NULL;
			_bt_relbuf(rel, pbuf);
		}
	}

	/*
	 * We cannot delete a page that is the rightmost child of its immediate
	 * parent, unless it is the only child --- in which case the parent has to
	 * be deleted too, and the same condition applies recursively to it. We
	 * have to check this condition all the way up before trying to delete. We
	 * don't need to re-test when deleting a non-leaf page, though.
	 */
	if (targetlevel == 0 &&
		!_bt_parent_deletion_safe(rel, target, stack))
		return 0;

	/*
	 * We have to lock the pages we need to modify in the standard order:
	 * moving right, then up.  Else we will deadlock against other writers.
	 *
	 * So, we need to find and write-lock the current left sibling of the
	 * target page.  The sibling that was current a moment ago could have
	 * split, so we may have to move right.  This search could fail if either
	 * the sibling or the target page was deleted by someone else meanwhile;
	 * if so, give up.	(Right now, that should never happen, since page
	 * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
	 * concurrently on the same table.)
	 */
	if (leftsib != P_NONE)
	{
		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		while (P_ISDELETED(opaque) || opaque->btpo_next != target)
		{
			/* step right one page */
			leftsib = opaque->btpo_next;
			_bt_relbuf(rel, lbuf);
			if (leftsib == P_NONE)
			{
				elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
					 RelationGetRelationName(rel));
				return 0;
			}
			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
			page = BufferGetPage(lbuf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}
	}
	else
		lbuf = InvalidBuffer;

	/*
	 * Next write-lock the target page itself.	It should be okay to take just
	 * a write lock not a superexclusive lock, since no scans would stop on an
	 * empty page.
	 */
	buf = _bt_getbuf(rel, target, BT_WRITE);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Check page is still empty etc, else abandon deletion.  The empty check
	 * is necessary since someone else might have inserted into it while we
	 * didn't have it locked; the others are just for paranoia's sake.
	 */
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		_bt_relbuf(rel, buf);
		if (BufferIsValid(lbuf))
			_bt_relbuf(rel, lbuf);
		return 0;
	}
	if (opaque->btpo_prev != leftsib)
		elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
			 target, RelationGetRelationName(rel));

	/*
	 * And next write-lock the (current) right sibling.
	 */
	rightsib = opaque->btpo_next;
	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (opaque->btpo_prev != target)
		elog(ERROR, "right sibling's left-link doesn't match: "
			 "block %u links to %u instead of expected %u in index \"%s\"",
			 rightsib, opaque->btpo_prev, target,
			 RelationGetRelationName(rel));

	/*
	 * Next find and write-lock the current parent of the target page. This is
	 * essentially the same as the corresponding step of splitting.
	 */
	ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
	pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
	if (pbuf == InvalidBuffer)
		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
			 RelationGetRelationName(rel), target);
	parent = stack->bts_blkno;
	poffset = stack->bts_offset;

	/*
	 * If the target is the rightmost child of its parent, then we can't
	 * delete, unless it's also the only child --- in which case the parent
	 * changes to half-dead status.  The "can't delete" case should have been
	 * detected by _bt_parent_deletion_safe, so complain if we see it now.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);
	parent_half_dead = false;
	parent_one_child = false;
	if (poffset >= maxoff)
	{
		if (poffset == P_FIRSTDATAKEY(opaque))
			parent_half_dead = true;
		else
			elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
				 target, parent, RelationGetRelationName(rel));
	}
	else
	{
		/* Will there be exactly one child left in this parent? */
		if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
			parent_one_child = true;
	}

	/*
	 * If we are deleting the next-to-last page on the target's level, then
	 * the rightsib is a candidate to become the new fast root. (In theory, it
	 * might be possible to push the fast root even further down, but the odds
	 * of doing so are slim, and the locking considerations daunting.)
	 *
	 * We don't support handling this in the case where the parent is becoming
	 * half-dead, even though it theoretically could occur.
	 *
	 * We can safely acquire a lock on the metapage here --- see comments for
	 * _bt_newroot().
	 */
	if (leftsib == P_NONE && !parent_half_dead)
	{
		page = BufferGetPage(rbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo.level == targetlevel);
		if (P_RIGHTMOST(opaque))
		{
			/* rightsib will be the only one left on the level */
			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			/*
			 * The expected case here is btm_fastlevel == targetlevel+1; if
			 * the fastlevel is <= targetlevel, something is wrong, and we
			 * choose to overwrite it to fix it.
			 */
			if (metad->btm_fastlevel > targetlevel + 1)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}
	}

	/*
	 * Check that the parent-page index items we're about to delete/overwrite
	 * contain what we expect.  This can fail if the index has become
	 * corrupt for some reason.  We want to throw any error before entering
	 * the critical section --- otherwise it'd be a PANIC.
	 *
	 * The test on the target item is just an Assert because _bt_getstackbuf
	 * should have guaranteed it has the expected contents.  The test on the
	 * next-child downlink is known to sometimes fail in the field, though.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

#ifdef USE_ASSERT_CHECKING
	itemid = PageGetItemId(page, poffset);
	itup = (IndexTuple) PageGetItem(page, itemid);
	Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
#endif

	if (!parent_half_dead)
	{
		OffsetNumber nextoffset;

		nextoffset = OffsetNumberNext(poffset);
		itemid = PageGetItemId(page, nextoffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
			elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
				 rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
				 parent, RelationGetRelationName(rel));
	}

	/*
	 * Here we begin doing the deletion.
	 */

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/*
	 * Update parent.  The normal case is a tad tricky because we want to
	 * delete the target's downlink and the *following* key.  Easiest way is
	 * to copy the right sibling's downlink over the target downlink, and then
	 * delete the following item.
	 */
	if (parent_half_dead)
	{
		PageIndexTupleDelete(page, poffset);
		opaque->btpo_flags |= BTP_HALF_DEAD;
	}
	else
	{
		OffsetNumber nextoffset;

		itemid = PageGetItemId(page, poffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);

		nextoffset = OffsetNumberNext(poffset);
		PageIndexTupleDelete(page, nextoffset);
	}

	/*
	 * Update siblings' side-links.  Note the target page's side-links will
	 * continue to point to the siblings.  Asserts here are just rechecking
	 * things we already verified above.
	 */
	if (BufferIsValid(lbuf))
	{
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo_next == target);
		opaque->btpo_next = rightsib;
	}
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->btpo_prev == target);
	opaque->btpo_prev = leftsib;
	rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

	/*
	 * Mark the page itself deleted.  It can be recycled when all current
	 * transactions are gone.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags &= ~BTP_HALF_DEAD;
	opaque->btpo_flags |= BTP_DELETED;
	opaque->btpo.xact = ReadNewTransactionId();

	/* And update the metapage, if needed */
	if (BufferIsValid(metabuf))
	{
		metad->btm_fastroot = rightsib;
		metad->btm_fastlevel = targetlevel;
		MarkBufferDirty(metabuf);
	}

	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(pbuf);
	MarkBufferDirty(rbuf);
	MarkBufferDirty(buf);
	if (BufferIsValid(lbuf))
		MarkBufferDirty(lbuf);

	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		xl_btree_delete_page xlrec;
		xl_btree_metadata xlmeta;
		uint8		xlinfo;
		XLogRecPtr	recptr;
		XLogRecData rdata[5];
		XLogRecData *nextrdata;

		xlrec.target.node = rel->rd_node;
		ItemPointerSet(&(xlrec.target.tid), parent, poffset);
		xlrec.deadblk = target;
		xlrec.leftblk = leftsib;
		xlrec.rightblk = rightsib;
		xlrec.btpo_xact = opaque->btpo.xact;

		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfBtreeDeletePage;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = nextrdata = &(rdata[1]);

		if (BufferIsValid(metabuf))
		{
			xlmeta.root = metad->btm_root;
			xlmeta.level = metad->btm_level;
			xlmeta.fastroot = metad->btm_fastroot;
			xlmeta.fastlevel = metad->btm_fastlevel;

			nextrdata->data = (char *) &xlmeta;
			nextrdata->len = sizeof(xl_btree_metadata);
			nextrdata->buffer = InvalidBuffer;
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			xlinfo = XLOG_BTREE_DELETE_PAGE_META;
		}
		else if (parent_half_dead)
			xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
		else
			xlinfo = XLOG_BTREE_DELETE_PAGE;

		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->next = nextrdata + 1;
		nextrdata->buffer = pbuf;
		nextrdata->buffer_std = true;
		nextrdata++;

		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->buffer = rbuf;
		nextrdata->buffer_std = true;
		nextrdata->next = NULL;

		if (BufferIsValid(lbuf))
		{
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			nextrdata->data = NULL;
			nextrdata->len = 0;
			nextrdata->buffer = lbuf;
			nextrdata->buffer_std = true;
			nextrdata->next = NULL;
		}

		recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

		if (BufferIsValid(metabuf))
		{
			PageSetLSN(metapg, recptr);
			PageSetTLI(metapg, ThisTimeLineID);
		}
		page = BufferGetPage(pbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(rbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(buf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		if (BufferIsValid(lbuf))
		{
			page = BufferGetPage(lbuf);
			PageSetLSN(page, recptr);
			PageSetTLI(page, ThisTimeLineID);
		}
	}

	END_CRIT_SECTION();

	/* release metapage; send out relcache inval if metapage changed */
	if (BufferIsValid(metabuf))
	{
		CacheInvalidateRelcache(rel);
		_bt_relbuf(rel, metabuf);
	}
	/* can always release leftsib immediately */
	if (BufferIsValid(lbuf))
		_bt_relbuf(rel, lbuf);

	/*
	 * If parent became half dead, recurse to delete it. Otherwise, if right
	 * sibling is empty and is now the last child of the parent, recurse to
	 * try to delete it.  (These cases cannot apply at the same time, though
	 * the second case might itself recurse to the first.)
	 *
	 * When recursing to parent, we hold the lock on the target page until
	 * done.  This delays any insertions into the keyspace that was just
	 * effectively reassigned to the parent's right sibling.  If we allowed
	 * that, and there were enough such insertions before we finish deleting
	 * the parent, page splits within that keyspace could lead to inserting
	 * out-of-order keys into the grandparent level.  It is thought that that
	 * wouldn't have any serious consequences, but it still seems like a
	 * pretty bad idea.
	 */
	if (parent_half_dead)
	{
		/* recursive call will release pbuf */
		_bt_relbuf(rel, rbuf);
		result = _bt_pagedel(rel, pbuf, stack->bts_parent) + 1;
		_bt_relbuf(rel, buf);
	}
	else if (parent_one_child && rightsib_empty)
	{
		_bt_relbuf(rel, pbuf);
		_bt_relbuf(rel, buf);
		/* recursive call will release rbuf */
		result = _bt_pagedel(rel, rbuf, stack) + 1;
	}
	else
	{
		_bt_relbuf(rel, pbuf);
		_bt_relbuf(rel, buf);
		_bt_relbuf(rel, rbuf);
		result = 1;
	}

	return result;
}
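
The side-link update in the middle of this function is the crux of the header comment's remark about "not touching its own left and right links": the siblings are spliced around the target, but the target keeps its own links so that a concurrent reader still holding it can escape via its right-link. A minimal standalone sketch with an invented SketchPage list:

#include <stdio.h>
#include <stddef.h>

typedef struct SketchPage
{
	int					blkno;
	struct SketchPage  *prev;
	struct SketchPage  *next;
	int					deleted;
} SketchPage;

static void
sketch_unlink(SketchPage *target)
{
	/* repoint the siblings around the target... */
	if (target->prev)
		target->prev->next = target->next;
	if (target->next)
		target->next->prev = target->prev;

	/* ...but deliberately keep target->prev / target->next as they are */
	target->deleted = 1;
}

int
main(void)
{
	SketchPage a = {1, NULL, NULL, 0};
	SketchPage b = {2, NULL, NULL, 0};
	SketchPage c = {3, NULL, NULL, 0};

	a.next = &b;
	b.prev = &a;
	b.next = &c;
	c.prev = &b;

	sketch_unlink(&b);			/* delete the middle page */

	/* a reader that still holds b can move right to c and keep going */
	printf("a.next = %d, c.prev = %d, b.next still = %d\n",
		   a.next->blkno, c.prev->blkno, b.next->blkno);
	return 0;
}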