Example #1
/*
 * Mask a btree page before performing consistency checks on it.
 */
void
btree_mask(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	BTPageOpaque maskopaq;

	mask_page_lsn_and_checksum(page);

	mask_page_hint_bits(page);
	mask_unused_space(page);

	maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);

	if (P_ISDELETED(maskopaq))
	{
		/*
		 * Mask page content on a DELETED page since it will be re-initialized
		 * during replay. See btree_xlog_unlink_page() for details.
		 */
		mask_page_content(page);
	}
	else if (P_ISLEAF(maskopaq))
	{
		/*
		 * In btree leaf pages, it is possible to modify the LP_FLAGS without
		 * emitting any WAL record. Hence, mask the line pointer flags. See
		 * _bt_killitems(), _bt_check_unique() for details.
		 */
		mask_lp_flags(page);
	}

	/*
	 * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See
	 * _bt_killitems(), _bt_check_unique() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;

	/*
	 * During replay of a btree page split, we don't set the BTP_SPLIT_END
	 * flag of the right sibling and initialize the cycle_id to 0 for the same
	 * page. See btree_xlog_split() for details.
	 */
	maskopaq->btpo_flags &= ~BTP_SPLIT_END;
	maskopaq->btpo_cycleid = 0;
}
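
For context, masking like this is what enables WAL consistency checking: the primary page and the page reconstructed by redo are both masked, then compared byte for byte. Below is a minimal sketch of that comparison, assuming the usual PostgreSQL backend headers; the function and buffer names are illustrative, not from the original source.

static void
compare_masked_pages(char *primary_page, char *replay_page, BlockNumber blkno)
{
	btree_mask(primary_page, blkno);	/* mask LSN/checksum, hints, holes */
	btree_mask(replay_page, blkno);

	if (memcmp(primary_page, replay_page, BLCKSZ) != 0)
		elog(FATAL, "inconsistent btree page found, block %u", blkno);
}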
Example #2
/*
 * Add an item to a page being built.
 *
 * The main difference between this routine and a bare PageAddItem call
 * is that this code knows that the leftmost data item on a non-leaf
 * btree page doesn't need to have a key.  Therefore, it strips such
 * items down to just the item header.
 *
 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
 * that because it assumes that P_RIGHTMOST() will return the correct
 * answer for the page.  Here, we don't know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always the first data key.
 */
static void
_bt_sortaddtup(Page page,
			   Size itemsize,
			   IndexTuple itup,
			   OffsetNumber itup_off)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	IndexTupleData trunctuple;

	if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
	{
		trunctuple = *itup;
		trunctuple.t_info = sizeof(IndexTupleData);
		itup = &trunctuple;
		itemsize = sizeof(IndexTupleData);
	}

	if (PageAddItem(page, (Item) itup, itemsize, itup_off,
					false, false) == InvalidOffsetNumber)
		elog(ERROR, "failed to add item to the index page");
}
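
A quick way to see the convention this enforces: on an internal page the first data item carries no key, so its line pointer length is just the bare tuple header. An illustrative checker (not from the original source, assuming the usual nbtree headers):

static bool
first_item_is_minus_infinity(Page page)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	ItemId		itemid;

	if (P_ISLEAF(opaque))
		return false;			/* the convention applies to internal pages */

	itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
	return ItemIdGetLength(itemid) == sizeof(IndexTupleData);
}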
Example #3
/*
 * pgstat_btree_page -- check tuples in a btree page
 */
static void
pgstat_btree_page(pgstattuple_type *stat, Relation rel, BlockNumber blkno,
				  BufferAccessStrategy bstrategy)
{
	Buffer		buf;
	Page		page;

	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);

	/* Page is valid, see what to do with it */
	if (PageIsNew(page))
	{
		/* fully empty page */
		stat->free_space += BLCKSZ;
	}
	else
	{
		BTPageOpaque opaque;

		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (opaque->btpo_flags & (BTP_DELETED | BTP_HALF_DEAD))
		{
			/* recyclable page */
			stat->free_space += BLCKSZ;
		}
		else if (P_ISLEAF(opaque))
		{
			pgstat_index_page(stat, page, P_FIRSTDATAKEY(opaque),
							  PageGetMaxOffsetNumber(page));
		}
		else
		{
			/* root or node */
		}
	}

	_bt_relbuf(rel, buf);
}
Example #4
/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * Return value is a stack of parent-page pointers.  *bufP is set to the
 * address of the leaf-page buffer, which is read-locked and pinned.
 * No locks are held on the parent pages, however!
 *
 * NOTE that the returned buffer is read-locked regardless of the access
 * parameter.  However, access = BT_WRITE will allow an empty root page
 * to be created and returned.	When access = BT_READ, an empty index
 * will result in *bufP being set to InvalidBuffer.
 */
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey,
		   Buffer *bufP, int access)
{
	BTStack		stack_in = NULL;

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		BTItem		btitem;
		IndexTuple	itup;
		BlockNumber blkno;
		BlockNumber par_blkno;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read
		 * its pointer in the parent (or metapage).  If it has, we may
		 * need to move right to its new sibling.  Do that.
		 */
		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate item on the internal page, and get the
		 * child page that it points to.
		 */
		offnum = _bt_binsrch(rel, *bufP, keysz, scankey);
		itemid = PageGetItemId(page, offnum);
		btitem = (BTItem) PageGetItem(page, itemid);
		itup = &(btitem->bti_itup);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		par_blkno = BufferGetBlockNumber(*bufP);

		/*
		 * We need to save the location of the index entry we chose in the
		 * parent page on a stack. In case we split the tree, we'll use
		 * the stack to work back up to the parent page.  We also save the
		 * actual downlink (TID) to uniquely identify the index entry, in
		 * case it moves right while we're working lower in the tree.  See
		 * the paper by Lehman and Yao for how this is detected and
		 * handled. (We use the child link to disambiguate duplicate keys
		 * in the index -- Lehman and Yao disallow duplicate keys.)
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = par_blkno;
		new_stack->bts_offset = offnum;
		memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
		new_stack->bts_parent = stack_in;

		/* drop the read lock on the parent page, acquire one on the child */
		_bt_relbuf(rel, *bufP);
		*bufP = _bt_getbuf(rel, blkno, BT_READ);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	return stack_in;
}
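
A sketch of the caller-side contract (illustrative, not from the original source): use the read-locked leaf, release its buffer, then free the parent-pointer stack with _bt_freestack().

static void
search_and_release(Relation rel, int keysz, ScanKey scankey)
{
	Buffer		buf;
	BTStack		stack;

	stack = _bt_search(rel, keysz, scankey, &buf, BT_READ);
	if (BufferIsValid(buf))
	{
		/* ... inspect the read-locked, pinned leaf page here ... */
		_bt_relbuf(rel, buf);
	}
	if (stack)
		_bt_freestack(stack);
}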
Example #5
/*----------
 *	_bt_compare() -- Compare scankey to a particular tuple on the page.
 *
 *	keysz: number of key conditions to be checked (might be less than the
 *	total length of the scan key!)
 *	page/offnum: location of btree item to be compared to.
 *
 *		This routine returns:
 *			<0 if scankey < tuple at offnum;
 *			 0 if scankey == tuple at offnum;
 *			>0 if scankey > tuple at offnum.
 *		NULLs in the keys are treated as sortable values.  Therefore
 *		"equality" does not necessarily mean that the item should be
 *		returned to the caller as a matching key!
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored (if any, which there probably isn't)
 * does not matter.  This convention allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first key.
 * See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
			int keysz,
			ScanKey scankey,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	BTItem		btitem;
	IndexTuple	itup;
	int			i;

	/*
	 * Force result ">" if target item is first data item on an internal
	 * page --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
	itup = &(btitem->bti_itup);

	/*
	 * The scan key is set up with the attribute number associated with
	 * each term in the key.  It is important that, if the index is
	 * multi-key, the scan contain the first k key attributes, and that
	 * they be in order.  If you think about how multi-key ordering works,
	 * you'll understand why this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right
	 * (see _bt_first).
	 */

	for (i = 0; i < keysz; i++)
	{
		ScanKey		entry = &scankey[i];
		Datum		datum;
		bool		isNull;
		int32		result;

		datum = index_getattr(itup, entry->sk_attno, itupdesc, &isNull);

		/* see comments about NULLs handling in btbuild */
		if (entry->sk_flags & SK_ISNULL)		/* key is NULL */
		{
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			result = -1;		/* NOT_NULL "<" NULL */
		}
		else
		{
			result = DatumGetInt32(FunctionCall2(&entry->sk_func,
												 entry->sk_argument,
												 datum));
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;
	}

	/* if we get here, the keys are equal */
	return 0;
}
Example #6
File: index.c  Project: fdr/pg_check
uint32 check_index_page(Relation rel, PageHeader header, char *buffer, int block) {
  
	uint32 nerrs = 0;
	BTPageOpaque opaque = NULL;
	
	/* check basic page header */
	nerrs += check_page_header(header, block);
	
	/* (block==0) means it's a meta-page, otherwise it's a regular index-page */
	if (block == BTREE_METAPAGE) {
		
		BTMetaPageData * mpdata = BTPageGetMeta(buffer);
	  
		ereport(DEBUG2, (errmsg("[%d] is a meta-page [magic=%d, version=%d]", block, mpdata->btm_magic, mpdata->btm_version)));
		
		if (mpdata->btm_magic != BTREE_MAGIC) {
			ereport(WARNING,(errmsg("[%d] metapage contains invalid magic number %d (should be %d)", block, mpdata->btm_magic, BTREE_MAGIC)));
			nerrs++;
		}
		
		if (mpdata->btm_version != BTREE_VERSION) {
			ereport(WARNING,(errmsg("[%d] metapage contains invalid version %d (should be %d)", block, mpdata->btm_version, BTREE_VERSION)));
			nerrs++;
		}
		
		/* FIXME Check that the btm_root/btm_fastroot is between 1 and number of index blocks */
		/* FIXME Check that the btm_level/btm_fastlevel is equal to the level of the root block */
		
	} else {
	  
		opaque = (BTPageOpaque)(buffer + header->pd_special);
	
		/* check there's enough space for index-relevant data */
		if (header->pd_special > BLCKSZ - sizeof(BTPageOpaque)) {
			ereport(WARNING,
					(errmsg("[%d] there's not enough special space for index data (%d > %d)",
							block,
							(int) sizeof(BTPageOpaque),
							BLCKSZ - header->pd_special)));
			nerrs++;
		}
		
		/*
		 * if the page is a leaf page, then level needs to be 0. Otherwise,
		 * it should be > 0. Deleted pages don't have a level, the level
		 * field is interleaved with an xid.
		 */
		if (!P_ISDELETED(opaque))
		{
			if (P_ISLEAF(opaque))
			{
				if (opaque->btpo.level != 0) {
					ereport(WARNING,
							(errmsg("[%d] is leaf page, but level %d is not zero",
									block, opaque->btpo.level)));
					nerrs++;
				}
			}
			else
			{
				if (opaque->btpo.level == 0) {
					ereport(WARNING,
							(errmsg("[%d] is a non-leaf page, but level is zero",
									block)));
					nerrs++;
				}
			}
		}
	}
	
	return nerrs;
}
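
A hypothetical driver loop for this checker (illustrative; pg_check's actual entry point may differ): copy each block into a local buffer under a share lock, then run the per-page check against the copy.

static uint32
check_index(Relation rel)
{
	uint32		nerrs = 0;
	char		buffer[BLCKSZ];
	BlockNumber blkno;
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBuffer(rel, blkno);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		memcpy(buffer, BufferGetPage(buf), BLCKSZ);
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buf);

		nerrs += check_index_page(rel, (PageHeader) buffer, buffer, blkno);
	}
	return nerrs;
}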
Example #7
File: nbtree.c  Project: LJoNe/gpdb
/*
 * For a newly inserted heap tid, check if an entry with this tid
 * already exists in a unique index.  If it does, abort the inserting
 * transaction.
 */
static void
_bt_validate_tid(Relation irel, ItemPointer h_tid)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber blkno;
	BlockNumber num_pages;
	Buffer buf;
	Page page;
	BTPageOpaque opaque;
	IndexTuple itup;
	OffsetNumber maxoff,
			minoff,
			offnum;

	elog(DEBUG1, "validating tid (%d,%d) for index (%s)",
		 ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid),
		 RelationGetRelationName(irel));

	blkno = BTREE_METAPAGE + 1;
	num_pages = RelationGetNumberOfBlocks(irel);

	MIRROREDLOCK_BUFMGR_LOCK;
	for (; blkno < num_pages; blkno++)
	{
		buf = ReadBuffer(irel, blkno);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (!PageIsNew(page))
			_bt_checkpage(irel, buf);
		if (P_ISLEAF(opaque))
		{
			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(page);
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				if (ItemPointerEquals(&itup->t_tid, h_tid))
				{
					Form_pg_attribute key_att = RelationGetDescr(irel)->attrs[0];
					Oid key = InvalidOid;
					bool isnull;
					if (key_att->atttypid == OIDOID)
					{
						key = DatumGetObjectId(
								index_getattr(itup, 1, RelationGetDescr(irel), &isnull));
						elog(ERROR, "found tid (%d,%d), %s (%d) already in index (%s)",
							 ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid),
							 NameStr(key_att->attname), key, RelationGetRelationName(irel));
					}
					else
					{
						elog(ERROR, "found tid (%d,%d) already in index (%s)",
							 ItemPointerGetBlockNumber(h_tid), ItemPointerGetOffsetNumber(h_tid),
							 RelationGetRelationName(irel));
					}
				}
			}
		}
		ReleaseBuffer(buf);
	}
	MIRROREDLOCK_BUFMGR_UNLOCK;
}
Example #8
File: nbtree.c  Project: LJoNe/gpdb
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page. We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	
	// -------- MirroredLock ----------
	MIRROREDLOCK_BUFMGR_LOCK;
	
	buf = ReadBufferWithStrategy(rel, blkno, info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (!PageIsNew(page))
		_bt_checkpage(rel, buf);

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty). In
	 * particular, we don't want to risk adding it to freePages twice.
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);

			MIRROREDLOCK_BUFMGR_UNLOCK;
			// -------- MirroredLock ----------

			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		if (vstate->nFreePages < vstate->maxFreePages)
			vstate->freePages[vstate->nFreePages++] = blkno;
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Check whether we need to recurse back to earlier pages.	What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted
		 * according to the callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems() call
		 * per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			_bt_delitems(rel, buf, deletable, ndeletable, true);
			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems takes care
			 * of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				SetBufferCommitInfoNeedsSave(buf);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples. We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf, NULL, info->vacuum_full);

		/* count only this page, else may double-count parent */
		if (ndel)
			stats->pages_deleted++;

		/*
		 * During VACUUM FULL it's okay to recycle deleted pages immediately,
		 * since there can be no other transactions scanning the index.  Note
		 * that we will only recycle the current page and not any parent pages
		 * that _bt_pagedel might have recursed to; this seems reasonable in
		 * the name of simplicity.	(Trying to do otherwise would mean we'd
		 * have to sort the list of recyclable pages we're building.)
		 */
		if (ndel && info->vacuum_full)
		{
			if (vstate->nFreePages < vstate->maxFreePages)
				vstate->freePages[vstate->nFreePages++] = blkno;
			vstate->totFreePages++;
		}

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	
	MIRROREDLOCK_BUFMGR_UNLOCK;
	// -------- MirroredLock ----------
	
	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array). A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}
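
For orientation, the outer loop that drives this function is tiny. A simplified sketch of btvacuumscan()'s scan phase (the real function also handles the metapage and the freePages bookkeeping):

static void
btvacuumscan_sketch(BTVacState *vstate, BlockNumber num_pages)
{
	BlockNumber blkno;

	/*
	 * blkno == orig_blkno on these calls; btvacuumpage() recurses by
	 * itself when a concurrent split moved tuples to a lower block.
	 */
	for (blkno = BTREE_METAPAGE + 1; blkno < num_pages; blkno++)
		btvacuumpage(vstate, blkno, blkno);
}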
Example #9
/*----------
 *	_bt_compare() -- Compare scankey to a particular tuple on the page.
 *
 * The passed scankey must be an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 *	keysz: number of key conditions to be checked (might be less than the
 *		number of index columns!)
 *	page/offnum: location of btree item to be compared to.
 *
 *		This routine returns:
 *			<0 if scankey < tuple at offnum;
 *			 0 if scankey == tuple at offnum;
 *			>0 if scankey > tuple at offnum.
 *		NULLs in the keys are treated as sortable values.  Therefore
 *		"equality" does not necessarily mean that the item should be
 *		returned to the caller as a matching key!
 *
 * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
 * "minus infinity": this routine will always claim it is less than the
 * scankey.  The actual key value stored (if any, which there probably isn't)
 * does not matter.  This convention allows us to implement the Lehman and
 * Yao convention that the first down-link pointer is before the first key.
 * See backend/access/nbtree/README for details.
 *----------
 */
int32
_bt_compare(Relation rel,
			int keysz,
			ScanKey scankey,
			Page page,
			OffsetNumber offnum)
{
	TupleDesc	itupdesc = RelationGetDescr(rel);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	IndexTuple	itup;
	int			i;

	/*
	 * Force result ">" if target item is first data item on an internal page
	 * --- see NOTE above.
	 */
	if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
		return 1;

	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

	/*
	 * The scan key is set up with the attribute number associated with each
	 * term in the key.  It is important that, if the index is multi-key, the
	 * scan contain the first k key attributes, and that they be in order.	If
	 * you think about how multi-key ordering works, you'll understand why
	 * this is.
	 *
	 * We don't test for violation of this condition here, however.  The
	 * initial setup for the index scan had better have gotten it right (see
	 * _bt_first).
	 */

	for (i = 1; i <= keysz; i++)
	{
		Datum		datum;
		bool		isNull;
		int32		result;

		datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

		/* see comments about NULLs handling in btbuild */
		if (scankey->sk_flags & SK_ISNULL)		/* key is NULL */
		{
			if (isNull)
				result = 0;		/* NULL "=" NULL */
			else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = -1;	/* NULL "<" NOT_NULL */
			else
				result = 1;		/* NULL ">" NOT_NULL */
		}
		else if (isNull)		/* key is NOT_NULL and item is NULL */
		{
			if (scankey->sk_flags & SK_BT_NULLS_FIRST)
				result = 1;		/* NOT_NULL ">" NULL */
			else
				result = -1;	/* NOT_NULL "<" NULL */
		}
		else
		{
			/*
			 * The sk_func needs to be passed the index value as left arg and
			 * the sk_argument as right arg (they might be of different
			 * types).	Since it is convenient for callers to think of
			 * _bt_compare as comparing the scankey to the index item, we have
			 * to flip the sign of the comparison result.  (Unless it's a DESC
			 * column, in which case we *don't* flip the sign.)
			 */
			result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
													 scankey->sk_collation,
													 datum,
													 scankey->sk_argument));

			if (!(scankey->sk_flags & SK_BT_DESC))
				result = -result;
		}

		/* if the keys are unequal, return the difference */
		if (result != 0)
			return result;

		scankey++;
	}

	/* if we get here, the keys are equal */
	return 0;
}
Example #10
File: btreefuncs.c  Project: colinet/sqlix
/* -------------------------------------------------
 * GetBTPageStatistics()
 *
 * Collect statistics of single b-tree page
 * -------------------------------------------------
 */
static void
GetBTPageStatistics(block_t blkno, buf_id_t buffer, BTPageStat *stat)
{
	page_p		page = BUF_PAGE(buffer);
	struct page_hdr *	phdr = (struct page_hdr *) page;
	item_id_t maxoff = PAGE_MAX_ITEM_ID(page);
	struct bt_page_opaque * opaque = (struct bt_page_opaque *) PAGE_SPECIAL_PTR(page);
	int			item_size = 0;
	int			off;

	stat->blkno = blkno;

	stat->max_avail = BLK_SZ - (BLK_SZ - phdr->pd_special + PAGE_HDR_SZ);

	stat->dead_items = stat->live_items = 0;

	stat->page_size = PAGE_SZ(page);

	/* page type (flags) */
	if (P_ISDELETED(opaque))
	{
		stat->type = 'd';
		stat->btpo.xact = opaque->btpo.xact;
		return;
	}
	else if (P_IGNORE(opaque))
		stat->type = 'e';
	else if (P_ISLEAF(opaque))
		stat->type = 'l';
	else if (P_ISROOT(opaque))
		stat->type = 'r';
	else
		stat->type = 'i';

	/* btpage opaque data */
	stat->btpo_prev = opaque->btpo_prev;
	stat->btpo_next = opaque->btpo_next;
	stat->btpo.level = opaque->btpo.level;
	stat->btpo_flags = opaque->btpo_flags;
	stat->btpo_cycleid = opaque->btpo_cycleid;

	/* count live and dead tuples, and free space */
	for (off = FIRST_ITEM_ID; off <= maxoff; off++)
	{
		struct index_tuple *	itup;

		struct item_id *		id = PAGE_ITEM_ID(page, off);

		itup = (struct index_tuple *) PAGE_GET_ITEM(page, id);

		item_size += INDEX_TUPLE_SZ(itup);

		if (!ITEMID_DEAD(id))
			stat->live_items++;
		else
			stat->dead_items++;
	}
	stat->free_size = page_free_space(page);

	if ((stat->live_items + stat->dead_items) > 0)
		stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
	else
		stat->avg_item_size = 0;
}
Example #11
/* -------------------------------------------------
 * GetBTPageStatistics()
 *
 * Collect statistics of single b-tree page
 * -------------------------------------------------
 */
static void
GetBTPageStatistics(BlockNumber blkno, Buffer buffer, BTPageStat *stat)
{
	Page		page = BufferGetPage(buffer);
	PageHeader	phdr = (PageHeader) page;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	int			item_size = 0;
	int			off;

	stat->blkno = blkno;

	stat->max_avail = BLCKSZ - (BLCKSZ - phdr->pd_special + SizeOfPageHeaderData);

	stat->dead_items = stat->live_items = 0;

	stat->page_size = PageGetPageSize(page);

	/* page type (flags) */
	if (P_ISDELETED(opaque))
	{
		stat->type = 'd';
		stat->btpo.xact = opaque->btpo.xact;
		return;
	}
	else if (P_IGNORE(opaque))
		stat->type = 'e';
	else if (P_ISLEAF(opaque))
		stat->type = 'l';
	else if (P_ISROOT(opaque))
		stat->type = 'r';
	else
		stat->type = 'i';

	/* btpage opaque data */
	stat->btpo_prev = opaque->btpo_prev;
	stat->btpo_next = opaque->btpo_next;
	stat->btpo.level = opaque->btpo.level;
	stat->btpo_flags = opaque->btpo_flags;
	stat->btpo_cycleid = opaque->btpo_cycleid;

	/* count live and dead tuples, and free space */
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		IndexTuple	itup;

		ItemId		id = PageGetItemId(page, off);

		itup = (IndexTuple) PageGetItem(page, id);

		item_size += IndexTupleSize(itup);

		if (!ItemIdIsDead(id))
			stat->live_items++;
		else
			stat->dead_items++;
	}
	stat->free_size = PageGetFreeSpace(page);

	if ((stat->live_items + stat->dead_items) > 0)
		stat->avg_item_size = item_size / (stat->live_items + stat->dead_items);
	else
		stat->avg_item_size = 0;
}
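
An illustrative call pattern (a sketch of how a caller such as pageinspect's bt_page_stats() drives this function): read and share-lock one block, collect its statistics, then release.

static void
stat_one_page(Relation rel, BlockNumber blkno, BTPageStat *stat)
{
	Buffer		buffer = ReadBuffer(rel, blkno);

	LockBuffer(buffer, BUFFER_LOCK_SHARE);
	GetBTPageStatistics(blkno, buffer, stat);
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buffer);
}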
Example #12
File: indexscan.c  Project: AnLingm/gpdb
Datum
readindex(PG_FUNCTION_ARGS)
{
	FuncCallContext	   *funcctx;
	readindexinfo	   *info;
	Relation	irel = NULL;
	Relation	hrel = NULL;

	MIRROREDLOCK_BUFMGR_DECLARE;

	if (SRF_IS_FIRSTCALL())
	{
		Oid		irelid = PG_GETARG_OID(0);
		TupleDesc	tupdesc;
		MemoryContext oldcontext;
		AttrNumber		outattnum;
		TupleDesc	itupdesc;
		int			i;
		AttrNumber	attno;

		irel = index_open(irelid, AccessShareLock);
		itupdesc = RelationGetDescr(irel);
		outattnum = FIXED_COLUMN + itupdesc->natts;

		funcctx = SRF_FIRSTCALL_INIT();
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		tupdesc = CreateTemplateTupleDesc(outattnum, false);
		attno = 1;
		TupleDescInitEntry(tupdesc, attno++, "ictid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hctid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "aotid", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "istatus", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hstatus", TEXTOID, -1, 0);

		for (i = 0; i < itupdesc->natts; i++)
		{
			Form_pg_attribute attr = itupdesc->attrs[i];
			TupleDescInitEntry(tupdesc, attno++, NameStr(attr->attname), attr->atttypid, attr->atttypmod, 0);
		}

		funcctx->tuple_desc = BlessTupleDesc(tupdesc);
		info = (readindexinfo *) palloc(sizeof(readindexinfo));
		funcctx->user_fctx = (void *) info;

		info->outattnum = outattnum;
		info->ireloid = irelid;

		hrel = relation_open(irel->rd_index->indrelid, AccessShareLock);
		if (hrel->rd_rel != NULL &&
			(hrel->rd_rel->relstorage == 'a' ||
			 hrel->rd_rel->relstorage == 'c'))
		{
			relation_close(hrel, AccessShareLock);
			hrel = NULL;
			info->hreloid = InvalidOid;
		}
		else
			info->hreloid = irel->rd_index->indrelid;
		info->num_pages = RelationGetNumberOfBlocks(irel);
		info->blkno = BTREE_METAPAGE + 1;
		info->page = NULL;

		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();
	info = (readindexinfo *) funcctx->user_fctx;

	/*
	 * Open the relations (on first call, we did that above already).
	 * We unfortunately have to look up the relcache entry on every call,
	 * because if we store it in the cross-call context, we won't get a
	 * chance to release it if the function isn't run to completion,
	 * e.g. because of a LIMIT clause. We only lock the relation on the
	 * first call, and keep the lock until completion, however.
	 */
	if (!irel)
		irel = index_open(info->ireloid, NoLock);
	if (!hrel && info->hreloid != InvalidOid)
		hrel = heap_open(info->hreloid, NoLock);

	while (info->blkno < info->num_pages)
	{
		Datum		values[255];
		bool		nulls[255];
		ItemPointerData		itid;
		HeapTuple	tuple;
		Datum		result;

		if (info->page == NULL)
		{
			Buffer		buf;

			/*
			 * Make copy of the page, because we cannot hold a buffer pin
			 * across calls (we wouldn't have a chance to release it, if the
			 * function isn't run to completion.)
			 */
			info->page = palloc(BLCKSZ);

			MIRROREDLOCK_BUFMGR_LOCK;
			buf = ReadBuffer(irel, info->blkno);
			memcpy(info->page, BufferGetPage(buf), BLCKSZ);
			ReleaseBuffer(buf);
			MIRROREDLOCK_BUFMGR_UNLOCK;

			info->opaque = (BTPageOpaque) PageGetSpecialPointer(info->page);
			info->minoff = P_FIRSTDATAKEY(info->opaque);
			info->maxoff = PageGetMaxOffsetNumber(info->page);
			info->offnum = info->minoff;
		}
		if (!P_ISLEAF(info->opaque) || info->offnum > info->maxoff)
		{
			pfree(info->page);
			info->page = NULL;
			info->blkno++;
			continue;
		}

		MemSet(nulls, false, info->outattnum * sizeof(bool));

		ItemPointerSet(&itid, info->blkno, info->offnum);
		values[0] = ItemPointerGetDatum(&itid);
		readindextuple(info, irel, hrel, values, nulls);

		info->offnum = OffsetNumberNext(info->offnum);

		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
		result = HeapTupleGetDatum(tuple);

		if (hrel != NULL)
			heap_close(hrel, NoLock);
		index_close(irel, NoLock);

		SRF_RETURN_NEXT(funcctx, result);
	}

	if (hrel != NULL)
		heap_close(hrel, AccessShareLock);
	index_close(irel, AccessShareLock);
	SRF_RETURN_DONE(funcctx);
}
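
readindex() follows PostgreSQL's standard set-returning-function (SRF) protocol. A self-contained skeleton of that protocol (illustrative; this toy function returns the integers 1..3):

#include "postgres.h"
#include "funcapi.h"

PG_FUNCTION_INFO_V1(count_to_three);

Datum
count_to_three(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;

	if (SRF_IS_FIRSTCALL())
		funcctx = SRF_FIRSTCALL_INIT();		/* one-time setup goes here */

	funcctx = SRF_PERCALL_SETUP();

	if (funcctx->call_cntr < 3)
		SRF_RETURN_NEXT(funcctx,
						Int32GetDatum((int32) funcctx->call_cntr + 1));

	SRF_RETURN_DONE(funcctx);
}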
Example #13
/* ------------------------------------------------------
 * pgstatindex()
 *
 * Usage: SELECT * FROM pgstatindex('t1_pkey');
 * ------------------------------------------------------
 */
Datum
pgstatindex(PG_FUNCTION_ARGS)
{
	text	   *relname = PG_GETARG_TEXT_P(0);
	Relation	rel;
	RangeVar   *relrv;
	Datum		result;
	BlockNumber nblocks;
	BlockNumber blkno;
	BTIndexStat indexStat;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to use pgstattuple functions"))));

	relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
	rel = relation_openrv(relrv, AccessShareLock);

	if (!IS_INDEX(rel) || !IS_BTREE(rel))
		elog(ERROR, "relation \"%s\" is not a btree index",
			 RelationGetRelationName(rel));

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));

	/*
	 * Read metapage
	 */
	{
		Buffer		buffer = ReadBuffer(rel, 0);
		Page		page = BufferGetPage(buffer);
		BTMetaPageData *metad = BTPageGetMeta(page);

		indexStat.version = metad->btm_version;
		indexStat.level = metad->btm_level;
		indexStat.root_blkno = metad->btm_root;

		ReleaseBuffer(buffer);
	}

	/* -- init counters -- */
	indexStat.root_pages = 0;
	indexStat.internal_pages = 0;
	indexStat.leaf_pages = 0;
	indexStat.empty_pages = 0;
	indexStat.deleted_pages = 0;

	indexStat.max_avail = 0;
	indexStat.free_space = 0;

	indexStat.fragments = 0;

	/*
	 * Scan all blocks except the metapage
	 */
	nblocks = RelationGetNumberOfBlocks(rel);

	for (blkno = 1; blkno < nblocks; blkno++)
	{
		Buffer		buffer;
		Page		page;
		BTPageOpaque opaque;

		/* Read and lock buffer */
		buffer = ReadBuffer(rel, blkno);
		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* Determine page type, and update totals */

		if (P_ISLEAF(opaque))
		{
			int			max_avail;

			max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData);
			indexStat.max_avail += max_avail;
			indexStat.free_space += PageGetFreeSpace(page);

			indexStat.leaf_pages++;

			/*
			 * If the next leaf is on an earlier block, the index is
			 * fragmented.
			 */
			if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
				indexStat.fragments++;
		}
		else if (P_ISDELETED(opaque))
			indexStat.deleted_pages++;
		else if (P_IGNORE(opaque))
			indexStat.empty_pages++;
		else if (P_ISROOT(opaque))
			indexStat.root_pages++;
		else
			indexStat.internal_pages++;

		/* Unlock and release buffer */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buffer);
	}

	relation_close(rel, AccessShareLock);

	/*----------------------------
	 * Build a result tuple
	 *----------------------------
	 */
	{
		TupleDesc	tupleDesc;
		int			j;
		char	   *values[10];
		HeapTuple	tuple;

		/* Build a tuple descriptor for our result type */
		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		j = 0;
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", indexStat.version);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%d", indexStat.level);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT,
				 (indexStat.root_pages +
				  indexStat.leaf_pages +
				  indexStat.internal_pages +
				  indexStat.deleted_pages +
				  indexStat.empty_pages) * BLCKSZ);
		values[j] = palloc(32);
		snprintf(values[j++], 32, "%u", indexStat.root_blkno);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.internal_pages);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.leaf_pages);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.empty_pages);
		values[j] = palloc(32);
		snprintf(values[j++], 32, INT64_FORMAT, indexStat.deleted_pages);
		values[j] = palloc(32);
		if (indexStat.max_avail > 0)
			snprintf(values[j++], 32, "%.2f",
					 100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0);
		else
			snprintf(values[j++], 32, "NaN");
		values[j] = palloc(32);
		if (indexStat.leaf_pages > 0)
			snprintf(values[j++], 32, "%.2f",
					 (double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0);
		else
			snprintf(values[j++], 32, "NaN");

		tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
									   values);

		result = HeapTupleGetDatum(tuple);
	}

	PG_RETURN_DATUM(result);
}
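
The avg_leaf_density figure built above reduces to a single formula. As a standalone helper (a sketch, mirroring the "NaN" output for an index with no usable leaf space):

#include <math.h>

static double
avg_leaf_density(uint64 free_space, uint64 max_avail)
{
	if (max_avail == 0)
		return NAN;				/* matches the "NaN" string above */
	return 100.0 - (double) free_space / (double) max_avail * 100.0;
}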
Example #14
File: indexscan.c  Project: BenjaminYu/gpdb
Datum
readindex(PG_FUNCTION_ARGS)
{
	FuncCallContext	   *funcctx;
	readindexinfo	   *info;

	MIRROREDLOCK_BUFMGR_DECLARE;

	if (SRF_IS_FIRSTCALL())
	{
		Oid		irelid = PG_GETARG_OID(0);
		TupleDesc	tupdesc;
		MemoryContext oldcontext;
		AttrNumber		outattnum;
		Relation	irel;
		TupleDesc	itupdesc;
		int			i;
		AttrNumber	attno;

		irel = index_open(irelid, AccessShareLock);
		itupdesc = RelationGetDescr(irel);
		outattnum = FIXED_COLUMN + itupdesc->natts;

		funcctx = SRF_FIRSTCALL_INIT();
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
		tupdesc = CreateTemplateTupleDesc(outattnum, false);
		attno = 1;
		TupleDescInitEntry(tupdesc, attno++, "ictid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hctid", TIDOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "aotid", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "istatus", TEXTOID, -1, 0);
		TupleDescInitEntry(tupdesc, attno++, "hstatus", TEXTOID, -1, 0);

		for (i = 0; i < itupdesc->natts; i++)
		{
			Form_pg_attribute attr = itupdesc->attrs[i];
			TupleDescInitEntry(tupdesc, attno++, NameStr(attr->attname), attr->atttypid, attr->atttypmod, 0);
		}

		funcctx->tuple_desc = BlessTupleDesc(tupdesc);
		info = (readindexinfo *) palloc(sizeof(readindexinfo));
		funcctx->user_fctx = (void *) info;

		info->outattnum = outattnum;
		info->irel = irel;
		info->hrel = relation_open(irel->rd_index->indrelid, AccessShareLock);
		if (info->hrel->rd_rel != NULL &&
				(info->hrel->rd_rel->relstorage == 'a' ||
				 info->hrel->rd_rel->relstorage == 'c'))
		{
			relation_close(info->hrel, AccessShareLock);
			info->hrel = NULL;
		}
		info->num_pages = RelationGetNumberOfBlocks(irel);
		info->blkno = BTREE_METAPAGE + 1;
		info->page = NULL;

		MemoryContextSwitchTo(oldcontext);
	}

	funcctx = SRF_PERCALL_SETUP();
	info = (readindexinfo *) funcctx->user_fctx;

	while (info->blkno < info->num_pages)
	{
		Datum		values[255];
		bool		nulls[255];
		ItemPointerData		itid;
		HeapTuple	tuple;
		Datum		result;

		if (info->page == NULL)
		{
			MIRROREDLOCK_BUFMGR_LOCK;
			info->buf = ReadBuffer(info->irel, info->blkno);
			info->page = BufferGetPage(info->buf);
			info->opaque = (BTPageOpaque) PageGetSpecialPointer(info->page);
			info->minoff = P_FIRSTDATAKEY(info->opaque);
			info->maxoff = PageGetMaxOffsetNumber(info->page);
			info->offnum = info->minoff;
			MIRROREDLOCK_BUFMGR_UNLOCK;
		}
		if (!P_ISLEAF(info->opaque) || info->offnum > info->maxoff)
		{
			ReleaseBuffer(info->buf);
			info->page = NULL;
			info->blkno++;
			continue;
		}

		MemSet(nulls, false, info->outattnum * sizeof(bool));

		ItemPointerSet(&itid, info->blkno, info->offnum);
		values[0] = ItemPointerGetDatum(&itid);
		readindextuple(info, values, nulls);

		info->offnum = OffsetNumberNext(info->offnum);

		tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
		result = HeapTupleGetDatum(tuple);
		SRF_RETURN_NEXT(funcctx, result);
	}

	if (info->hrel != NULL)
		relation_close(info->hrel, AccessShareLock);
	index_close(info->irel, AccessShareLock);
	SRF_RETURN_DONE(funcctx);
}
Example #15
/*
 *	_bt_endpoint() -- Find the first or last page in the index, and scan
 * from there to the first key satisfying all the quals.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).  Exit conditions are the
 * same as for _bt_first().
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber start;
	BTScanPosItem *currItem;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a simplified
	 * version of _bt_search().  We don't maintain a stack since we know we
	 * won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

	if (!BufferIsValid(buf))
	{
		/*
		 * Empty index. Lock the whole relation, as nothing finer to lock
		 * exists.
		 */
		PredicateLockRelation(rel, scan->xs_snapshot);
		so->currPos.buf = InvalidBuffer;
		return false;
	}

	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(opaque));

	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	/* remember which buffer we have pinned */
	so->currPos.buf = buf;

	/* initialize moreLeft/moreRight appropriately for scan direction */
	if (ScanDirectionIsForward(dir))
	{
		so->currPos.moreLeft = false;
		so->currPos.moreRight = true;
	}
	else
	{
		so->currPos.moreLeft = true;
		so->currPos.moreRight = false;
	}
	so->numKilled = 0;			/* just paranoia */
	so->markItemIndex = -1;		/* ditto */

	/*
	 * Now load data from the first page of the scan.
	 */
	if (!_bt_readpage(scan, dir, start))
	{
		/*
		 * There's no actually-matching data on this page.  Try to advance to
		 * the next page.  Return false if there's no matching data at all.
		 */
		if (!_bt_steppage(scan, dir))
			return false;
	}

	/* Drop the lock, but not pin, on the current page */
	LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_ctup.t_self = currItem->heapTid;
	if (scan->xs_want_itup)
		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

	return true;
}
Example #16
/**
 * @brief Read the left-most leaf page by walking down on index tree structure
 * from root node.
 *
 * Process flow
 * -# Open index file and read meta page
 * -# Get block number of root page
 * -# Read "fast root" page
 * -# Read left child page until reaching left-most leaf page
 *
 * After calling this function, the members of BTReader are the following:
 * - smgr : Smgr relation of the existing index file.
 * - blkno : block number of left-most leaf page. If there is no leaf page,
 *		   InvalidBlockNumber is set.
 * - offnum : InvalidOffsetNumber is set.
 * - page : Left-most leaf page, or undefined if no leaf page.
 *
 * @param reader [in/out] B-Tree index reader
 * @return true iff there are some tuples
 */
static bool
BTReaderInit(BTReader *reader, Relation rel)
{
	BTPageOpaque	metaopaque;
	BTMetaPageData *metad;
	BTPageOpaque	opaque;
	BlockNumber		blkno;

	/*
	 * HACK: We cannot use smgropen because smgrs returned from it
	 * will be closed automatically when we assign a new file node.
	 *
	 * XXX: It might be better to open the previous relfilenode with
	 * smgropen *after* RelationSetNewRelfilenode.
	 */
	memset(&reader->smgr, 0, sizeof(reader->smgr));
#if PG_VERSION_NUM >= 90100
	reader->smgr.smgr_rnode.node = rel->rd_node;
	reader->smgr.smgr_rnode.backend =
		rel->rd_backend == MyBackendId ? MyBackendId : InvalidBackendId;
#else
	reader->smgr.smgr_rnode = rel->rd_node;
#endif
	reader->smgr.smgr_which = 0;	/* md.c */

	reader->blkno = InvalidBlockNumber;
	reader->offnum = InvalidOffsetNumber;
	reader->page = palloc(BLCKSZ);

	/*
	 * Read meta page and check sanity of it.
	 * 
	 * XXX: It might be better to do REINDEX against corrupted indexes
	 * instead of raising errors because we've spent long time for data
	 * loading...
	 */
	BTReaderReadPage(reader, BTREE_METAPAGE);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(reader->page);
	metad = BTPageGetMeta(reader->page);

	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a reader",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d,"
						" code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	if (metad->btm_root == P_NONE)
	{
		/* No root page; we ignore the index in the subsequent build. */
		reader->blkno = InvalidBlockNumber;
		return false;
	}

	/* Go to the fast root page. */
	blkno = metad->btm_fastroot;
	BTReaderReadPage(reader, blkno);
	opaque = (BTPageOpaque) PageGetSpecialPointer(reader->page);

	/* Walk down to the left-most leaf page */
	while (!P_ISLEAF(opaque))
	{
		ItemId		firstid;
		IndexTuple	itup;

		/* Get the block number of the left child */
		firstid = PageGetItemId(reader->page, P_FIRSTDATAKEY(opaque));
		itup = (IndexTuple) PageGetItem(reader->page, firstid);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));

		/* Go down to children */
		for (;;)
		{
			BTReaderReadPage(reader, blkno);
			opaque = (BTPageOpaque) PageGetSpecialPointer(reader->page);

			if (!P_IGNORE(opaque))
				break;

			if (P_RIGHTMOST(opaque))
			{
				/* We reach end of the index without any valid leaves. */
				reader->blkno = InvalidBlockNumber;
				return false;
			}
			blkno = opaque->btpo_next;
		}
	}
	
	return true;
}
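
Once BTReaderInit() has positioned the reader on the left-most leaf, the whole leaf level can be walked through the right-sibling links. An illustrative continuation (BTReaderReadPage() and the BTReader fields are pg_bulkload-specific; shown only to make the flow concrete):

static void
walk_leaf_level(BTReader *reader)
{
	for (;;)
	{
		BTPageOpaque opaque =
			(BTPageOpaque) PageGetSpecialPointer(reader->page);

		/* ... consume items P_FIRSTDATAKEY(opaque) .. max offset ... */

		if (P_RIGHTMOST(opaque))
			break;				/* no right sibling: done */
		BTReaderReadPage(reader, opaque->btpo_next);
	}
}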
Example #17
/*
 *	_bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * The passed scankey must be an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * When nextkey is false (the usual case), we are looking for the first
 * item >= scankey.  When nextkey is true, we are looking for the first
 * item strictly greater than scankey.
 *
 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
 * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
 * particular, this means it is possible to return a value 1 greater than the
 * number of keys on the page, if the scankey is > all keys on the page.)
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey, or last key <= given scankey if nextkey
 * is true.  (Since _bt_compare treats the first data key of such a page as
 * minus infinity, there will be at least one key < scankey, so the result
 * always points at one of the keys on the page.)  This key indicates the
 * right place to descend to be sure we find all leaf keys >= given scankey
 * (or leaf keys > given scankey when nextkey is true).
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.	_bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
OffsetNumber
_bt_binsrch(Relation rel,
			Buffer buf,
			int keysz,
			ScanKey scankey,
			bool nextkey)
{
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high;
	int32		result,
				cmpval;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	low = P_FIRSTDATAKEY(opaque);
	high = PageGetMaxOffsetNumber(page);

	/*
	 * If there are no keys on the page, return the first available slot. Note
	 * this covers two cases: the page is really empty (no keys), or it
	 * contains only a high key.  The latter case is possible after vacuuming.
	 * This can never happen on an internal page, however, since they are
	 * never empty (an internal page must have children).
	 */
	if (high < low)
		return low;

	/*
	 * Binary search to find the first key on the page >= scan key, or first
	 * key > scankey when nextkey is true.
	 *
	 * For nextkey=false (cmpval=1), the loop invariant is: all slots before
	 * 'low' are < scan key, all slots at or after 'high' are >= scan key.
	 *
	 * For nextkey=true (cmpval=0), the loop invariant is: all slots before
	 * 'low' are <= scan key, all slots at or after 'high' are > scan key.
	 *
	 * We can fall out when high == low.
	 */
	high++;						/* establish the loop invariant for high */

	cmpval = nextkey ? 0 : 1;	/* select comparison value */

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, keysz, scankey, page, mid);

		if (result >= cmpval)
			low = mid + 1;
		else
			high = mid;
	}

	/*
	 * At this point we have high == low, but be careful: they could point
	 * past the last slot on the page.
	 *
	 * On a leaf page, we always return the first key >= scan key (resp. >
	 * scan key), which could be the last slot + 1.
	 */
	if (P_ISLEAF(opaque))
		return low;

	/*
	 * On a non-leaf page, return the last key < scan key (resp. <= scan key).
	 * There must be one if _bt_compare() is playing by the rules.
	 */
	Assert(low > P_FIRSTDATAKEY(opaque));

	return OffsetNumberPrev(low);
}
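
The cmpval trick generalizes beyond btree pages. A self-contained analogue on a plain int array: nextkey=false yields lower_bound (first element >= key), nextkey=true yields upper_bound (first element > key).

#include <stdbool.h>

static int
binsrch_analogue(const int *arr, int n, int key, bool nextkey)
{
	int			low = 0;
	int			high = n;		/* one past the last slot */
	int			cmpval = nextkey ? 0 : 1;

	while (high > low)
	{
		int			mid = low + (high - low) / 2;
		/* three-way compare of key vs. arr[mid], like _bt_compare() */
		int			result = (key > arr[mid]) - (key < arr[mid]);

		if (result >= cmpval)
			low = mid + 1;
		else
			high = mid;
	}
	return low;					/* may be n if key is beyond all elements */
}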
Example #18
static Datum
pgstatindex_impl(Relation rel, FunctionCallInfo fcinfo)
{
	Datum		result;
	BlockNumber nblocks;
	BlockNumber blkno;
	BTIndexStat indexStat;
	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);

	if (!IS_INDEX(rel) || !IS_BTREE(rel))
		elog(ERROR, "relation \"%s\" is not a btree index",
			 RelationGetRelationName(rel));

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (RELATION_IS_OTHER_TEMP(rel))
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));

	/*
	 * Read metapage
	 */
	{
		Buffer		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, 0, RBM_NORMAL, bstrategy);
		Page		page = BufferGetPage(buffer);
		BTMetaPageData *metad = BTPageGetMeta(page);

		indexStat.version = metad->btm_version;
		indexStat.level = metad->btm_level;
		indexStat.root_blkno = metad->btm_root;

		ReleaseBuffer(buffer);
	}

	/* -- init counters -- */
	indexStat.internal_pages = 0;
	indexStat.leaf_pages = 0;
	indexStat.empty_pages = 0;
	indexStat.deleted_pages = 0;

	indexStat.max_avail = 0;
	indexStat.free_space = 0;

	indexStat.fragments = 0;

	/*
	 * Scan all blocks except the metapage
	 */
	nblocks = RelationGetNumberOfBlocks(rel);

	for (blkno = 1; blkno < nblocks; blkno++)
	{
		Buffer		buffer;
		Page		page;
		BTPageOpaque opaque;

		CHECK_FOR_INTERRUPTS();

		/* Read and lock buffer */
		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
		LockBuffer(buffer, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buffer);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/* Determine page type, and update totals */

		if (P_ISDELETED(opaque))
			indexStat.deleted_pages++;
		else if (P_IGNORE(opaque))
			indexStat.empty_pages++;	/* this is the "half dead" state */
		else if (P_ISLEAF(opaque))
		{
			int			max_avail;

			max_avail = BLCKSZ - (BLCKSZ - ((PageHeader) page)->pd_special + SizeOfPageHeaderData);
			indexStat.max_avail += max_avail;
			indexStat.free_space += PageGetFreeSpace(page);

			indexStat.leaf_pages++;

			/*
			 * If the next leaf is on an earlier block, the index is
			 * fragmented.
			 */
			if (opaque->btpo_next != P_NONE && opaque->btpo_next < blkno)
				indexStat.fragments++;
		}
		else
			indexStat.internal_pages++;

		/* Unlock and release buffer */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		ReleaseBuffer(buffer);
	}

	relation_close(rel, AccessShareLock);

	/*----------------------------
	 * Build a result tuple
	 *----------------------------
	 */
	{
		TupleDesc	tupleDesc;
		int			j;
		char	   *values[10];
		HeapTuple	tuple;

		/* Build a tuple descriptor for our result type */
		if (get_call_result_type(fcinfo, NULL, &tupleDesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		j = 0;
		values[j++] = psprintf("%d", indexStat.version);
		values[j++] = psprintf("%d", indexStat.level);
		values[j++] = psprintf(INT64_FORMAT,
							   (1 +		/* include the metapage in index_size */
								indexStat.leaf_pages +
								indexStat.internal_pages +
								indexStat.deleted_pages +
								indexStat.empty_pages) * BLCKSZ);
		values[j++] = psprintf("%u", indexStat.root_blkno);
		values[j++] = psprintf(INT64_FORMAT, indexStat.internal_pages);
		values[j++] = psprintf(INT64_FORMAT, indexStat.leaf_pages);
		values[j++] = psprintf(INT64_FORMAT, indexStat.empty_pages);
		values[j++] = psprintf(INT64_FORMAT, indexStat.deleted_pages);
		if (indexStat.max_avail > 0)
			values[j++] = psprintf("%.2f",
								   100.0 - (double) indexStat.free_space / (double) indexStat.max_avail * 100.0);
		else
			values[j++] = pstrdup("NaN");
		if (indexStat.leaf_pages > 0)
			values[j++] = psprintf("%.2f",
								   (double) indexStat.fragments / (double) indexStat.leaf_pages * 100.0);
		else
			values[j++] = pstrdup("NaN");

		tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
									   values);

		result = HeapTupleGetDatum(tuple);
	}

	return result;
}
Example #19
/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * The passed scankey must be an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * When nextkey is false (the usual case), we are looking for the first
 * item >= scankey.  When nextkey is true, we are looking for the first
 * item strictly greater than scankey.
 *
 * Return value is a stack of parent-page pointers.  *bufP is set to the
 * address of the leaf-page buffer, which is read-locked and pinned.
 * No locks are held on the parent pages, however!
 *
 * NOTE that the returned buffer is read-locked regardless of the access
 * parameter.  However, access = BT_WRITE will allow an empty root page
 * to be created and returned.	When access = BT_READ, an empty index
 * will result in *bufP being set to InvalidBuffer.  Also, in BT_WRITE mode,
 * any incomplete splits encountered during the search will be finished.
 */
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
		   Buffer *bufP, int access)
{
	BTStack		stack_in = NULL;

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		IndexTuple	itup;
		BlockNumber blkno;
		BlockNumber par_blkno;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read its
		 * pointer in the parent (or metapage).  If it has, we may need to
		 * move right to its new sibling.  Do that.
		 *
		 * In write-mode, allow _bt_moveright to finish any incomplete splits
		 * along the way.  Strictly speaking, we'd only need to finish an
		 * incomplete split on the leaf page we're about to insert to, not on
		 * any of the upper levels (they are taken care of in _bt_getstackbuf,
		 * if the leaf page is split and we insert to the parent page).  But
		 * this is a good opportunity to finish splits of internal pages too.
		 */
		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
							  (access == BT_WRITE), stack_in,
							  BT_READ);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate item on the internal page, and get the child
		 * page that it points to.
		 */
		offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
		itemid = PageGetItemId(page, offnum);
		itup = (IndexTuple) PageGetItem(page, itemid);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		par_blkno = BufferGetBlockNumber(*bufP);

		/*
		 * We need to save the location of the index entry we chose in the
		 * parent page on a stack. In case we split the tree, we'll use the
		 * stack to work back up to the parent page.  We also save the actual
		 * downlink (TID) to uniquely identify the index entry, in case it
		 * moves right while we're working lower in the tree.  See the paper
		 * by Lehman and Yao for how this is detected and handled. (We use the
		 * child link to disambiguate duplicate keys in the index -- Lehman
		 * and Yao disallow duplicate keys.)
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = par_blkno;
		new_stack->bts_offset = offnum;
		memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
		new_stack->bts_parent = stack_in;

		/* drop the read lock on the parent page, acquire one on the child */
		*bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	return stack_in;
}
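For context, here is a caller-side sketch of how _bt_search() is typically driven. This is a minimal, hypothetical example written for this page, not taken from any of the sources above; it assumes the classic nbtree helpers _bt_mkscankey(), _bt_freestack(), and _bt_freeskey() with their usual signatures from this era.

#include "postgres.h"
#include "access/nbtree.h"
#include "utils/rel.h"

/*
 * Hypothetical usage sketch: build an insertion scankey from an index
 * tuple, descend to the first leaf page it could be on, then clean up.
 */
static void
descend_to_leaf(Relation rel, IndexTuple itup)
{
	ScanKey		itup_scankey = _bt_mkscankey(rel, itup);
	int			natts = RelationGetNumberOfAttributes(rel);
	Buffer		buf;
	BTStack		stack;

	/* leaf comes back read-locked and pinned; stack covers the parents */
	stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_READ);

	if (BufferIsValid(buf))
	{
		/* ... inspect the leaf page here ... */
		_bt_relbuf(rel, buf);	/* release lock and pin on the leaf */
	}

	_bt_freestack(stack);		/* NULL-safe; NULL for an empty index */
	_bt_freeskey(itup_scankey);
}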
Example #20
/*
 *	_bt_endpoint() -- Find the first or last key in the index.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	ItemPointer current;
	OffsetNumber maxoff;
	OffsetNumber start;
	BlockNumber blkno;
	BTItem		btitem;
	IndexTuple	itup;
	BTScanOpaque so;
	bool		res;
	bool		continuescan;

	rel = scan->indexRelation;
	current = &(scan->currentItemData);
	so = (BTScanOpaque) scan->opaque;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a
	 * simplified version of _bt_search().	We don't maintain a stack
	 * since we know we won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

	if (!BufferIsValid(buf))
	{
		/* empty index... */
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		return false;
	}

	blkno = BufferGetBlockNumber(buf);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(opaque));

	maxoff = PageGetMaxOffsetNumber(page);

	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
		if (start < P_FIRSTDATAKEY(opaque))		/* watch out for empty
												 * page */
			start = P_FIRSTDATAKEY(opaque);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	ItemPointerSet(current, blkno, start);
	/* remember which buffer we have pinned */
	so->btso_curbuf = buf;

	/*
	 * The left/rightmost page could be empty due to deletions; if so, step
	 * until we find a nonempty page.
	 */
	if (start > maxoff)
	{
		if (!_bt_step(scan, &buf, dir))
			return false;
		start = ItemPointerGetOffsetNumber(current);
		page = BufferGetPage(buf);
	}

	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
	itup = &(btitem->bti_itup);

	/* see if we picked a winner */
	if (_bt_checkkeys(scan, itup, dir, &continuescan))
	{
		/* yes, return it */
		scan->xs_ctup.t_self = itup->t_tid;
		res = true;
	}
	else if (continuescan)
	{
		/* no, but there might be another one that is */
		res = _bt_next(scan, dir);
	}
	else
	{
		/* no tuples in the index match this scan key */
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		_bt_relbuf(rel, buf);
		res = false;
	}

	return res;
}
Example #21
File: nbtree.c Project: LJoNe/gpdb
/*
 * Post vacuum, iterate over all entries in index, check if the h_tid
 * of each entry exists and is not dead.  For specific system tables,
 * also ensure that the key in index entry matches the corresponding
 * attribute in the heap tuple.
 */
void
_bt_validate_vacuum(Relation irel, Relation hrel, TransactionId oldest_xmin)
{
	MIRROREDLOCK_BUFMGR_DECLARE;

	BlockNumber blkno;
	BlockNumber num_pages;
	Buffer ibuf = InvalidBuffer;
	Buffer hbuf = InvalidBuffer;
	Page ipage;
	BTPageOpaque opaque;
	IndexTuple itup;
	HeapTupleData htup;
	OffsetNumber maxoff,
			minoff,
			offnum;
	Oid ioid,
		hoid;
	bool isnull;

	blkno = BTREE_METAPAGE + 1;
	num_pages = RelationGetNumberOfBlocks(irel);

	elog(LOG, "btvalidatevacuum: index %s, heap %s",
		 RelationGetRelationName(irel), RelationGetRelationName(hrel));

	MIRROREDLOCK_BUFMGR_LOCK;
	for (; blkno < num_pages; blkno++)
	{
		ibuf = ReadBuffer(irel, blkno);
		ipage = BufferGetPage(ibuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(ipage);
		if (!PageIsNew(ipage))
			_bt_checkpage(irel, ibuf);
		if (P_ISLEAF(opaque))
		{
			minoff = P_FIRSTDATAKEY(opaque);
			maxoff = PageGetMaxOffsetNumber(ipage);
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				itup = (IndexTuple) PageGetItem(ipage,
												PageGetItemId(ipage, offnum));
				ItemPointerCopy(&itup->t_tid, &htup.t_self);
				/*
				 * TODO: construct a tid bitmap based on index tids
				 * and fetch heap tids in order afterwards.  That will
				 * also allow validating if a heap tid appears twice
				 * in a unique index.
				 */
				if (!heap_release_fetch(hrel, SnapshotAny, &htup,
										&hbuf, true, NULL))
				{
					elog(ERROR, "btvalidatevacuum: tid (%d,%d) from index %s "
						 "not found in heap %s",
						 ItemPointerGetBlockNumber(&itup->t_tid),
						 ItemPointerGetOffsetNumber(&itup->t_tid),
						 RelationGetRelationName(irel),
						 RelationGetRelationName(hrel));
				}
				switch (HeapTupleSatisfiesVacuum(hrel, htup.t_data, oldest_xmin, hbuf))
				{
					case HEAPTUPLE_RECENTLY_DEAD:
					case HEAPTUPLE_LIVE:
					case HEAPTUPLE_INSERT_IN_PROGRESS:
					case HEAPTUPLE_DELETE_IN_PROGRESS:
						/* these tuples are considered alive by vacuum */
						break;
					case HEAPTUPLE_DEAD:
						elog(ERROR, "btvalidatevacuum: vacuum did not remove "
							 "dead tuple (%d,%d) from heap %s and index %s",
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(hrel),
							 RelationGetRelationName(irel));
						break;
					default:
						elog(ERROR, "btvalidatevacuum: invalid visibility");
						break;
				}
				switch(RelationGetRelid(irel))
				{
					case DatabaseOidIndexId:
					case TypeOidIndexId:
					case ClassOidIndexId:
					case ConstraintOidIndexId:
						hoid = HeapTupleGetOid(&htup);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel), &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%d) != heap oid(%d)"
								 " tuple (%d,%d) index %s", ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					case GpRelationNodeOidIndexId:
						hoid = heap_getattr(&htup, 1, RelationGetDescr(hrel), &isnull);
						ioid = index_getattr(itup, 1, RelationGetDescr(irel), &isnull);
						if (hoid != ioid)
						{
							elog(ERROR,
								 "btvalidatevacuum: index oid(%d) != heap oid(%d)"
								 " tuple (%d,%d) index %s", ioid, hoid,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						int4 hsegno = heap_getattr(&htup, 2, RelationGetDescr(hrel), &isnull);
						int4 isegno = index_getattr(itup, 2, RelationGetDescr(irel), &isnull);
						if (isegno != hsegno)
						{
							elog(ERROR,
								 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
								 " tuple (%d,%d) index %s", isegno, hsegno,
								 ItemPointerGetBlockNumber(&itup->t_tid),
								 ItemPointerGetOffsetNumber(&itup->t_tid),
								 RelationGetRelationName(irel));
						}
						break;
					default:
						break;
				}
				if (RelationGetNamespace(irel) == PG_AOSEGMENT_NAMESPACE)
				{
					int4 isegno = index_getattr(itup, 1, RelationGetDescr(irel), &isnull);
					int4 hsegno = heap_getattr(&htup, 1, RelationGetDescr(hrel), &isnull);
					if (isegno != hsegno)
					{
						elog(ERROR,
							 "btvalidatevacuum: index segno(%d) != heap segno(%d)"
							 " tuple (%d,%d) index %s", isegno, hsegno,
							 ItemPointerGetBlockNumber(&itup->t_tid),
							 ItemPointerGetOffsetNumber(&itup->t_tid),
							 RelationGetRelationName(irel));
					}
				}
			}
		}
		if (BufferIsValid(ibuf))
			ReleaseBuffer(ibuf);
	}
	if (BufferIsValid(hbuf))
		ReleaseBuffer(hbuf);
	MIRROREDLOCK_BUFMGR_UNLOCK;
}
Example #22
/*
 *	_bt_binsrch() -- Do a binary search for a key on a particular page.
 *
 * The scankey we get has the compare function stored in the procedure
 * entry of each data struct.  We invoke this regproc to do the
 * comparison for every key in the scankey.
 *
 * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
 * key >= given scankey.  (NOTE: in particular, this means it is possible
 * to return a value 1 greater than the number of keys on the page,
 * if the scankey is > all keys on the page.)
 *
 * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
 * of the last key < given scankey.  (Since _bt_compare treats the first
 * data key of such a page as minus infinity, there will be at least one
 * key < scankey, so the result always points at one of the keys on the
 * page.)  This key indicates the right place to descend to be sure we
 * find all leaf keys >= given scankey.
 *
 * This procedure is not responsible for walking right, it just examines
 * the given page.	_bt_binsrch() has no lock or refcount side effects
 * on the buffer.
 */
OffsetNumber
_bt_binsrch(Relation rel,
			Buffer buf,
			int keysz,
			ScanKey scankey)
{
	TupleDesc	itupdesc;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber low,
				high;
	int32		result;

	itupdesc = RelationGetDescr(rel);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	low = P_FIRSTDATAKEY(opaque);
	high = PageGetMaxOffsetNumber(page);

	/*
	 * If there are no keys on the page, return the first available slot.
	 * Note this covers two cases: the page is really empty (no keys), or
	 * it contains only a high key.  The latter case is possible after
	 * vacuuming.  This can never happen on an internal page, however,
	 * since they are never empty (an internal page must have children).
	 */
	if (high < low)
		return low;

	/*
	 * Binary search to find the first key on the page >= scan key. Loop
	 * invariant: all slots before 'low' are < scan key, all slots at or
	 * after 'high' are >= scan key.  We can fall out when high == low.
	 */
	high++;						/* establish the loop invariant for high */
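	/*
	 * Worked example (illustrative comment, not in the original source):
	 * on a rightmost leaf page holding keys (10, 20, 20, 30) at offsets
	 * 1..4, with scankey = 20 we start at low = 1, high = 5; the loop
	 * narrows to low == high == 2, the offset of the first key >= 20.
	 */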

	while (high > low)
	{
		OffsetNumber mid = low + ((high - low) / 2);

		/* We have low <= mid < high, so mid points at a real slot */

		result = _bt_compare(rel, keysz, scankey, page, mid);

		if (result > 0)
			low = mid + 1;
		else
			high = mid;
	}

	/*
	 * At this point we have high == low, but be careful: they could point
	 * past the last slot on the page.
	 *
	 * On a leaf page, we always return the first key >= scan key (which
	 * could be the last slot + 1).
	 */
	if (P_ISLEAF(opaque))
		return low;

	/*
	 * On a non-leaf page, return the last key < scan key. There must be
	 * one if _bt_compare() is playing by the rules.
	 */
	Assert(low > P_FIRSTDATAKEY(opaque));

	return OffsetNumberPrev(low);
}
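The loop invariant is easier to see in isolation. Below is a self-contained lower-bound sketch in plain C, written for this page rather than taken from the source, that mirrors the search above:

#include <stddef.h>

/*
 * lower_bound: index of the first element >= key, or n when every
 * element is smaller.  Invariant, as above: all slots before 'low' are
 * < key, all slots at or after 'high' are >= key.
 */
static size_t
lower_bound(const int *arr, size_t n, int key)
{
	size_t		low = 0;
	size_t		high = n;		/* one past the last slot */

	while (high > low)
	{
		size_t		mid = low + (high - low) / 2;	/* overflow-safe midpoint */

		if (arr[mid] < key)		/* corresponds to _bt_compare() > 0 */
			low = mid + 1;
		else
			high = mid;
	}
	return low;					/* low == high */
}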
Example #23
/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we
 * must go back and re-examine previously-scanned pages; this routine
 * recurses when necessary to handle that case.
 *
 * blkno is the page to process.  orig_blkno is the highest block number
 * reached by the outer btvacuumscan loop (the same as blkno, unless we
 * are recursing to re-examine a previous page).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	bool		delete_now;
	BlockNumber recurse_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque = NULL;

restart:
	delete_now = false;
	recurse_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page. We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	LockBuffer(buf, BT_READ);
	page = BufferGetPage(buf);
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	/*
	 * If we are recursing, the only case we want to do anything with is a
	 * live leaf page having the current vacuum cycle ID.  Any other state
	 * implies we already saw the page (eg, deleted it as being empty).
	 */
	if (blkno != orig_blkno)
	{
		if (_bt_page_recyclable(page) ||
			P_IGNORE(opaque) ||
			!P_ISLEAF(opaque) ||
			opaque->btpo_cycleid != vstate->cycleid)
		{
			_bt_relbuf(rel, buf);
			return;
		}
	}

	/* Page is valid, see what to do with it */
	if (_bt_page_recyclable(page))
	{
		/* Okay to recycle this page */
		RecordFreeIndexPage(rel, blkno);
		vstate->totFreePages++;
		stats->pages_deleted++;
	}
	else if (P_ISDELETED(opaque))
	{
		/* Already deleted, but can't recycle yet */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead, try to delete */
		delete_now = true;
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxOffsetNumber];
		int			ndeletable;
		OffsetNumber offnum,
					minoff,
					maxoff;

		/*
		 * Trade in the initial read lock for a super-exclusive write lock on
		 * this page.  We must get such a lock on every leaf page over the
		 * course of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		LockBufferForCleanup(buf);

		/*
		 * Remember highest leaf page number we've taken cleanup lock on; see
		 * notes in btvacuumscan
		 */
		if (blkno > vstate->lastBlockLocked)
			vstate->lastBlockLocked = blkno;

		/*
		 * Check whether we need to recurse back to earlier pages.  What we
		 * are concerned about is a page split that happened since we started
		 * the vacuum scan.  If the split moved some tuples to a lower page
		 * then we might have missed 'em.  If so, set up for tail recursion.
		 * (Must do this before possibly clearing btpo_cycleid below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < orig_blkno)
			recurse_to = opaque->btpo_next;

		/*
		 * Scan over all items to see which ones need to be deleted according
		 * to the callback function.
		 */
		ndeletable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		if (callback)
		{
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));
				htup = &(itup->t_tid);

				/*
				 * During Hot Standby we currently assume that
				 * XLOG_BTREE_VACUUM records do not produce conflicts. That is
				 * only true as long as the callback function depends only
				 * upon whether the index tuple refers to heap tuples removed
				 * in the initial heap scan. When vacuum starts it derives a
				 * value of OldestXmin. Backends taking later snapshots could
				 * have a RecentGlobalXmin with a later xid than the vacuum's
				 * OldestXmin, so it is possible that row versions deleted
				 * after OldestXmin could be marked as killed by other
				 * backends. The callback function *could* look at the index
				 * tuple state in isolation and decide to delete the index
				 * tuple, though currently it does not. If it ever did, we
				 * would need to reconsider whether XLOG_BTREE_VACUUM records
				 * should cause conflicts. If they did cause conflicts they
				 * would be fairly harsh conflicts, since we haven't yet
				 * worked out a way to pass a useful value for
				 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
				 * applies to *any* type of index that marks index tuples as
				 * killed.
				 */
				if (callback(htup, callback_state))
					deletable[ndeletable++] = offnum;
			}
		}

		/*
		 * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
		 * call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0)
		{
			/*
			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes
			 * all the information needed by the replay code to get a cleanup
			 * lock on all pages between the previous lastBlockVacuumed and
			 * this page. This ensures that WAL replay locks all leaf pages at
			 * some point, which is important should non-MVCC scans be
			 * requested. This is currently unused on standby, but we record
			 * it anyway, so that the WAL contains the required information.
			 *
			 * Since we can visit leaf pages out-of-order when recursing,
			 * replay might end up locking such pages an extra time, but it
			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
			 * that.
			 */
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
								vstate->lastBlockVacuumed);

			/*
			 * Remember highest leaf page number we've issued a
			 * XLOG_BTREE_VACUUM WAL record for.
			 */
			if (blkno > vstate->lastBlockVacuumed)
				vstate->lastBlockVacuumed = blkno;

			stats->tuples_removed += ndeletable;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);
		}
		else
		{
			/*
			 * If the page has been split during this vacuum cycle, it seems
			 * worth expending a write to clear btpo_cycleid even if we don't
			 * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
			 * care of this.)  This ensures we won't process the page again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If it's now empty, try to delete; else count the live tuples. We
		 * don't delete when recursing, though, to avoid putting entries into
		 * freePages out-of-order (doesn't seem worth any extra code to handle
		 * the case).
		 */
		if (minoff > maxoff)
			delete_now = (blkno == orig_blkno);
		else
			stats->num_index_tuples += maxoff - minoff + 1;
	}

	if (delete_now)
	{
		MemoryContext oldcontext;
		int			ndel;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		ndel = _bt_pagedel(rel, buf);

		/* count only this page, else may double-count parent */
		if (ndel)
			stats->pages_deleted++;

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	/*
	 * This is really tail recursion, but if the compiler is too stupid to
	 * optimize it as such, we'd eat an uncomfortably large amount of stack
	 * space per recursion level (due to the deletable[] array). A failure is
	 * improbable since the number of levels isn't likely to be large ... but
	 * just in case, let's hand-optimize into a loop.
	 */
	if (recurse_to != P_NONE)
	{
		blkno = recurse_to;
		goto restart;
	}
}
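The callback consulted above answers, per index tuple, whether its heap TID was removed by the preceding heap pass. A minimal callback with the IndexBulkDeleteCallback shape might probe a sorted array of dead TIDs, as in the hypothetical sketch below (the DeadTidState struct and dead_tid_callback name are inventions for illustration; the real callback and its state live in the heap vacuum code):

#include "postgres.h"
#include "storage/itemptr.h"

/* Hypothetical vacuum state: a sorted array of dead heap TIDs. */
typedef struct DeadTidState
{
	ItemPointerData *dead_tids;	/* sorted ascending by ItemPointerCompare */
	int			num_dead;
} DeadTidState;

/*
 * Shaped like IndexBulkDeleteCallback: return true iff the index tuple's
 * heap TID was removed by the preceding heap pass.
 */
static bool
dead_tid_callback(ItemPointer itemptr, void *state)
{
	DeadTidState *dt = (DeadTidState *) state;
	int			lo = 0;
	int			hi = dt->num_dead - 1;

	while (lo <= hi)
	{
		int			mid = lo + (hi - lo) / 2;
		int32		cmp = ItemPointerCompare(itemptr, &dt->dead_tids[mid]);

		if (cmp == 0)
			return true;		/* dead: index entry should be deleted */
		if (cmp < 0)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return false;				/* still live: keep the index entry */
}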
Example #24
File: index.c Project: fdr/pg_check
/* checks the individual attributes of the tuple */
uint32 check_index_tuple_attributes(Relation rel, PageHeader header, int block, int i, char *buffer) {
  
	IndexTuple tuple;
	uint32 nerrs = 0;
	int j, off;
	
	bits8 * bitmap;
	BTPageOpaque opaque;
	
	ereport(DEBUG2,(errmsg("[%d:%d] checking attributes for the tuple", block, i)));
	
	/* get the index tuple and info about the page */
	tuple = (IndexTuple)(buffer + header->pd_linp[i].lp_off);
	opaque = (BTPageOpaque)(buffer + header->pd_special);
	
	/* current attribute offset - always starts at (buffer + off) */
	off = header->pd_linp[i].lp_off + IndexInfoFindDataOffset(tuple->t_info);
	
	ereport(DEBUG3,(errmsg("[%d:%d] tuple has %d attributes", block, (i+1),
						   RelationGetNumberOfAttributes(rel))));
	
	bitmap = (bits8*)(buffer + header->pd_linp[i].lp_off + sizeof(IndexTupleData));
	
	/* TODO This is mostly copy'n'paste from check_heap_tuple_attributes,
	   so maybe it could be refactored to share the code. */

	/* The leftmost tuple on a non-leaf page actually carries no data
	   (see src/backend/access/nbtree/README, last paragraph in section "Notes
	   About Data Representation").
   
	   Use P_LEFTMOST/P_ISLEAF to identify such cases (for the leftmost item only)
	   and set len = 0.
	*/
	
	if (P_LEFTMOST(opaque) && (! P_ISLEAF(opaque)) && (i == 0)) {
		ereport(DEBUG3, (errmsg("[%d:%d] leftmost tuple on non-leaf block => no data, skipping", block, i)));
		return nerrs;
	}
	  
	/* check all the index attributes */
	for (j = 0; j < rel->rd_att->natts; j++) {
		
		/* default length of the attribute */
		int len = rel->rd_att->attrs[j]->attlen;
		
		/* copy from src/backend/commands/analyze.c */
		bool is_varlena  = (!rel->rd_att->attrs[j]->attbyval && len == -1);
		bool is_varwidth = (!rel->rd_att->attrs[j]->attbyval && len < 0); /* thus it's "len = -2" */
		
		/* if the attribute is marked as NULL (in the tuple header), skip to the next attribute */
		if (IndexTupleHasNulls(tuple) && att_isnull(j, bitmap)) {
			ereport(DEBUG3, (errmsg("[%d:%d] attribute '%s' is NULL (skipping)", block, (i+1), rel->rd_att->attrs[j]->attname.data)));
			continue;
		}

		/* fix the alignment (see src/include/access/tupmacs.h) */
		off = att_align_pointer(off, rel->rd_att->attrs[j]->attalign, rel->rd_att->attrs[j]->attlen, buffer+off);
		
		if (is_varlena) { 
		
			/*
			  other interesting macros (see postgres.h) - should do something about those ...
			  
			  VARATT_IS_COMPRESSED(PTR)			VARATT_IS_4B_C(PTR)
			  VARATT_IS_EXTERNAL(PTR)				VARATT_IS_1B_E(PTR)
			  VARATT_IS_SHORT(PTR)				VARATT_IS_1B(PTR)
			  VARATT_IS_EXTENDED(PTR)				(!VARATT_IS_4B_U(PTR))
			*/
			
			len = VARSIZE_ANY(buffer + off);
			
			if (len < 0) {
				ereport(WARNING, (errmsg("[%d:%d] attribute '%s' has negative length < 0 (%d)", block, (i+1), rel->rd_att->attrs[j]->attname.data, len)));
				++nerrs;
				break;
			}
			
			if (VARATT_IS_COMPRESSED(buffer + off)) {
				/* the raw length should be less than 1G (and positive) */
				if ((VARRAWSIZE_4B_C(buffer + off) < 0) || (VARRAWSIZE_4B_C(buffer + off) > 1024*1024*1024)) {
					ereport(WARNING, (errmsg("[%d:%d]  attribute '%s' has invalid length %d (should be between 0 and 1G)", block, (i+1), rel->rd_att->attrs[j]->attname.data, VARRAWSIZE_4B_C(buffer + off))));
					++nerrs;
					/* no break here, this does not break the page structure - we may check the other attributes */
				}
			}
				
			/* FIXME Check if the varlena value may be detoasted. */
			
		} else if (is_varwidth) {
		
			/* get the C-string length (at most to the end of tuple), +1 as it does not include '\0' at the end */
			/* if the string is not properly terminated, then this returns 'remaining space + 1' so it's detected */
			len = strnlen(buffer + off, header->pd_linp[i].lp_off + len + header->pd_linp[i].lp_len - off) + 1;
			
		}
			
		/* Check if the length makes sense (is not negative and does not overflow
		 * the tuple end); if it doesn't, stop validating the remaining attributes
		 * (we don't know where to continue anyway). */
		
		if (off + len > (header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)) {
			ereport(WARNING, (errmsg("[%d:%d] attribute '%s' (off=%d len=%d) overflows tuple end (off=%d, len=%d)", block, (i+1), rel->rd_att->attrs[j]->attname.data, off, len, header->pd_linp[i].lp_off, header->pd_linp[i].lp_len)));
			++nerrs;
			break;
		}
		
		/* skip to the next attribute */
		off += len;
		
		ereport(DEBUG3,(errmsg("[%d:%d] attribute '%s' len=%d", block, (i+1), rel->rd_att->attrs[j]->attname.data, len)));
		
	}
	
	ereport(DEBUG3,(errmsg("[%d:%d] last attribute ends at %d, tuple ends at %d", block, (i+1), off, header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)));
	
	/* after the last attribute, the offset should be exactly the same as the end of the tuple */
	if (MAXALIGN(off) != header->pd_linp[i].lp_off + header->pd_linp[i].lp_len) {
		ereport(WARNING, (errmsg("[%d:%d] the last attribute ends at %d but the tuple ends at %d", block, (i+1), off, header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)));
		++nerrs;
	}
	
	return nerrs;

}
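A possible driver for the checker above could loop over the line pointers of one page, as in the following sketch (hypothetical, not part of pg_check; it assumes the page has already been read into 'buffer' and relies on the 0-based line-pointer indexing that the pd_linp[i] accesses above imply):

#include "postgres.h"
#include "storage/bufpage.h"
#include "storage/itemid.h"
#include "utils/rel.h"

/*
 * Hypothetical driver: run the attribute checks over every used line
 * pointer of one index page.
 */
static uint32
check_index_page_attributes(Relation rel, int block, char *buffer)
{
	PageHeader	header = (PageHeader) buffer;
	OffsetNumber maxoff = PageGetMaxOffsetNumber((Page) buffer);
	OffsetNumber off;
	uint32		nerrs = 0;

	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		ItemId		lp = &header->pd_linp[off - 1];

		if (!ItemIdIsUsed(lp))
			continue;			/* nothing stored at this slot */

		nerrs += check_index_tuple_attributes(rel, header, block,
											  off - 1, buffer);
	}

	return nerrs;
}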