Esempio n. 1
0
/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 *		When we follow a pointer to reach a page, it is possible that
 *		the page has changed in the meanwhile.	If this happens, we're
 *		guaranteed that the page has "split right" -- that is, that any
 *		data that appeared on the page originally is either on the page
 *		or strictly to the right of it.
 *
 *		This routine decides whether or not we need to move right in the
 *		tree by examining the high key entry on the page.  If that entry
 *		is strictly less than one we expect to be on the page, then our
 *		picture of the page is incorrect and we need to move right.
 *
 *		On entry, we have the buffer pinned and a lock of the proper type.
 *		If we move right, we release the buffer and lock and acquire the
 *		same on the right sibling.	Return value is the buffer we stop at.
 */
Buffer
_bt_moveright(Relation rel,
			  Buffer buf,
			  int keysz,
			  ScanKey scankey,
			  int access)
{
	Page		page;
	BTPageOpaque opaque;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * If the scan key that brought us to this page is > the high key
	 * stored on the page, then the page has split and we need to move
	 * right.  (If the scan key is equal to the high key, we might or
	 * might not need to move right; have to scan the page first anyway.)
	 * It could even have split more than once, so scan as far as needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	while (!P_RIGHTMOST(opaque) &&
		   (P_IGNORE(opaque) ||
			_bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0))
	{
		/* step right one page */
		BlockNumber rblkno = opaque->btpo_next;

		_bt_relbuf(rel, buf);
		buf = _bt_getbuf(rel, rblkno, access);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}
Esempio n. 2
0
File: nbtpage.c Progetto: 50wu/gpdb
/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.  The
 *		standard class of race conditions exists here; I think I covered
 *		them all in the Hopi Indian rain dance of lock requests below.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.	For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		Also, if the root page is split while we are "in flight" to it,
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
 *		insist on finding the true root.  We do, however, guarantee to
 *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
	Buffer		metabuf;
	Page		metapg;
	BTPageOpaque metaopaque;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	uint32		rootlevel;
	BTMetaPageData *metad;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	/*
	 * Try to use previously-cached metapage data to find the root.  This
	 * normally saves one buffer access per index search, which is a very
	 * helpful savings in bufmgr traffic and hence contention.
	 */
	if (rel->rd_amcache != NULL)
	{
		metad = (BTMetaPageData *) rel->rd_amcache;
		/* We shouldn't have cached it if any of these fail */
		Assert(metad->btm_magic == BTREE_MAGIC);
		Assert(metad->btm_version == BTREE_VERSION);
		Assert(metad->btm_root != P_NONE);

		rootblkno = metad->btm_fastroot;
		Assert(rootblkno != P_NONE);
		rootlevel = metad->btm_fastlevel;

		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

		/*
		 * Since the cache might be stale, we check the page more carefully
		 * here than normal.  We *must* check that it's not deleted. If it's
		 * not alone on its level, then we reject too --- this may be overly
		 * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
		 * because that's not set in a "fast root".
		 */
		if (!P_IGNORE(rootopaque) &&
			rootopaque->btpo.level == rootlevel &&
			P_LEFTMOST(rootopaque) &&
			P_RIGHTMOST(rootopaque))
		{
			/* OK, accept cached page as the root */
			return rootbuf;
		}
		_bt_relbuf(rel, rootbuf);
		/* Cache is stale, throw it away */
		if (rel->rd_amcache)
			pfree(rel->rd_amcache);
		rel->rd_amcache = NULL;
	}

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	metad = BTPageGetMeta(metapg);

	/* sanity-check the metapage */
	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	/* if no root page initialized yet, do it */
	if (metad->btm_root == P_NONE)
	{
		/* If access = BT_READ, caller doesn't want us to create root yet */
		if (access == BT_READ)
		{
			_bt_relbuf(rel, metabuf);
			return InvalidBuffer;
		}

		// Fetch gp_persistent_relation_node information that will be added to XLOG record.
		RelationFetchGpRelationNodeForXLog(rel);
		
		/* trade in our read lock for a write lock */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(metabuf, BT_WRITE);

		/*
		 * Race condition:	if someone else initialized the metadata between
		 * the time we released the read lock and acquired the write lock, we
		 * must avoid doing it again.
		 */
		if (metad->btm_root != P_NONE)
		{
			/*
			 * Metadata initialized by someone else.  In order to guarantee no
			 * deadlocks, we have to release the metadata page and start all
			 * over again.	(Is that really true? But it's hardly worth trying
			 * to optimize this case.)
			 */
			_bt_relbuf(rel, metabuf);
			return _bt_getroot(rel, access);
		}

		/*
		 * Get, initialize, write, and leave a lock of the appropriate type on
		 * the new root page.  Since this is the first page in the tree, it's
		 * a leaf as well as the root.
		 */
		rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
		rootblkno = BufferGetBlockNumber(rootbuf);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
		rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
		rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
		rootopaque->btpo.level = 0;
		rootopaque->btpo_cycleid = 0;

		/* NO ELOG(ERROR) till meta is updated */
		START_CRIT_SECTION();

		metad->btm_root = rootblkno;
		metad->btm_level = 0;
		metad->btm_fastroot = rootblkno;
		metad->btm_fastlevel = 0;

		MarkBufferDirty(rootbuf);
		MarkBufferDirty(metabuf);

		/* XLOG stuff */
		if (!rel->rd_istemp)
		{
			xl_btree_newroot xlrec;
			XLogRecPtr	recptr;
			XLogRecData rdata;

			xl_btreenode_set(&(xlrec.btreenode), rel);
			xlrec.rootblk = rootblkno;
			xlrec.level = 0;

			rdata.data = (char *) &xlrec;
			rdata.len = SizeOfBtreeNewroot;
			rdata.buffer = InvalidBuffer;
			rdata.next = NULL;

			recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);

			PageSetLSN(rootpage, recptr);
			PageSetTLI(rootpage, ThisTimeLineID);
			PageSetLSN(metapg, recptr);
			PageSetTLI(metapg, ThisTimeLineID);
		}

		END_CRIT_SECTION();

		/*
		 * Send out relcache inval for metapage change (probably unnecessary
		 * here, but let's be safe).
		 */
		CacheInvalidateRelcache(rel);

		/*
		 * swap root write lock for read lock.	There is no danger of anyone
		 * else accessing the new root page while it's unlocked, since no one
		 * else knows where it is yet.
		 */
		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(rootbuf, BT_READ);

		/* okay, metadata is correct, release lock on it */
		_bt_relbuf(rel, metabuf);
	}
	else
	{
		rootblkno = metad->btm_fastroot;
		Assert(rootblkno != P_NONE);
		rootlevel = metad->btm_fastlevel;

		/*
		 * Cache the metapage data for next time
		 */
		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
											 sizeof(BTMetaPageData));
		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

		/*
		 * We are done with the metapage; arrange to release it via first
		 * _bt_relandgetbuf call
		 */
		rootbuf = metabuf;

		for (;;)
		{
			rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
			rootpage = BufferGetPage(rootbuf);
			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

			if (!P_IGNORE(rootopaque))
				break;

			/* it's dead, Jim.  step right one page */
			if (P_RIGHTMOST(rootopaque))
				elog(ERROR, "no live root page found in index \"%s\"",
					 RelationGetRelationName(rel));
			rootblkno = rootopaque->btpo_next;
		}

		/* Note: can't check btpo.level on deleted pages */
		if (rootopaque->btpo.level != rootlevel)
			elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
				 rootblkno, RelationGetRelationName(rel),
				 rootopaque->btpo.level, rootlevel);
	}

	/*
	 * By here, we have a pin and read lock on the root page, and no lock set
	 * on the metadata page.  Return the root page's buffer.
	 */
	return rootbuf;
}
Esempio n. 3
0
File: nbtpage.c Progetto: 50wu/gpdb
/*
 * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and locked (either read or write
 * lock is OK).  This lock and pin will be dropped before exiting.
 *
 * The "stack" argument can be a search stack leading (approximately) to the
 * target page, or NULL --- outside callers typically pass NULL since they
 * have not done such a search, but internal recursion cases pass the stack
 * to avoid duplicated search effort.
 *
 * Returns the number of pages successfully deleted (zero if page cannot
 * be deleted now; could be more than one if parent pages were deleted too).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf, BTStack stack, bool vacuum_full)
{
	int			result;
	BlockNumber target,
				leftsib,
				rightsib,
				parent;
	OffsetNumber poffset,
				maxoff;
	uint32		targetlevel,
				ilevel;
	ItemId		itemid;
	IndexTuple	targetkey,
				itup;
	ScanKey		itup_scankey;
	Buffer		lbuf,
				rbuf,
				pbuf;
	bool		parent_half_dead;
	bool		parent_one_child;
	bool		rightsib_empty;
	Buffer		metabuf = InvalidBuffer;
	Page		metapg = NULL;
	BTMetaPageData *metad = NULL;
	Page		page;
	BTPageOpaque opaque;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	// Fetch gp_persistent_relation_node information that will be added to XLOG record.
	RelationFetchGpRelationNodeForXLog(rel);

	/*
	 * We can never delete rightmost pages nor root pages.	While at it, check
	 * that page is not already deleted and is empty.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		/* Should never fail to delete a half-dead page */
		Assert(!P_ISHALFDEAD(opaque));

		_bt_relbuf(rel, buf);
		return 0;
	}

	/*
	 * Save info about page, including a copy of its high key (it must have
	 * one, being non-rightmost).
	 */
	target = BufferGetBlockNumber(buf);
	targetlevel = opaque->btpo.level;
	leftsib = opaque->btpo_prev;
	itemid = PageGetItemId(page, P_HIKEY);
	targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

	/*
	 * To avoid deadlocks, we'd better drop the target page lock before going
	 * further.
	 */
	_bt_relbuf(rel, buf);

	/*
	 * We need an approximate pointer to the page's parent page.  We use the
	 * standard search mechanism to search for the page's high key; this will
	 * give us a link to either the current parent or someplace to its left
	 * (if there are multiple equal high keys).  In recursion cases, the
	 * caller already generated a search stack and we can just re-use that
	 * work.
	 */
	if (stack == NULL)
	{
		if (!InRecovery)
		{
			/* we need an insertion scan key to do our search, so build one */
			itup_scankey = _bt_mkscankey(rel, targetkey);
			/* find the leftmost leaf page containing this key */
			stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
							   &lbuf, BT_READ);
			/* don't need a pin on that either */
			_bt_relbuf(rel, lbuf);

			/*
			 * If we are trying to delete an interior page, _bt_search did
			 * more than we needed.  Locate the stack item pointing to our
			 * parent level.
			 */
			ilevel = 0;
			for (;;)
			{
				if (stack == NULL)
					elog(ERROR, "not enough stack items");
				if (ilevel == targetlevel)
					break;
				stack = stack->bts_parent;
				ilevel++;
			}
		}
		else
		{
			/*
			 * During WAL recovery, we can't use _bt_search (for one reason,
			 * it might invoke user-defined comparison functions that expect
			 * facilities not available in recovery mode).	Instead, just set
			 * up a dummy stack pointing to the left end of the parent tree
			 * level, from which _bt_getstackbuf will walk right to the parent
			 * page.  Painful, but we don't care too much about performance in
			 * this scenario.
			 */
			pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
			stack = (BTStack) palloc(sizeof(BTStackData));
			stack->bts_blkno = BufferGetBlockNumber(pbuf);
			stack->bts_offset = InvalidOffsetNumber;
			/* bts_btentry will be initialized below */
			stack->bts_parent = NULL;
			_bt_relbuf(rel, pbuf);
		}
	}

	/*
	 * We cannot delete a page that is the rightmost child of its immediate
	 * parent, unless it is the only child --- in which case the parent has to
	 * be deleted too, and the same condition applies recursively to it. We
	 * have to check this condition all the way up before trying to delete. We
	 * don't need to re-test when deleting a non-leaf page, though.
	 */
	if (targetlevel == 0 &&
		!_bt_parent_deletion_safe(rel, target, stack))
		return 0;

	/*
	 * We have to lock the pages we need to modify in the standard order:
	 * moving right, then up.  Else we will deadlock against other writers.
	 *
	 * So, we need to find and write-lock the current left sibling of the
	 * target page.  The sibling that was current a moment ago could have
	 * split, so we may have to move right.  This search could fail if either
	 * the sibling or the target page was deleted by someone else meanwhile;
	 * if so, give up.	(Right now, that should never happen, since page
	 * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
	 * concurrently on the same table.)
	 */
	if (leftsib != P_NONE)
	{
		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		while (P_ISDELETED(opaque) || opaque->btpo_next != target)
		{
			/* step right one page */
			leftsib = opaque->btpo_next;
			_bt_relbuf(rel, lbuf);
			if (leftsib == P_NONE)
			{
				elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
					 RelationGetRelationName(rel));
				return 0;
			}
			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
			page = BufferGetPage(lbuf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}
	}
	else
		lbuf = InvalidBuffer;

	/*
	 * Next write-lock the target page itself.	It should be okay to take just
	 * a write lock not a superexclusive lock, since no scans would stop on an
	 * empty page.
	 */
	buf = _bt_getbuf(rel, target, BT_WRITE);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Check page is still empty etc, else abandon deletion.  The empty check
	 * is necessary since someone else might have inserted into it while we
	 * didn't have it locked; the others are just for paranoia's sake.
	 */
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		_bt_relbuf(rel, buf);
		if (BufferIsValid(lbuf))
			_bt_relbuf(rel, lbuf);
		return 0;
	}
	if (opaque->btpo_prev != leftsib)
		elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
			 target, RelationGetRelationName(rel));

	/*
	 * And next write-lock the (current) right sibling.
	 */
	rightsib = opaque->btpo_next;
	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (opaque->btpo_prev != target)
		elog(ERROR, "right sibling's left-link doesn't match: "
			 "block %u links to %u instead of expected %u in index \"%s\"",
			 rightsib, opaque->btpo_prev, target,
			 RelationGetRelationName(rel));

	/*
	 * Next find and write-lock the current parent of the target page. This is
	 * essentially the same as the corresponding step of splitting.
	 */
	ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
	pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
	if (pbuf == InvalidBuffer)
		elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
			 RelationGetRelationName(rel), target);
	parent = stack->bts_blkno;
	poffset = stack->bts_offset;

	/*
	 * If the target is the rightmost child of its parent, then we can't
	 * delete, unless it's also the only child --- in which case the parent
	 * changes to half-dead status.  The "can't delete" case should have been
	 * detected by _bt_parent_deletion_safe, so complain if we see it now.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);
	parent_half_dead = false;
	parent_one_child = false;
	if (poffset >= maxoff)
	{
		if (poffset == P_FIRSTDATAKEY(opaque))
			parent_half_dead = true;
		else
			elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
				 target, parent, RelationGetRelationName(rel));
	}
	else
	{
		/* Will there be exactly one child left in this parent? */
		if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
			parent_one_child = true;
	}

	/*
	 * If we are deleting the next-to-last page on the target's level, then
	 * the rightsib is a candidate to become the new fast root. (In theory, it
	 * might be possible to push the fast root even further down, but the odds
	 * of doing so are slim, and the locking considerations daunting.)
	 *
	 * We don't support handling this in the case where the parent is becoming
	 * half-dead, even though it theoretically could occur.
	 *
	 * We can safely acquire a lock on the metapage here --- see comments for
	 * _bt_newroot().
	 */
	if (leftsib == P_NONE && !parent_half_dead)
	{
		page = BufferGetPage(rbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo.level == targetlevel);
		if (P_RIGHTMOST(opaque))
		{
			/* rightsib will be the only one left on the level */
			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			/*
			 * The expected case here is btm_fastlevel == targetlevel+1; if
			 * the fastlevel is <= targetlevel, something is wrong, and we
			 * choose to overwrite it to fix it.
			 */
			if (metad->btm_fastlevel > targetlevel + 1)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}
	}

	/*
	 * Check that the parent-page index items we're about to delete/overwrite
	 * contain what we expect.  This can fail if the index has become
	 * corrupt for some reason.  We want to throw any error before entering
	 * the critical section --- otherwise it'd be a PANIC.
	 *
	 * The test on the target item is just an Assert because _bt_getstackbuf
	 * should have guaranteed it has the expected contents.  The test on the
	 * next-child downlink is known to sometimes fail in the field, though.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

#ifdef USE_ASSERT_CHECKING
	itemid = PageGetItemId(page, poffset);
	itup = (IndexTuple) PageGetItem(page, itemid);
	Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
#endif

	if (!parent_half_dead)
	{
		OffsetNumber nextoffset;

		nextoffset = OffsetNumberNext(poffset);
		itemid = PageGetItemId(page, nextoffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
			elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
				 rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
				 parent, RelationGetRelationName(rel));
	}

	/*
	 * Here we begin doing the deletion.
	 */

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/*
	 * Update parent.  The normal case is a tad tricky because we want to
	 * delete the target's downlink and the *following* key.  Easiest way is
	 * to copy the right sibling's downlink over the target downlink, and then
	 * delete the following item.
	 */
	if (parent_half_dead)
	{
		PageIndexTupleDelete(page, poffset);
		opaque->btpo_flags |= BTP_HALF_DEAD;
	}
	else
	{
		OffsetNumber nextoffset;

		itemid = PageGetItemId(page, poffset);
		itup = (IndexTuple) PageGetItem(page, itemid);
		ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);

		nextoffset = OffsetNumberNext(poffset);
		PageIndexTupleDelete(page, nextoffset);
	}

	/*
	 * Update siblings' side-links.  Note the target page's side-links will
	 * continue to point to the siblings.  Asserts here are just rechecking
	 * things we already verified above.
	 */
	if (BufferIsValid(lbuf))
	{
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo_next == target);
		opaque->btpo_next = rightsib;
	}
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->btpo_prev == target);
	opaque->btpo_prev = leftsib;
	rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

	/*
	 * Mark the page itself deleted.  It can be recycled when all current
	 * transactions are gone; or immediately if we're doing VACUUM FULL.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags &= ~BTP_HALF_DEAD;
	opaque->btpo_flags |= BTP_DELETED;
	opaque->btpo.xact =
		vacuum_full ? FrozenTransactionId : ReadNewTransactionId();

	/* And update the metapage, if needed */
	if (BufferIsValid(metabuf))
	{
		metad->btm_fastroot = rightsib;
		metad->btm_fastlevel = targetlevel;
		MarkBufferDirty(metabuf);
	}

	/* Must mark buffers dirty before XLogInsert */
	MarkBufferDirty(pbuf);
	MarkBufferDirty(rbuf);
	MarkBufferDirty(buf);
	if (BufferIsValid(lbuf))
		MarkBufferDirty(lbuf);

	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		xl_btree_delete_page xlrec;
		xl_btree_metadata xlmeta;
		uint8		xlinfo;
		XLogRecPtr	recptr;
		XLogRecData rdata[5];
		XLogRecData *nextrdata;

		xl_btreetid_set(&(xlrec.target), rel, parent, poffset);
		xlrec.deadblk = target;
		xlrec.leftblk = leftsib;
		xlrec.rightblk = rightsib;

		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfBtreeDeletePage;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = nextrdata = &(rdata[1]);

		if (BufferIsValid(metabuf))
		{
			xlmeta.root = metad->btm_root;
			xlmeta.level = metad->btm_level;
			xlmeta.fastroot = metad->btm_fastroot;
			xlmeta.fastlevel = metad->btm_fastlevel;

			nextrdata->data = (char *) &xlmeta;
			nextrdata->len = sizeof(xl_btree_metadata);
			nextrdata->buffer = InvalidBuffer;
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			xlinfo = XLOG_BTREE_DELETE_PAGE_META;
		}
		else if (parent_half_dead)
			xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
		else
			xlinfo = XLOG_BTREE_DELETE_PAGE;

		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->next = nextrdata + 1;
		nextrdata->buffer = pbuf;
		nextrdata->buffer_std = true;
		nextrdata++;

		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->buffer = rbuf;
		nextrdata->buffer_std = true;
		nextrdata->next = NULL;

		if (BufferIsValid(lbuf))
		{
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			nextrdata->data = NULL;
			nextrdata->len = 0;
			nextrdata->buffer = lbuf;
			nextrdata->buffer_std = true;
			nextrdata->next = NULL;
		}

		recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

		if (BufferIsValid(metabuf))
		{
			PageSetLSN(metapg, recptr);
			PageSetTLI(metapg, ThisTimeLineID);
		}
		page = BufferGetPage(pbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(rbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(buf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		if (BufferIsValid(lbuf))
		{
			page = BufferGetPage(lbuf);
			PageSetLSN(page, recptr);
			PageSetTLI(page, ThisTimeLineID);
		}
	}

	END_CRIT_SECTION();

	/* release metapage; send out relcache inval if metapage changed */
	if (BufferIsValid(metabuf))
	{
		CacheInvalidateRelcache(rel);
		_bt_relbuf(rel, metabuf);
	}
	/* can always release leftsib immediately */
	if (BufferIsValid(lbuf))
		_bt_relbuf(rel, lbuf);

	/*
	 * If parent became half dead, recurse to delete it. Otherwise, if right
	 * sibling is empty and is now the last child of the parent, recurse to
	 * try to delete it.  (These cases cannot apply at the same time, though
	 * the second case might itself recurse to the first.)
	 *
	 * When recursing to parent, we hold the lock on the target page until
	 * done.  This delays any insertions into the keyspace that was just
	 * effectively reassigned to the parent's right sibling.  If we allowed
	 * that, and there were enough such insertions before we finish deleting
	 * the parent, page splits within that keyspace could lead to inserting
	 * out-of-order keys into the grandparent level.  It is thought that that
	 * wouldn't have any serious consequences, but it still seems like a
	 * pretty bad idea.
	 */
	if (parent_half_dead)
	{
		/* recursive call will release pbuf */
		_bt_relbuf(rel, rbuf);
		result = _bt_pagedel(rel, pbuf, stack->bts_parent, vacuum_full) + 1;
		_bt_relbuf(rel, buf);
	}
	else if (parent_one_child && rightsib_empty)
	{
		_bt_relbuf(rel, pbuf);
		_bt_relbuf(rel, buf);
		/* recursive call will release rbuf */
		result = _bt_pagedel(rel, rbuf, stack, vacuum_full) + 1;
	}
	else
	{
		_bt_relbuf(rel, pbuf);
		_bt_relbuf(rel, buf);
		_bt_relbuf(rel, rbuf);
		result = 1;
	}

	return result;
}
Esempio n. 4
0
File: nbtpage.c Progetto: 50wu/gpdb
/*
 *	_bt_gettrueroot() -- Get the true root page of the btree.
 *
 *		This is the same as the BT_READ case of _bt_getroot(), except
 *		we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.	If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
	Buffer		metabuf;
	Page		metapg;
	BTPageOpaque metaopaque;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	uint32		rootlevel;
	BTMetaPageData *metad;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	/*
	 * We don't try to use cached metapage data here, since (a) this path is
	 * not performance-critical, and (b) if we are here it suggests our cache
	 * is out-of-date anyway.  In light of point (b), it's probably safest to
	 * actively flush any cached metapage info.
	 */
	if (rel->rd_amcache)
		pfree(rel->rd_amcache);
	rel->rd_amcache = NULL;

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	metad = BTPageGetMeta(metapg);

	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	/* if no root page initialized yet, fail */
	if (metad->btm_root == P_NONE)
	{
		_bt_relbuf(rel, metabuf);
		return InvalidBuffer;
	}

	rootblkno = metad->btm_root;
	rootlevel = metad->btm_level;

	/*
	 * We are done with the metapage; arrange to release it via first
	 * _bt_relandgetbuf call
	 */
	rootbuf = metabuf;

	for (;;)
	{
		rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

		if (!P_IGNORE(rootopaque))
			break;

		/* it's dead, Jim.  step right one page */
		if (P_RIGHTMOST(rootopaque))
			elog(ERROR, "no live root page found in index \"%s\"",
				 RelationGetRelationName(rel));
		rootblkno = rootopaque->btpo_next;
	}

	/* Note: can't check btpo.level on deleted pages */
	if (rootopaque->btpo.level != rootlevel)
		elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
			 rootblkno, RelationGetRelationName(rel),
			 rootopaque->btpo.level, rootlevel);

	return rootbuf;
}
Esempio n. 5
0
/*
 *	_bt_moveright() -- move right in the btree if necessary.
 *
 * When we follow a pointer to reach a page, it is possible that
 * the page has changed in the meanwhile.  If this happens, we're
 * guaranteed that the page has "split right" -- that is, that any
 * data that appeared on the page originally is either on the page
 * or strictly to the right of it.
 *
 * This routine decides whether or not we need to move right in the
 * tree by examining the high key entry on the page.  If that entry
 * is strictly less than the scankey, or <= the scankey in the nextkey=true
 * case, then we followed the wrong link and we need to move right.
 *
 * The passed scankey must be an insertion-type scankey (see nbtree/README),
 * but it can omit the rightmost column(s) of the index.
 *
 * When nextkey is false (the usual case), we are looking for the first
 * item >= scankey.  When nextkey is true, we are looking for the first
 * item strictly greater than scankey.
 *
 * If forupdate is true, we will attempt to finish any incomplete splits
 * that we encounter.  This is required when locking a target page for an
 * insertion, because we don't allow inserting on a page before the split
 * is completed.  'stack' is only used if forupdate is true.
 *
 * On entry, we have the buffer pinned and a lock of the type specified by
 * 'access'.  If we move right, we release the buffer and lock and acquire
 * the same on the right sibling.  Return value is the buffer we stop at.
 */
Buffer
_bt_moveright(Relation rel,
			  Buffer buf,
			  int keysz,
			  ScanKey scankey,
			  bool nextkey,
			  bool forupdate,
			  BTStack stack,
			  int access)
{
	Page		page;
	BTPageOpaque opaque;
	int32		cmpval;

	/*
	 * When nextkey = false (normal case): if the scan key that brought us to
	 * this page is > the high key stored on the page, then the page has split
	 * and we need to move right.  (If the scan key is equal to the high key,
	 * we might or might not need to move right; have to scan the page first
	 * anyway.)
	 *
	 * When nextkey = true: move right if the scan key is >= page's high key.
	 *
	 * The page could even have split more than once, so scan as far as
	 * needed.
	 *
	 * We also have to move right if we followed a link that brought us to a
	 * dead page.
	 */
	cmpval = nextkey ? 0 : 1;

	for (;;)
	{
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		if (P_RIGHTMOST(opaque))
			break;

		/*
		 * Finish any incomplete splits we encounter along the way.
		 */
		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
		{
			BlockNumber blkno = BufferGetBlockNumber(buf);

			/* upgrade our lock if necessary */
			if (access == BT_READ)
			{
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				LockBuffer(buf, BT_WRITE);
			}

			if (P_INCOMPLETE_SPLIT(opaque))
				_bt_finish_split(rel, buf, stack);
			else
				_bt_relbuf(rel, buf);

			/* re-acquire the lock in the right mode, and re-check */
			buf = _bt_getbuf(rel, blkno, access);
			continue;
		}

		if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
		{
			/* step right one page */
			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
			continue;
		}
		else
			break;
	}

	if (P_IGNORE(opaque))
		elog(ERROR, "fell off the end of index \"%s\"",
			 RelationGetRelationName(rel));

	return buf;
}
Esempio n. 6
0
/*
 * _bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held
 * in that case).
 *
 * When working on a non-leaf level, it is possible for the returned page
 * to be half-dead; the caller should check that condition and step left
 * again if it's important.
 */
static Buffer
_bt_walk_left(Relation rel, Buffer buf)
{
	Page		page;
	BTPageOpaque opaque;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
		BlockNumber obknum;
		BlockNumber lblkno;
		BlockNumber blkno;
		int			tries;

		/* if we're at end of tree, release buf and return failure */
		if (P_LEFTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			break;
		}
		/* remember original page we are stepping left from */
		obknum = BufferGetBlockNumber(buf);
		/* step left */
		blkno = lblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit). If we
		 * don't find the correct page by then, the most likely bet is that
		 * the original page got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain.	Caller
		 * must reject half-dead pages if wanted.
		 */
		tries = 0;
		for (;;)
		{
			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Return to the original page to see what's up */
		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted.	Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				blkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
				page = BufferGetPage(buf);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_ISDELETED(opaque))
					break;
			}

			/*
			 * Now return to top of loop, resetting obknum to point to this
			 * nondeleted page, and try again.
			 */
		}
		else
		{
			/*
			 * It wasn't deleted; the explanation had better be that the page
			 * to the left got split or deleted. Without this check, we'd go
			 * into an infinite loop if there's anything wrong.
			 */
			if (opaque->btpo_prev == lblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 obknum, RelationGetRelationName(rel));
			/* Okay to try again with new lblkno value */
		}
	}

	return InvalidBuffer;
}
Esempio n. 7
0
/*
 *	_bt_steppage() -- Step to next page containing valid data for scan
 *
 * On entry, so->currPos.buf must be pinned and read-locked.  We'll drop
 * the lock and pin before moving to next page.
 *
 * On success exit, we hold pin and read-lock on the next interesting page,
 * and so->currPos is updated to contain data from that page.
 *
 * If there are no more matching records in the given direction, we drop all
 * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
 */
static bool
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Relation	rel;
	Page		page;
	BTPageOpaque opaque;

	/* we must have the buffer pinned and locked */
	Assert(BufferIsValid(so->currPos.buf));

	/* Before leaving current page, deal with any killed items */
	if (so->numKilled > 0)
		_bt_killitems(scan, true);

	/*
	 * Before we modify currPos, make a copy of the page data if there was a
	 * mark position that needs it.
	 */
	if (so->markItemIndex >= 0)
	{
		/* bump pin on current buffer for assignment to mark buffer */
		IncrBufferRefCount(so->currPos.buf);
		memcpy(&so->markPos, &so->currPos,
			   offsetof(BTScanPosData, items[1]) +
			   so->currPos.lastItem * sizeof(BTScanPosItem));
		if (so->markTuples)
			memcpy(so->markTuples, so->currTuples,
				   so->currPos.nextTupleOffset);
		so->markPos.itemIndex = so->markItemIndex;
		so->markItemIndex = -1;
	}

	rel = scan->indexRelation;

	if (ScanDirectionIsForward(dir))
	{
		/* Walk right to the next page with data */
		/* We must rely on the previously saved nextPage link! */
		BlockNumber blkno = so->currPos.nextPage;

		/* Remember we left a page with data */
		so->currPos.moreLeft = true;

		for (;;)
		{
			/* release the previous buffer */
			_bt_relbuf(rel, so->currPos.buf);
			so->currPos.buf = InvalidBuffer;
			/* if we're at end of scan, give up */
			if (blkno == P_NONE || !so->currPos.moreRight)
				return false;
			/* check for interrupts while we're not holding any buffer lock */
			CHECK_FOR_INTERRUPTS();
			/* step right one page */
			so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
			/* check for deleted page */
			page = BufferGetPage(so->currPos.buf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, blkno, scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreRight if we can stop */
				if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
					break;
			}
			/* nope, keep going */
			blkno = opaque->btpo_next;
		}
	}
	else
	{
		/* Remember we left a page with data */
		so->currPos.moreRight = true;

		/*
		 * Walk left to the next page with data.  This is much more complex
		 * than the walk-right case because of the possibility that the page
		 * to our left splits while we are in flight to it, plus the
		 * possibility that the page we were on gets deleted after we leave
		 * it.	See nbtree/README for details.
		 */
		for (;;)
		{
			/* Done if we know there are no matching keys to the left */
			if (!so->currPos.moreLeft)
			{
				_bt_relbuf(rel, so->currPos.buf);
				so->currPos.buf = InvalidBuffer;
				return false;
			}

			/* Step to next physical page */
			so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);

			/* if we're physically at end of index, return failure */
			if (so->currPos.buf == InvalidBuffer)
				return false;

			/*
			 * Okay, we managed to move left to a non-deleted page. Done if
			 * it's not half-dead and contains matching tuples. Else loop back
			 * and do it all again.
			 */
			page = BufferGetPage(so->currPos.buf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
			if (!P_IGNORE(opaque))
			{
				PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
				/* see if there are any matches on this page */
				/* note that this will clear moreLeft if we can stop */
				if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
					break;
			}
		}
	}

	return true;
}
Esempio n. 8
0
/*
 *	_bt_step() -- Step one item in the requested direction in a scan on
 *				  the tree.
 *
 *		*bufP is the current buffer (read-locked and pinned).  If we change
 *		pages, it's updated appropriately.
 *
 *		If successful, update scan's currentItemData and return true.
 *		If no adjacent record exists in the requested direction,
 *		release buffer pin/locks and return false.
 */
bool
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	ItemPointer current = &(scan->currentItemData);
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber offnum,
				maxoff;
	BlockNumber blkno;

	/*
	 * Don't use ItemPointerGetOffsetNumber or you risk to get assertion
	 * due to ability of ip_posid to be equal 0.
	 */
	offnum = current->ip_posid;

	page = BufferGetPage(*bufP);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	if (ScanDirectionIsForward(dir))
	{
		if (!PageIsEmpty(page) && offnum < maxoff)
			offnum = OffsetNumberNext(offnum);
		else
		{
			/* Walk right to the next page with data */
			for (;;)
			{
				/* if we're at end of scan, release the buffer and return */
				if (P_RIGHTMOST(opaque))
				{
					_bt_relbuf(rel, *bufP);
					ItemPointerSetInvalid(current);
					*bufP = so->btso_curbuf = InvalidBuffer;
					return false;
				}
				/* step right one page */
				blkno = opaque->btpo_next;
				_bt_relbuf(rel, *bufP);
				*bufP = _bt_getbuf(rel, blkno, BT_READ);
				page = BufferGetPage(*bufP);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_IGNORE(opaque))
				{
					maxoff = PageGetMaxOffsetNumber(page);
					/* done if it's not empty */
					offnum = P_FIRSTDATAKEY(opaque);
					if (!PageIsEmpty(page) && offnum <= maxoff)
						break;
				}
			}
		}
	}
	else
/* backwards scan */
	{
		if (offnum > P_FIRSTDATAKEY(opaque))
			offnum = OffsetNumberPrev(offnum);
		else
		{
			/*
			 * Walk left to the next page with data.  This is much more
			 * complex than the walk-right case because of the possibility
			 * that the page to our left splits while we are in flight to
			 * it, plus the possibility that the page we were on gets
			 * deleted after we leave it.  See nbtree/README for details.
			 */
			for (;;)
			{
				*bufP = _bt_walk_left(rel, *bufP);

				/* if we're at end of scan, return failure */
				if (*bufP == InvalidBuffer)
				{
					ItemPointerSetInvalid(current);
					so->btso_curbuf = InvalidBuffer;
					return false;
				}
				page = BufferGetPage(*bufP);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);

				/*
				 * Okay, we managed to move left to a non-deleted page.
				 * Done if it's not half-dead and not empty.  Else loop
				 * back and do it all again.
				 */
				if (!P_IGNORE(opaque))
				{
					maxoff = PageGetMaxOffsetNumber(page);
					offnum = maxoff;
					if (!PageIsEmpty(page) &&
						maxoff >= P_FIRSTDATAKEY(opaque))
						break;
				}
			}
		}
	}

	/* Update scan state */
	so->btso_curbuf = *bufP;
	blkno = BufferGetBlockNumber(*bufP);
	ItemPointerSet(current, blkno, offnum);

	return true;
}
Esempio n. 9
0
/*
 *	_bt_search() -- Search the tree for a particular scankey,
 *		or more precisely for the first leaf page it could be on.
 *
 * Return value is a stack of parent-page pointers.  *bufP is set to the
 * address of the leaf-page buffer, which is read-locked and pinned.
 * No locks are held on the parent pages, however!
 *
 * NOTE that the returned buffer is read-locked regardless of the access
 * parameter.  However, access = BT_WRITE will allow an empty root page
 * to be created and returned.	When access = BT_READ, an empty index
 * will result in *bufP being set to InvalidBuffer.
 */
BTStack
_bt_search(Relation rel, int keysz, ScanKey scankey,
		   Buffer *bufP, int access)
{
	BTStack		stack_in = NULL;

	/* Get the root page to start with */
	*bufP = _bt_getroot(rel, access);

	/* If index is empty and access = BT_READ, no root page is created. */
	if (!BufferIsValid(*bufP))
		return (BTStack) NULL;

	/* Loop iterates once per level descended in the tree */
	for (;;)
	{
		Page		page;
		BTPageOpaque opaque;
		OffsetNumber offnum;
		ItemId		itemid;
		BTItem		btitem;
		IndexTuple	itup;
		BlockNumber blkno;
		BlockNumber par_blkno;
		BTStack		new_stack;

		/*
		 * Race -- the page we just grabbed may have split since we read
		 * its pointer in the parent (or metapage).  If it has, we may
		 * need to move right to its new sibling.  Do that.
		 */
		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ);

		/* if this is a leaf page, we're done */
		page = BufferGetPage(*bufP);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISLEAF(opaque))
			break;

		/*
		 * Find the appropriate item on the internal page, and get the
		 * child page that it points to.
		 */
		offnum = _bt_binsrch(rel, *bufP, keysz, scankey);
		itemid = PageGetItemId(page, offnum);
		btitem = (BTItem) PageGetItem(page, itemid);
		itup = &(btitem->bti_itup);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
		par_blkno = BufferGetBlockNumber(*bufP);

		/*
		 * We need to save the location of the index entry we chose in the
		 * parent page on a stack. In case we split the tree, we'll use
		 * the stack to work back up to the parent page.  We also save the
		 * actual downlink (TID) to uniquely identify the index entry, in
		 * case it moves right while we're working lower in the tree.  See
		 * the paper by Lehman and Yao for how this is detected and
		 * handled. (We use the child link to disambiguate duplicate keys
		 * in the index -- Lehman and Yao disallow duplicate keys.)
		 */
		new_stack = (BTStack) palloc(sizeof(BTStackData));
		new_stack->bts_blkno = par_blkno;
		new_stack->bts_offset = offnum;
		memcpy(&new_stack->bts_btitem, btitem, sizeof(BTItemData));
		new_stack->bts_parent = stack_in;

		/* drop the read lock on the parent page, acquire one on the child */
		_bt_relbuf(rel, *bufP);
		*bufP = _bt_getbuf(rel, blkno, BT_READ);

		/* okay, all set to move down a level */
		stack_in = new_stack;
	}

	return stack_in;
}
Esempio n. 10
0
/*
 * _bt_get_endpoint() -- Find the first or last page on a given tree level
 *
 * If the index is empty, we will return InvalidBuffer; any other failure
 * condition causes ereport().	We will not return a dead page.
 *
 * The returned buffer is pinned and read-locked.
 */
Buffer
_bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
{
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber offnum;
	BlockNumber blkno;
	BTItem		btitem;
	IndexTuple	itup;

	/*
	 * If we are looking for a leaf page, okay to descend from fast root;
	 * otherwise better descend from true root.  (There is no point in
	 * being smarter about intermediate levels.)
	 */
	if (level == 0)
		buf = _bt_getroot(rel, BT_READ);
	else
		buf = _bt_gettrueroot(rel);

	if (!BufferIsValid(buf))
	{
		/* empty index... */
		return InvalidBuffer;
	}

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
		/*
		 * If we landed on a deleted page, step right to find a live page
		 * (there must be one).  Also, if we want the rightmost page, step
		 * right if needed to get to it (this could happen if the page
		 * split since we obtained a pointer to it).
		 */
		while (P_IGNORE(opaque) ||
			   (rightmost && !P_RIGHTMOST(opaque)))
		{
			blkno = opaque->btpo_next;
			if (blkno == P_NONE)
				elog(ERROR, "fell off the end of \"%s\"",
					 RelationGetRelationName(rel));
			_bt_relbuf(rel, buf);
			buf = _bt_getbuf(rel, blkno, BT_READ);
			page = BufferGetPage(buf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Done? */
		if (opaque->btpo.level == level)
			break;
		if (opaque->btpo.level < level)
			elog(ERROR, "btree level %u not found", level);

		/* Descend to leftmost or rightmost child page */
		if (rightmost)
			offnum = PageGetMaxOffsetNumber(page);
		else
			offnum = P_FIRSTDATAKEY(opaque);

		btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
		itup = &(btitem->bti_itup);
		blkno = ItemPointerGetBlockNumber(&(itup->t_tid));

		_bt_relbuf(rel, buf);
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	}

	return buf;
}
Esempio n. 11
0
/*
 * _bt_pagedel() -- Delete a page from the b-tree.
 *
 * This action unlinks the page from the b-tree structure, removing all
 * pointers leading to it --- but not touching its own left and right links.
 * The page cannot be physically reclaimed right away, since other processes
 * may currently be trying to follow links leading to the page; they have to
 * be allowed to use its right-link to recover.  See nbtree/README.
 *
 * On entry, the target buffer must be pinned and read-locked.	This lock and
 * pin will be dropped before exiting.
 *
 * Returns the number of pages successfully deleted (zero on failure; could
 * be more than one if parent blocks were deleted).
 *
 * NOTE: this leaks memory.  Rather than trying to clean up everything
 * carefully, it's better to run it in a temp context that can be reset
 * frequently.
 */
int
_bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
{
	BlockNumber target,
				leftsib,
				rightsib,
				parent;
	OffsetNumber poffset,
				maxoff;
	uint32		targetlevel,
				ilevel;
	ItemId		itemid;
	BTItem		targetkey,
				btitem;
	ScanKey		itup_scankey;
	BTStack		stack;
	Buffer		lbuf,
				rbuf,
				pbuf;
	bool		parent_half_dead;
	bool		parent_one_child;
	bool		rightsib_empty;
	Buffer		metabuf = InvalidBuffer;
	Page		metapg = NULL;
	BTMetaPageData *metad = NULL;
	Page		page;
	BTPageOpaque opaque;

	/*
	 * We can never delete rightmost pages nor root pages.	While at it, check
	 * that page is not already deleted and is empty.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		_bt_relbuf(rel, buf);
		return 0;
	}

	/*
	 * Save info about page, including a copy of its high key (it must have
	 * one, being non-rightmost).
	 */
	target = BufferGetBlockNumber(buf);
	targetlevel = opaque->btpo.level;
	leftsib = opaque->btpo_prev;
	itemid = PageGetItemId(page, P_HIKEY);
	targetkey = CopyBTItem((BTItem) PageGetItem(page, itemid));

	/*
	 * We need to get an approximate pointer to the page's parent page. Use
	 * the standard search mechanism to search for the page's high key; this
	 * will give us a link to either the current parent or someplace to its
	 * left (if there are multiple equal high keys).  To avoid deadlocks, we'd
	 * better drop the target page lock first.
	 */
	_bt_relbuf(rel, buf);
	/* we need a scan key to do our search, so build one */
	itup_scankey = _bt_mkscankey(rel, &(targetkey->bti_itup));
	/* find the leftmost leaf page containing this key */
	stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
					   &lbuf, BT_READ);
	/* don't need a pin on that either */
	_bt_relbuf(rel, lbuf);

	/*
	 * If we are trying to delete an interior page, _bt_search did more than
	 * we needed.  Locate the stack item pointing to our parent level.
	 */
	ilevel = 0;
	for (;;)
	{
		if (stack == NULL)
			elog(ERROR, "not enough stack items");
		if (ilevel == targetlevel)
			break;
		stack = stack->bts_parent;
		ilevel++;
	}

	/*
	 * We have to lock the pages we need to modify in the standard order:
	 * moving right, then up.  Else we will deadlock against other writers.
	 *
	 * So, we need to find and write-lock the current left sibling of the
	 * target page.  The sibling that was current a moment ago could have
	 * split, so we may have to move right.  This search could fail if either
	 * the sibling or the target page was deleted by someone else meanwhile;
	 * if so, give up.	(Right now, that should never happen, since page
	 * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
	 * concurrently on the same table.)
	 */
	if (leftsib != P_NONE)
	{
		lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		while (P_ISDELETED(opaque) || opaque->btpo_next != target)
		{
			/* step right one page */
			leftsib = opaque->btpo_next;
			_bt_relbuf(rel, lbuf);
			if (leftsib == P_NONE)
			{
				elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
					 RelationGetRelationName(rel));
				return 0;
			}
			lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
			page = BufferGetPage(lbuf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}
	}
	else
		lbuf = InvalidBuffer;

	/*
	 * Next write-lock the target page itself.	It should be okay to take just
	 * a write lock not a superexclusive lock, since no scans would stop on an
	 * empty page.
	 */
	buf = _bt_getbuf(rel, target, BT_WRITE);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Check page is still empty etc, else abandon deletion.  The empty check
	 * is necessary since someone else might have inserted into it while we
	 * didn't have it locked; the others are just for paranoia's sake.
	 */
	if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
		P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
	{
		_bt_relbuf(rel, buf);
		if (BufferIsValid(lbuf))
			_bt_relbuf(rel, lbuf);
		return 0;
	}
	if (opaque->btpo_prev != leftsib)
		elog(ERROR, "left link changed unexpectedly in block %u of \"%s\"",
			 target, RelationGetRelationName(rel));

	/*
	 * And next write-lock the (current) right sibling.
	 */
	rightsib = opaque->btpo_next;
	rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);

	/*
	 * Next find and write-lock the current parent of the target page. This is
	 * essentially the same as the corresponding step of splitting.  However,
	 * it's possible for the search to fail (for reasons explained in README).
	 * If that happens, we recover by searching the whole parent level, which
	 * is a tad inefficient but doesn't happen often enough to be a problem.
	 */
	ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
				   target, P_HIKEY);
	pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
	if (pbuf == InvalidBuffer)
	{
		/* Find the leftmost page in the parent level */
		pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false);
		stack->bts_blkno = BufferGetBlockNumber(pbuf);
		stack->bts_offset = InvalidOffsetNumber;
		_bt_relbuf(rel, pbuf);
		/* and repeat search from there */
		pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
		if (pbuf == InvalidBuffer)
			elog(ERROR, "failed to re-find parent key in \"%s\" for deletion target page %u",
				 RelationGetRelationName(rel), target);
	}
	parent = stack->bts_blkno;
	poffset = stack->bts_offset;

	/*
	 * If the target is the rightmost child of its parent, then we can't
	 * delete, unless it's also the only child --- in which case the parent
	 * changes to half-dead status.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);
	parent_half_dead = false;
	parent_one_child = false;
	if (poffset >= maxoff)
	{
		if (poffset == P_FIRSTDATAKEY(opaque))
			parent_half_dead = true;
		else
		{
			_bt_relbuf(rel, pbuf);
			_bt_relbuf(rel, rbuf);
			_bt_relbuf(rel, buf);
			if (BufferIsValid(lbuf))
				_bt_relbuf(rel, lbuf);
			return 0;
		}
	}
	else
	{
		/* Will there be exactly one child left in this parent? */
		if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
			parent_one_child = true;
	}

	/*
	 * If we are deleting the next-to-last page on the target's level, then
	 * the rightsib is a candidate to become the new fast root. (In theory, it
	 * might be possible to push the fast root even further down, but the odds
	 * of doing so are slim, and the locking considerations daunting.)
	 *
	 * We can safely acquire a lock on the metapage here --- see comments for
	 * _bt_newroot().
	 */
	if (leftsib == P_NONE)
	{
		page = BufferGetPage(rbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo.level == targetlevel);
		if (P_RIGHTMOST(opaque))
		{
			/* rightsib will be the only one left on the level */
			metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
			metapg = BufferGetPage(metabuf);
			metad = BTPageGetMeta(metapg);

			/*
			 * The expected case here is btm_fastlevel == targetlevel+1; if
			 * the fastlevel is <= targetlevel, something is wrong, and we
			 * choose to overwrite it to fix it.
			 */
			if (metad->btm_fastlevel > targetlevel + 1)
			{
				/* no update wanted */
				_bt_relbuf(rel, metabuf);
				metabuf = InvalidBuffer;
			}
		}
	}

	/*
	 * Here we begin doing the deletion.
	 */

	/* No ereport(ERROR) until changes are logged */
	START_CRIT_SECTION();

	/*
	 * Update parent.  The normal case is a tad tricky because we want to
	 * delete the target's downlink and the *following* key.  Easiest way is
	 * to copy the right sibling's downlink over the target downlink, and then
	 * delete the following item.
	 */
	page = BufferGetPage(pbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	if (parent_half_dead)
	{
		PageIndexTupleDelete(page, poffset);
		opaque->btpo_flags |= BTP_HALF_DEAD;
	}
	else
	{
		OffsetNumber nextoffset;

		itemid = PageGetItemId(page, poffset);
		btitem = (BTItem) PageGetItem(page, itemid);
		Assert(ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid)) == target);
		ItemPointerSet(&(btitem->bti_itup.t_tid), rightsib, P_HIKEY);

		nextoffset = OffsetNumberNext(poffset);
		/* This part is just for double-checking */
		itemid = PageGetItemId(page, nextoffset);
		btitem = (BTItem) PageGetItem(page, itemid);
		if (ItemPointerGetBlockNumber(&(btitem->bti_itup.t_tid)) != rightsib)
			elog(PANIC, "right sibling is not next child in \"%s\"",
				 RelationGetRelationName(rel));
		PageIndexTupleDelete(page, nextoffset);
	}

	/*
	 * Update siblings' side-links.  Note the target page's side-links will
	 * continue to point to the siblings.
	 */
	if (BufferIsValid(lbuf))
	{
		page = BufferGetPage(lbuf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		Assert(opaque->btpo_next == target);
		opaque->btpo_next = rightsib;
	}
	page = BufferGetPage(rbuf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(opaque->btpo_prev == target);
	opaque->btpo_prev = leftsib;
	rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

	/*
	 * Mark the page itself deleted.  It can be recycled when all current
	 * transactions are gone; or immediately if we're doing VACUUM FULL.
	 */
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	opaque->btpo_flags |= BTP_DELETED;
	opaque->btpo.xact =
		vacuum_full ? FrozenTransactionId : ReadNewTransactionId();

	/* And update the metapage, if needed */
	if (BufferIsValid(metabuf))
	{
		metad->btm_fastroot = rightsib;
		metad->btm_fastlevel = targetlevel;
	}

	/* XLOG stuff */
	if (!rel->rd_istemp)
	{
		xl_btree_delete_page xlrec;
		xl_btree_metadata xlmeta;
		uint8		xlinfo;
		XLogRecPtr	recptr;
		XLogRecData rdata[5];
		XLogRecData *nextrdata;

		xlrec.target.node = rel->rd_node;
		ItemPointerSet(&(xlrec.target.tid), parent, poffset);
		xlrec.deadblk = target;
		xlrec.leftblk = leftsib;
		xlrec.rightblk = rightsib;

		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfBtreeDeletePage;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].next = nextrdata = &(rdata[1]);

		if (BufferIsValid(metabuf))
		{
			xlmeta.root = metad->btm_root;
			xlmeta.level = metad->btm_level;
			xlmeta.fastroot = metad->btm_fastroot;
			xlmeta.fastlevel = metad->btm_fastlevel;

			nextrdata->data = (char *) &xlmeta;
			nextrdata->len = sizeof(xl_btree_metadata);
			nextrdata->buffer = InvalidBuffer;
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			xlinfo = XLOG_BTREE_DELETE_PAGE_META;
		}
		else
			xlinfo = XLOG_BTREE_DELETE_PAGE;

		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->next = nextrdata + 1;
		nextrdata->buffer = pbuf;
		nextrdata->buffer_std = true;
		nextrdata++;

		nextrdata->data = NULL;
		nextrdata->len = 0;
		nextrdata->buffer = rbuf;
		nextrdata->buffer_std = true;
		nextrdata->next = NULL;

		if (BufferIsValid(lbuf))
		{
			nextrdata->next = nextrdata + 1;
			nextrdata++;
			nextrdata->data = NULL;
			nextrdata->len = 0;
			nextrdata->buffer = lbuf;
			nextrdata->buffer_std = true;
			nextrdata->next = NULL;
		}

		recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

		if (BufferIsValid(metabuf))
		{
			PageSetLSN(metapg, recptr);
			PageSetTLI(metapg, ThisTimeLineID);
		}
		page = BufferGetPage(pbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(rbuf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		page = BufferGetPage(buf);
		PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		if (BufferIsValid(lbuf))
		{
			page = BufferGetPage(lbuf);
			PageSetLSN(page, recptr);
			PageSetTLI(page, ThisTimeLineID);
		}
	}

	END_CRIT_SECTION();

	/* Write and release buffers */
	if (BufferIsValid(metabuf))
		_bt_wrtbuf(rel, metabuf);
	_bt_wrtbuf(rel, pbuf);
	_bt_wrtbuf(rel, rbuf);
	_bt_wrtbuf(rel, buf);
	if (BufferIsValid(lbuf))
		_bt_wrtbuf(rel, lbuf);

	/*
	 * If parent became half dead, recurse to try to delete it. Otherwise, if
	 * right sibling is empty and is now the last child of the parent, recurse
	 * to try to delete it.  (These cases cannot apply at the same time,
	 * though the second case might itself recurse to the first.)
	 */
	if (parent_half_dead)
	{
		buf = _bt_getbuf(rel, parent, BT_READ);
		return _bt_pagedel(rel, buf, vacuum_full) + 1;
	}
	if (parent_one_child && rightsib_empty)
	{
		buf = _bt_getbuf(rel, rightsib, BT_READ);
		return _bt_pagedel(rel, buf, vacuum_full) + 1;
	}

	return 1;
}
Esempio n. 12
0
/*
 *	_bt_gettrueroot() -- Get the true root page of the btree.
 *
 *		This is the same as the BT_READ case of _bt_getroot(), except
 *		we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.	If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
	Buffer		metabuf;
	Page		metapg;
	BTPageOpaque metaopaque;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	uint32		rootlevel;
	BTMetaPageData *metad;

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	metad = BTPageGetMeta(metapg);

	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	/* if no root page initialized yet, fail */
	if (metad->btm_root == P_NONE)
	{
		_bt_relbuf(rel, metabuf);
		return InvalidBuffer;
	}

	rootblkno = metad->btm_root;
	rootlevel = metad->btm_level;

	/*
	 * We are done with the metapage; arrange to release it via first
	 * _bt_relandgetbuf call
	 */
	rootbuf = metabuf;

	for (;;)
	{
		rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

		if (!P_IGNORE(rootopaque))
			break;

		/* it's dead, Jim.  step right one page */
		if (P_RIGHTMOST(rootopaque))
			elog(ERROR, "no live root page found in \"%s\"",
				 RelationGetRelationName(rel));
		rootblkno = rootopaque->btpo_next;
	}

	/* Note: can't check btpo.level on deleted pages */
	if (rootopaque->btpo.level != rootlevel)
		elog(ERROR, "root page %u of \"%s\" has level %u, expected %u",
			 rootblkno, RelationGetRelationName(rel),
			 rootopaque->btpo.level, rootlevel);

	return rootbuf;
}
Esempio n. 13
0
/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.  The
 *		standard class of race conditions exists here; I think I covered
 *		them all in the Hopi Indian rain dance of lock requests below.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.	For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		Also, if the root page is split while we are "in flight" to it,
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
 *		insist on finding the true root.  We do, however, guarantee to
 *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
	Buffer		metabuf;
	Page		metapg;
	BTPageOpaque metaopaque;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	uint32		rootlevel;
	BTMetaPageData *metad;

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	metad = BTPageGetMeta(metapg);

	/* sanity-check the metapage */
	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	/* if no root page initialized yet, do it */
	if (metad->btm_root == P_NONE)
	{
		/* If access = BT_READ, caller doesn't want us to create root yet */
		if (access == BT_READ)
		{
			_bt_relbuf(rel, metabuf);
			return InvalidBuffer;
		}

		/* trade in our read lock for a write lock */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(metabuf, BT_WRITE);

		/*
		 * Race condition:	if someone else initialized the metadata between
		 * the time we released the read lock and acquired the write lock, we
		 * must avoid doing it again.
		 */
		if (metad->btm_root != P_NONE)
		{
			/*
			 * Metadata initialized by someone else.  In order to guarantee no
			 * deadlocks, we have to release the metadata page and start all
			 * over again.	(Is that really true? But it's hardly worth trying
			 * to optimize this case.)
			 */
			_bt_relbuf(rel, metabuf);
			return _bt_getroot(rel, access);
		}

		/*
		 * Get, initialize, write, and leave a lock of the appropriate type on
		 * the new root page.  Since this is the first page in the tree, it's
		 * a leaf as well as the root.
		 */
		rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
		rootblkno = BufferGetBlockNumber(rootbuf);
		rootpage = BufferGetPage(rootbuf);

		_bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
		rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
		rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
		rootopaque->btpo.level = 0;

		/* NO ELOG(ERROR) till meta is updated */
		START_CRIT_SECTION();

		metad->btm_root = rootblkno;
		metad->btm_level = 0;
		metad->btm_fastroot = rootblkno;
		metad->btm_fastlevel = 0;

		/* XLOG stuff */
		if (!rel->rd_istemp)
		{
			xl_btree_newroot xlrec;
			XLogRecPtr	recptr;
			XLogRecData rdata;

			xlrec.node = rel->rd_node;
			xlrec.rootblk = rootblkno;
			xlrec.level = 0;

			rdata.data = (char *) &xlrec;
			rdata.len = SizeOfBtreeNewroot;
			rdata.buffer = InvalidBuffer;
			rdata.next = NULL;

			recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);

			PageSetLSN(rootpage, recptr);
			PageSetTLI(rootpage, ThisTimeLineID);
			PageSetLSN(metapg, recptr);
			PageSetTLI(metapg, ThisTimeLineID);
		}

		END_CRIT_SECTION();

		_bt_wrtnorelbuf(rel, rootbuf);

		/*
		 * swap root write lock for read lock.	There is no danger of anyone
		 * else accessing the new root page while it's unlocked, since no one
		 * else knows where it is yet.
		 */
		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(rootbuf, BT_READ);

		/* okay, metadata is correct, write and release it */
		_bt_wrtbuf(rel, metabuf);
	}
	else
	{
		rootblkno = metad->btm_fastroot;
		Assert(rootblkno != P_NONE);
		rootlevel = metad->btm_fastlevel;

		/*
		 * We are done with the metapage; arrange to release it via first
		 * _bt_relandgetbuf call
		 */
		rootbuf = metabuf;

		for (;;)
		{
			rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
			rootpage = BufferGetPage(rootbuf);
			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

			if (!P_IGNORE(rootopaque))
				break;

			/* it's dead, Jim.  step right one page */
			if (P_RIGHTMOST(rootopaque))
				elog(ERROR, "no live root page found in \"%s\"",
					 RelationGetRelationName(rel));
			rootblkno = rootopaque->btpo_next;
		}

		/* Note: can't check btpo.level on deleted pages */
		if (rootopaque->btpo.level != rootlevel)
			elog(ERROR, "root page %u of \"%s\" has level %u, expected %u",
				 rootblkno, RelationGetRelationName(rel),
				 rootopaque->btpo.level, rootlevel);
	}

	/*
	 * By here, we have a pin and read lock on the root page, and no lock set
	 * on the metadata page.  Return the root page's buffer.
	 */
	return rootbuf;
}
Esempio n. 14
0
void
_bitmap_create_lov_heapandindex(Relation rel,
								Oid lovComptypeOid,
								Oid *lovHeapOid,
								Oid *lovIndexOid,
								Oid lovHeapRelfilenode,
								Oid lovIndexRelfilenode)
{
	char		lovHeapName[NAMEDATALEN];
	char		lovIndexName[NAMEDATALEN];
	TupleDesc	tupDesc;
	IndexInfo  *indexInfo;
	ObjectAddress	objAddr, referenced;
	Oid		   *classObjectId;
	int16	   *coloptions;
	Oid			heapid;
	Oid			idxid;
	int			indattrs;
	int			i;
	Oid			unusedArrayOid = InvalidOid;

	Assert(rel != NULL);

	/* create the new names for the new lov heap and index */
	snprintf(lovHeapName, sizeof(lovHeapName),
			 "pg_bm_%u", RelationGetRelid(rel));
	snprintf(lovIndexName, sizeof(lovIndexName),
			 "pg_bm_%u_index", RelationGetRelid(rel));


	heapid = get_relname_relid(lovHeapName, PG_BITMAPINDEX_NAMESPACE);

	/*
	 * If heapid exists, then this is happening during re-indexing.
	 * We allocate new relfilenodes for lov heap and lov index.
	 *
	 * XXX Each segment db may have different relfilenodes for lov heap and
	 * lov index, which should not be an issue now. Ideally, we would like each
	 * segment db use the same oids.
	 */
	if (OidIsValid(heapid))
	{
		Relation lovHeap;
		Relation lovIndex;
		Buffer btree_metabuf;
		Page   btree_metapage;

		*lovHeapOid = heapid;

		idxid = get_relname_relid(lovIndexName, PG_BITMAPINDEX_NAMESPACE);
		Assert(OidIsValid(idxid));
		*lovIndexOid = idxid;

		lovComptypeOid = get_rel_type_id(heapid);
		Assert(OidIsValid(lovComptypeOid));

		lovHeap = heap_open(heapid, AccessExclusiveLock);
		lovIndex = index_open(idxid, AccessExclusiveLock);

		if (OidIsValid(lovHeapRelfilenode))
			setNewRelfilenodeToOid(lovHeap, lovHeapRelfilenode);
		else
			setNewRelfilenode(lovHeap);
		if (OidIsValid(lovIndexRelfilenode))
			setNewRelfilenodeToOid(lovIndex, lovIndexRelfilenode);
		else
			setNewRelfilenode(lovIndex);

		/*
		 * After creating the new relfilenode for a btee index, this is not
		 * a btree anymore. We create the new metapage for this btree.
		 */
		btree_metabuf = _bt_getbuf(lovIndex, P_NEW, BT_WRITE);
		Assert (BTREE_METAPAGE == BufferGetBlockNumber(btree_metabuf));
		btree_metapage = BufferGetPage(btree_metabuf);
		_bt_initmetapage(btree_metapage, P_NONE, 0);

		/* XLOG the metapage */
		if (!XLog_UnconvertedCanBypassWal() && !lovIndex->rd_istemp)
		{
			
			// Fetch gp_persistent_relation_node information that will be added to XLOG record.
			RelationFetchGpRelationNodeForXLog(lovIndex);
			
			_bt_lognewpage(lovIndex,
						   btree_metapage,
						   BufferGetBlockNumber(btree_metabuf));
		}
		
		/* This cache value is not valid anymore. */
		if (lovIndex->rd_amcache)
		{
			pfree(lovIndex->rd_amcache);
			lovIndex->rd_amcache = NULL;
		}
		MarkBufferDirty(btree_metabuf);
		_bt_relbuf(lovIndex, btree_metabuf);

		index_close(lovIndex, NoLock);
		heap_close(lovHeap, NoLock);

		return;
	}

	/*
	 * create a new empty heap to store all attribute values with their
	 * corresponding block number and offset in LOV.
	 */
	tupDesc = _bitmap_create_lov_heapTupleDesc(rel);

	Assert(rel->rd_rel != NULL);

  	heapid =
		heap_create_with_catalog(lovHeapName, PG_BITMAPINDEX_NAMESPACE,
								 rel->rd_rel->reltablespace,
								 *lovHeapOid, rel->rd_rel->relowner,
								 tupDesc,
								 /* relam */ InvalidOid, RELKIND_RELATION, RELSTORAGE_HEAP,
								 rel->rd_rel->relisshared, false, /* bufferPoolBulkLoad */ false, 0,
								 ONCOMMIT_NOOP, NULL /* GP Policy */,
								 (Datum)0, true,
								 /* valid_opts */ true,
								 &lovComptypeOid,
								 &unusedArrayOid,
						 		 /* persistentTid */ NULL,
						 		 /* persistentSerialNum */ NULL);
	Assert(heapid == *lovHeapOid);

	/*
	 * We must bump the command counter to make the newly-created relation
	 * tuple visible for opening.
	 */
	CommandCounterIncrement();

	objAddr.classId = RelationRelationId;
	objAddr.objectId = *lovHeapOid;
	objAddr.objectSubId = 0 ;

	referenced.classId = RelationRelationId;
	referenced.objectId = RelationGetRelid(rel);
	referenced.objectSubId = 0;

	recordDependencyOn(&objAddr, &referenced, DEPENDENCY_INTERNAL);

	/*
	 * create a btree index on the newly-created heap.
	 * The key includes all attributes to be indexed in this bitmap index.
	 */
	indattrs = tupDesc->natts - 2;
	indexInfo = makeNode(IndexInfo);
	indexInfo->ii_NumIndexAttrs = indattrs;
	indexInfo->ii_Expressions = NIL;
	indexInfo->ii_ExpressionsState = NIL;
	indexInfo->ii_Predicate = make_ands_implicit(NULL);
	indexInfo->ii_PredicateState = NIL;
	indexInfo->ii_Unique = true;
	indexInfo->opaque = NULL;

	classObjectId = (Oid *) palloc(indattrs * sizeof(Oid));
	coloptions = (int16 *) palloc(indattrs * sizeof(int16));
	for (i = 0; i < indattrs; i++)
	{
		Oid typid = tupDesc->attrs[i]->atttypid;

		indexInfo->ii_KeyAttrNumbers[i] = i + 1;
		classObjectId[i] = GetDefaultOpClass(typid, BTREE_AM_OID);
		coloptions[i] = 0;
	}

	idxid = index_create(*lovHeapOid, lovIndexName, *lovIndexOid,
						 indexInfo, BTREE_AM_OID,
						 rel->rd_rel->reltablespace,
						 classObjectId, coloptions, 0, false, false, (Oid *) NULL, true,
						 false, false, NULL);
	Assert(idxid == *lovIndexOid);
}