Exemple #1
0
/*
 * _bt_walk_left() -- step left one page, if possible
 *
 * The given buffer must be pinned and read-locked.  This will be dropped
 * before stepping left.  On return, we have pin and read lock on the
 * returned page, instead.
 *
 * Returns InvalidBuffer if there is no page to the left (no lock is held
 * in that case).
 *
 * When working on a non-leaf level, it is possible for the returned page
 * to be half-dead; the caller should check that condition and step left
 * again if it's important.
 */
static Buffer
_bt_walk_left(Relation rel, Buffer buf)
{
	Page		page;
	BTPageOpaque opaque;

	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	for (;;)
	{
		BlockNumber obknum;
		BlockNumber lblkno;
		BlockNumber blkno;
		int			tries;

		/* if we're at end of tree, release buf and return failure */
		if (P_LEFTMOST(opaque))
		{
			_bt_relbuf(rel, buf);
			break;
		}
		/* remember original page we are stepping left from */
		obknum = BufferGetBlockNumber(buf);
		/* step left */
		blkno = lblkno = opaque->btpo_prev;
		_bt_relbuf(rel, buf);
		/* check for interrupts while we're not holding any buffer lock */
		CHECK_FOR_INTERRUPTS();
		buf = _bt_getbuf(rel, blkno, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);

		/*
		 * If this isn't the page we want, walk right till we find what we
		 * want --- but go no more than four hops (an arbitrary limit). If we
		 * don't find the correct page by then, the most likely bet is that
		 * the original page got deleted and isn't in the sibling chain at all
		 * anymore, not that its left sibling got split more than four times.
		 *
		 * Note that it is correct to test P_ISDELETED not P_IGNORE here,
		 * because half-dead pages are still in the sibling chain.	Caller
		 * must reject half-dead pages if wanted.
		 */
		tries = 0;
		for (;;)
		{
			if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
			{
				/* Found desired page, return it */
				return buf;
			}
			if (P_RIGHTMOST(opaque) || ++tries > 4)
				break;
			blkno = opaque->btpo_next;
			buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
			page = BufferGetPage(buf);
			opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		}

		/* Return to the original page to see what's up */
		buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
		page = BufferGetPage(buf);
		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
		if (P_ISDELETED(opaque))
		{
			/*
			 * It was deleted.	Move right to first nondeleted page (there
			 * must be one); that is the page that has acquired the deleted
			 * one's keyspace, so stepping left from it will take us where we
			 * want to be.
			 */
			for (;;)
			{
				if (P_RIGHTMOST(opaque))
					elog(ERROR, "fell off the end of index \"%s\"",
						 RelationGetRelationName(rel));
				blkno = opaque->btpo_next;
				buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
				page = BufferGetPage(buf);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_ISDELETED(opaque))
					break;
			}

			/*
			 * Now return to top of loop, resetting obknum to point to this
			 * nondeleted page, and try again.
			 */
		}
		else
		{
			/*
			 * It wasn't deleted; the explanation had better be that the page
			 * to the left got split or deleted. Without this check, we'd go
			 * into an infinite loop if there's anything wrong.
			 */
			if (opaque->btpo_prev == lblkno)
				elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
					 obknum, RelationGetRelationName(rel));
			/* Okay to try again with new lblkno value */
		}
	}

	return InvalidBuffer;
}
Exemple #2
0
/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *		Since the root page can move around the btree file, we have to read
 *		its location from the metadata page, and then read the root page
 *		itself.  If no root page exists yet, we have to create one.  The
 *		standard class of race conditions exists here; I think I covered
 *		them all in the Hopi Indian rain dance of lock requests below.
 *
 *		The access type parameter (BT_READ or BT_WRITE) controls whether
 *		a new root page will be created or not.  If access = BT_READ,
 *		and no root page exists, we just return InvalidBuffer.	For
 *		BT_WRITE, we try to create the root page if it doesn't exist.
 *		NOTE that the returned root page will have only a read lock set
 *		on it even if access = BT_WRITE!
 *
 *		The returned page is not necessarily the true root --- it could be
 *		a "fast root" (a page that is alone in its level due to deletions).
 *		Also, if the root page is split while we are "in flight" to it,
 *		what we will return is the old root, which is now just the leftmost
 *		page on a probably-not-very-wide level.  For most purposes this is
 *		as good as or better than the true root, so we do not bother to
 *		insist on finding the true root.  We do, however, guarantee to
 *		return a live (not deleted or half-dead) page.
 *
 *		On successful return, the root page is pinned and read-locked.
 *		The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
	Buffer		metabuf;
	Page		metapg;
	BTPageOpaque metaopaque;
	Buffer		rootbuf;
	Page		rootpage;
	BTPageOpaque rootopaque;
	BlockNumber rootblkno;
	uint32		rootlevel;
	BTMetaPageData *metad;

	MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD;

	/*
	 * Try to use previously-cached metapage data to find the root.  This
	 * normally saves one buffer access per index search, which is a very
	 * helpful savings in bufmgr traffic and hence contention.
	 */
	if (rel->rd_amcache != NULL)
	{
		metad = (BTMetaPageData *) rel->rd_amcache;
		/* We shouldn't have cached it if any of these fail */
		Assert(metad->btm_magic == BTREE_MAGIC);
		Assert(metad->btm_version == BTREE_VERSION);
		Assert(metad->btm_root != P_NONE);

		rootblkno = metad->btm_fastroot;
		Assert(rootblkno != P_NONE);
		rootlevel = metad->btm_fastlevel;

		rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

		/*
		 * Since the cache might be stale, we check the page more carefully
		 * here than normal.  We *must* check that it's not deleted. If it's
		 * not alone on its level, then we reject too --- this may be overly
		 * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
		 * because that's not set in a "fast root".
		 */
		if (!P_IGNORE(rootopaque) &&
			rootopaque->btpo.level == rootlevel &&
			P_LEFTMOST(rootopaque) &&
			P_RIGHTMOST(rootopaque))
		{
			/* OK, accept cached page as the root */
			return rootbuf;
		}
		_bt_relbuf(rel, rootbuf);
		/* Cache is stale, throw it away */
		if (rel->rd_amcache)
			pfree(rel->rd_amcache);
		rel->rd_amcache = NULL;
	}

	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	metad = BTPageGetMeta(metapg);

	/* sanity-check the metapage */
	if (!(metaopaque->btpo_flags & BTP_META) ||
		metad->btm_magic != BTREE_MAGIC)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("index \"%s\" is not a btree",
						RelationGetRelationName(rel))));

	if (metad->btm_version != BTREE_VERSION)
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
						RelationGetRelationName(rel),
						metad->btm_version, BTREE_VERSION)));

	/* if no root page initialized yet, do it */
	if (metad->btm_root == P_NONE)
	{
		/* If access = BT_READ, caller doesn't want us to create root yet */
		if (access == BT_READ)
		{
			_bt_relbuf(rel, metabuf);
			return InvalidBuffer;
		}

		// Fetch gp_persistent_relation_node information that will be added to XLOG record.
		RelationFetchGpRelationNodeForXLog(rel);
		
		/* trade in our read lock for a write lock */
		LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(metabuf, BT_WRITE);

		/*
		 * Race condition:	if someone else initialized the metadata between
		 * the time we released the read lock and acquired the write lock, we
		 * must avoid doing it again.
		 */
		if (metad->btm_root != P_NONE)
		{
			/*
			 * Metadata initialized by someone else.  In order to guarantee no
			 * deadlocks, we have to release the metadata page and start all
			 * over again.	(Is that really true? But it's hardly worth trying
			 * to optimize this case.)
			 */
			_bt_relbuf(rel, metabuf);
			return _bt_getroot(rel, access);
		}

		/*
		 * Get, initialize, write, and leave a lock of the appropriate type on
		 * the new root page.  Since this is the first page in the tree, it's
		 * a leaf as well as the root.
		 */
		rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
		rootblkno = BufferGetBlockNumber(rootbuf);
		rootpage = BufferGetPage(rootbuf);
		rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
		rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
		rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
		rootopaque->btpo.level = 0;
		rootopaque->btpo_cycleid = 0;

		/* NO ELOG(ERROR) till meta is updated */
		START_CRIT_SECTION();

		metad->btm_root = rootblkno;
		metad->btm_level = 0;
		metad->btm_fastroot = rootblkno;
		metad->btm_fastlevel = 0;

		MarkBufferDirty(rootbuf);
		MarkBufferDirty(metabuf);

		/* XLOG stuff */
		if (!rel->rd_istemp)
		{
			xl_btree_newroot xlrec;
			XLogRecPtr	recptr;
			XLogRecData rdata;

			xl_btreenode_set(&(xlrec.btreenode), rel);
			xlrec.rootblk = rootblkno;
			xlrec.level = 0;

			rdata.data = (char *) &xlrec;
			rdata.len = SizeOfBtreeNewroot;
			rdata.buffer = InvalidBuffer;
			rdata.next = NULL;

			recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);

			PageSetLSN(rootpage, recptr);
			PageSetTLI(rootpage, ThisTimeLineID);
			PageSetLSN(metapg, recptr);
			PageSetTLI(metapg, ThisTimeLineID);
		}

		END_CRIT_SECTION();

		/*
		 * Send out relcache inval for metapage change (probably unnecessary
		 * here, but let's be safe).
		 */
		CacheInvalidateRelcache(rel);

		/*
		 * swap root write lock for read lock.	There is no danger of anyone
		 * else accessing the new root page while it's unlocked, since no one
		 * else knows where it is yet.
		 */
		LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(rootbuf, BT_READ);

		/* okay, metadata is correct, release lock on it */
		_bt_relbuf(rel, metabuf);
	}
	else
	{
		rootblkno = metad->btm_fastroot;
		Assert(rootblkno != P_NONE);
		rootlevel = metad->btm_fastlevel;

		/*
		 * Cache the metapage data for next time
		 */
		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
											 sizeof(BTMetaPageData));
		memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

		/*
		 * We are done with the metapage; arrange to release it via first
		 * _bt_relandgetbuf call
		 */
		rootbuf = metabuf;

		for (;;)
		{
			rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
			rootpage = BufferGetPage(rootbuf);
			rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

			if (!P_IGNORE(rootopaque))
				break;

			/* it's dead, Jim.  step right one page */
			if (P_RIGHTMOST(rootopaque))
				elog(ERROR, "no live root page found in index \"%s\"",
					 RelationGetRelationName(rel));
			rootblkno = rootopaque->btpo_next;
		}

		/* Note: can't check btpo.level on deleted pages */
		if (rootopaque->btpo.level != rootlevel)
			elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
				 rootblkno, RelationGetRelationName(rel),
				 rootopaque->btpo.level, rootlevel);
	}

	/*
	 * By here, we have a pin and read lock on the root page, and no lock set
	 * on the metadata page.  Return the root page's buffer.
	 */
	return rootbuf;
}
Exemple #3
0
/* checks the individual attributes of the tuple */
uint32 check_index_tuple_attributes(Relation rel, PageHeader header, int block, int i, char *buffer) {
  
	IndexTuple tuple;
	uint32 nerrs = 0;
	int j, off;
	
	bits8 * bitmap;
	BTPageOpaque opaque;
	
	ereport(DEBUG2,(errmsg("[%d:%d] checking attributes for the tuple", block, i)));
	
	/* get the index tuple and info about the page */
	tuple = (IndexTuple)(buffer + header->pd_linp[i].lp_off);
	opaque = (BTPageOpaque)(buffer + header->pd_special);
	
	/* current attribute offset - always starts at (buffer + off) */
	off = header->pd_linp[i].lp_off + IndexInfoFindDataOffset(tuple->t_info);
	
	ereport(DEBUG3,(errmsg("[%d:%d] tuple has %d attributes", block, (i+1),
						   RelationGetNumberOfAttributes(rel))));
	
	bitmap = (bits8*)(buffer + header->pd_linp[i].lp_off + sizeof(IndexTupleData));
	
	/* TODO This is mostly copy'n'paste from check_heap_tuple_attributes,
	   so maybe it could be refactored to share the code. */

	/* For left-most tuples on non-leaf pages, there are no data actually
	   (see src/backend/access/nbtree/README, last paragraph in section "Notes
	   About Data Representation")
   
	   Use P_LEFTMOST/P_ISLEAF to identify such cases (for the leftmost item only)
	   and set len = 0.
	*/
	
	if (P_LEFTMOST(opaque) && (! P_ISLEAF(opaque)) && (i == 0)) {
		ereport(DEBUG3, (errmsg("[%d:%d] leftmost tuple on non-leaf block => no data, skipping", block, i)));
		return nerrs;
	}
	  
	/* check all the index attributes */
	for (j = 0; j < rel->rd_att->natts; j++) {
		
		/* default length of the attribute */
		int len = rel->rd_att->attrs[j]->attlen;
		
		/* copy from src/backend/commands/analyze.c */
		bool is_varlena  = (!rel->rd_att->attrs[j]->attbyval && len == -1);
		bool is_varwidth = (!rel->rd_att->attrs[j]->attbyval && len < 0); /* thus it's "len = -2" */
		
		/* if the attribute is marked as NULL (in the tuple header), skip to the next attribute */
		if (IndexTupleHasNulls(tuple) && att_isnull(j, bitmap)) {
			ereport(DEBUG3, (errmsg("[%d:%d] attribute '%s' is NULL (skipping)", block, (i+1), rel->rd_att->attrs[j]->attname.data)));
			continue;
		}

		/* fix the alignment (see src/include/access/tupmacs.h) */
		off = att_align_pointer(off, rel->rd_att->attrs[j]->attalign, rel->rd_att->attrs[j]->attlen, buffer+off);
		
		if (is_varlena) { 
		
			/*
			  other interesting macros (see postgres.h) - should do something about those ...
			  
			  VARATT_IS_COMPRESSED(PTR)			VARATT_IS_4B_C(PTR)
			  VARATT_IS_EXTERNAL(PTR)				VARATT_IS_1B_E(PTR)
			  VARATT_IS_SHORT(PTR)				VARATT_IS_1B(PTR)
			  VARATT_IS_EXTENDED(PTR)				(!VARATT_IS_4B_U(PTR))
			*/
			
			len = VARSIZE_ANY(buffer + off);
			
			if (len < 0) {
				ereport(WARNING, (errmsg("[%d:%d] attribute '%s' has negative length < 0 (%d)", block, (i+1), rel->rd_att->attrs[j]->attname.data, len)));
				++nerrs;
				break;
			}
			
			if (VARATT_IS_COMPRESSED(buffer + off)) {
				/* the raw length should be less than 1G (and positive) */
				if ((VARRAWSIZE_4B_C(buffer + off) < 0) || (VARRAWSIZE_4B_C(buffer + off) > 1024*1024)) {
					ereport(WARNING, (errmsg("[%d:%d]  attribute '%s' has invalid length %d (should be between 0 and 1G)", block, (i+1), rel->rd_att->attrs[j]->attname.data, VARRAWSIZE_4B_C(buffer + off))));
					++nerrs;
					/* no break here, this does not break the page structure - we may check the other attributes */
				}
			}
				
			/* FIXME Check if the varlena value may be detoasted. */
			
		} else if (is_varwidth) {
		
			/* get the C-string length (at most to the end of tuple), +1 as it does not include '\0' at the end */
			/* if the string is not properly terminated, then this returns 'remaining space + 1' so it's detected */
			len = strnlen(buffer + off, header->pd_linp[i].lp_off + len + header->pd_linp[i].lp_len - off) + 1;
			
		}
			
		/* Check if the length makes sense (is not negative and does not overflow
		 * the tuple end, stop validating the other rows (we don't know where to
		 * continue anyway). */
		
		if (off + len > (header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)) {
			ereport(WARNING, (errmsg("[%d:%d] attribute '%s' (off=%d len=%d) overflows tuple end (off=%d, len=%d)", block, (i+1), rel->rd_att->attrs[j]->attname.data, off, len, header->pd_linp[i].lp_off, header->pd_linp[i].lp_len)));
			++nerrs;
			break;
		}
		
		/* skip to the next attribute */
		off += len;
		
		ereport(DEBUG3,(errmsg("[%d:%d] attribute '%s' len=%d", block, (i+1), rel->rd_att->attrs[j]->attname.data, len)));
		
	}
	
	ereport(DEBUG3,(errmsg("[%d:%d] last attribute ends at %d, tuple ends at %d", block, (i+1), off, header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)));
	
	/* after the last attribute, the offset should be exactly the same as the end of the tuple */
	if (MAXALIGN(off) != header->pd_linp[i].lp_off + header->pd_linp[i].lp_len) {
		ereport(WARNING, (errmsg("[%d:%d] the last attribute ends at %d but the tuple ends at %d", block, (i+1), off, header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)));
		++nerrs;
	}
	
	return nerrs;

}