Example #1
/*
 * _bitmap_log_metapage() -- log the changes to the metapage
 */
void
_bitmap_log_metapage(Relation rel, Page page)
{
	BMMetaPage metapage = (BMMetaPage) PageGetContents(page);

	xl_bm_metapage*		xlMeta;
	XLogRecPtr			recptr;
	XLogRecData			rdata[1];

	// Fetch gp_persistent_relation_node information that will be added to XLOG record.
	RelationFetchGpRelationNodeForXLog(rel);

	xlMeta = (xl_bm_metapage *)
		palloc(MAXALIGN(sizeof(xl_bm_metapage)));
	xlMeta->bm_node = rel->rd_node;
	xlMeta->bm_persistentTid = rel->rd_relationnodeinfo.persistentTid;
	xlMeta->bm_persistentSerialNum = rel->rd_relationnodeinfo.persistentSerialNum;
	xlMeta->bm_lov_heapId = metapage->bm_lov_heapId;
	xlMeta->bm_lov_indexId = metapage->bm_lov_indexId;
	xlMeta->bm_lov_lastpage = metapage->bm_lov_lastpage;

	rdata[0].buffer = InvalidBuffer;
	rdata[0].data = (char*)xlMeta;
	rdata[0].len = MAXALIGN(sizeof(xl_bm_metapage));
	rdata[0].next = NULL;
			
	recptr = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_INSERT_META, rdata);

	PageSetLSN(page, recptr);
	PageSetTLI(page, ThisTimeLineID);
	pfree(xlMeta);
}
Example #2
/*
 * Returns the value at the root of a page.
 *
 * Since this is just a read-only access of a single byte, the page doesn't
 * need to be locked.
 */
uint8
fsm_get_max_avail(Page page)
{
	FSMPage		fsmpage = (FSMPage) PageGetContents(page);

	return fsmpage->fp_nodes[0];
}
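
For reference, the FSMPage handle that PageGetContents returns in these free-space-map examples points to a small struct: a search hint followed by the node array of the in-page binary tree. A sketch of the layout, paraphrased from PostgreSQL's fsm_internals.h (the exact declaration is an assumption and not part of the original listing):

typedef struct
{
	/* hint: where to start the next search for a free slot (updated unlocked) */
	int			fp_next_slot;

	/*
	 * The binary tree, stored as an array: the root is fp_nodes[0], the
	 * NonLeafNodesPerPage inner nodes come first, and the leaf nodes (one
	 * per heap page tracked by this FSM page) follow.  Variable length.
	 */
	uint8		fp_nodes[1];
} FSMPageData;

typedef FSMPageData *FSMPage;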
Example #3
/*
 *	visibilitymap_clear - clear a bit in visibility map
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 */
void
visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	int			mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	int			mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
	uint8		mask = 1 << mapBit;
	char	   *map;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
		elog(ERROR, "wrong buffer passed to visibilitymap_clear");

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	map = PageGetContents(BufferGetPage(buf));

	if (map[mapByte] & mask)
	{
		map[mapByte] &= ~mask;

		MarkBufferDirty(buf);
	}

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
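
The header comment requires the caller to pin the correct map page beforehand. A minimal caller sketch of that contract, using visibilitymap_pin and ReleaseBuffer from the same module (the helper itself is hypothetical and not part of the original listing):

static void
clear_vm_bit(Relation rel, BlockNumber heapBlk)
{
	Buffer		vmbuffer = InvalidBuffer;

	/* visibilitymap_pin does any I/O needed to bring in the right map page */
	visibilitymap_pin(rel, heapBlk, &vmbuffer);

	/* visibilitymap_clear then only flips the bit on the pinned page */
	visibilitymap_clear(rel, heapBlk, vmbuffer);

	if (BufferIsValid(vmbuffer))
		ReleaseBuffer(vmbuffer);
}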
Example #4
Datum
brin_metapage_info(PG_FUNCTION_ARGS)
{
    bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
    Page		page;
    BrinMetaPageData *meta;
    TupleDesc	tupdesc;
    Datum		values[4];
    bool		nulls[4];
    HeapTuple	htup;

    page = verify_brin_page(raw_page, BRIN_PAGETYPE_META, "metapage");

    /* Build a tuple descriptor for our result type */
    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");
    tupdesc = BlessTupleDesc(tupdesc);

    /* Extract values from the metapage */
    meta = (BrinMetaPageData *) PageGetContents(page);
    MemSet(nulls, 0, sizeof(nulls));
    values[0] = CStringGetTextDatum(psprintf("0x%08X", meta->brinMagic));
    values[1] = Int32GetDatum(meta->brinVersion);
    values[2] = Int32GetDatum(meta->pagesPerRange);
    values[3] = Int64GetDatum(meta->lastRevmapPage);

    htup = heap_form_tuple(tupdesc, values, nulls);

    PG_RETURN_DATUM(HeapTupleGetDatum(htup));
}
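
The BrinMetaPageData layout read here (and in the other BRIN examples) is approximately the following; this is paraphrased from PostgreSQL's brin_page.h and shown only as an aid to reading the examples:

typedef struct BrinMetaPageData
{
	uint32		brinMagic;		/* BRIN_META_MAGIC */
	uint32		brinVersion;	/* on-disk format version */
	BlockNumber pagesPerRange;	/* heap pages summarized per revmap entry */
	BlockNumber lastRevmapPage;	/* last block in use by the revmap */
} BrinMetaPageData;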
Example #5
/*
 * _bitmap_get_metapage_data() -- return the metadata info stored
 * in the given metapage buffer.
 */
BMMetaPage
_bitmap_get_metapage_data(Relation rel, Buffer metabuf)
{
	Page page;
	BMMetaPage metapage;
	
	page = BufferGetPage(metabuf);
	metapage = (BMMetaPage)PageGetContents(page);

	/*
	 * If this metapage is from a pre-3.4 version of the bitmap index,
	 * report that a REINDEX is required and error out.
	 */
	if (metapage->bm_version != BITMAP_VERSION)
	{
		ereport(ERROR,
				(0,
				 errmsg("The disk format for %s is not valid for this version of "
						"Greenplum Database. Use REINDEX %s to update this index",
						RelationGetRelationName(rel), RelationGetRelationName(rel))));
	}

	return metapage;
}
Example #6
File: bufpage.c  Project: qiuyesuifeng/gpdb
/*
 * PageGetTempPage
 *		Get a temporary page in local memory for special processing
 */
Page
PageGetTempPage(Page page, Size specialSize)
{
	Size		pageSize;
	Size		size;
	Page		temp;
	PageHeader	thdr;

	pageSize = PageGetPageSize(page);
	temp = (Page) palloc(pageSize);
	thdr = (PageHeader) temp;

	/* copy old page in */
	memcpy(temp, page, pageSize);

	/* clear out the middle */
	size = pageSize - SizeOfPageHeaderData;
	size -= MAXALIGN(specialSize);
	MemSet(PageGetContents(thdr), 0, size);

	/* set high, low water marks */
	thdr->pd_lower = SizeOfPageHeaderData;
	thdr->pd_upper = pageSize - MAXALIGN(specialSize);

	return temp;
}
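
Callers normally rebuild the page contents in the temporary copy and then copy the result back. A hedged usage sketch, assuming the standard PageRestoreTempPage counterpart from bufpage.c (the helper and its surroundings are illustrative only):

static void
rewrite_page(Buffer buf)
{
	Page		origpage = BufferGetPage(buf);
	Page		temppage;

	/* emptied local copy that keeps the page header and special-space size */
	temppage = PageGetTempPage(origpage, PageGetSpecialSize(origpage));

	/* ... add the rearranged items to temppage here ... */

	/* copy the rebuilt contents back over the original and pfree the copy */
	PageRestoreTempPage(temppage, origpage);
	MarkBufferDirty(buf);
}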
Example #7
File: brin_xlog.c  Project: Brar/postgres
/*
 * Replay a revmap page extension
 */
static void
brin_xlog_revmap_extend(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_brin_revmap_extend *xlrec;
	Buffer		metabuf;
	Buffer		buf;
	Page		page;
	BlockNumber targetBlk;
	XLogRedoAction action;

	xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record);

	XLogRecGetBlockTag(record, 1, NULL, NULL, &targetBlk);
	Assert(xlrec->targetBlk == targetBlk);

	/* Update the metapage */
	action = XLogReadBufferForRedo(record, 0, &metabuf);
	if (action == BLK_NEEDS_REDO)
	{
		Page		metapg;
		BrinMetaPageData *metadata;

		metapg = BufferGetPage(metabuf);
		metadata = (BrinMetaPageData *) PageGetContents(metapg);

		Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1);
		metadata->lastRevmapPage = xlrec->targetBlk;

		PageSetLSN(metapg, lsn);

		/*
		 * Set pd_lower just past the end of the metadata.  This is essential,
		 * because without doing so, metadata will be lost if xlog.c
		 * compresses the page.  (We must do this here because pre-v11
		 * versions of PG did not set the metapage's pd_lower correctly, so a
		 * pg_upgraded index might contain the wrong value.)
		 */
		((PageHeader) metapg)->pd_lower =
			((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapg;

		MarkBufferDirty(metabuf);
	}

	/*
	 * Re-init the target block as a revmap page.  There's never a full-page
	 * image here.
	 */

	buf = XLogInitBufferForRedo(record, 1);
	page = (Page) BufferGetPage(buf);
	brin_page_init(page, BRIN_PAGETYPE_REVMAP);

	PageSetLSN(page, lsn);
	MarkBufferDirty(buf);

	UnlockReleaseBuffer(buf);
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}
Example #8
Datum
fsm_page_contents(PG_FUNCTION_ARGS)
{
	bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
	StringInfoData sinfo;
	FSMPage		fsmpage;
	int			i;

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 (errmsg("must be superuser to use raw page functions"))));

	fsmpage = (FSMPage) PageGetContents(VARDATA(raw_page));

	initStringInfo(&sinfo);

	for (i = 0; i < NodesPerPage; i++)
	{
		if (fsmpage->fp_nodes[i] != 0)
			appendStringInfo(&sinfo, "%d: %d\n", i, fsmpage->fp_nodes[i]);
	}
	appendStringInfo(&sinfo, "fp_next_slot: %d\n", fsmpage->fp_next_slot);

	PG_RETURN_TEXT_P(cstring_to_text_with_len(sinfo.data, sinfo.len));
}
Example #9
/*
 *	visibilitymap_set - set a bit on a previously pinned page
 *
 * recptr is the LSN of the heap page. The LSN of the visibility map page is
 * advanced to that, to make sure that the visibility map doesn't get flushed
 * to disk before the update to the heap page that made all tuples visible.
 *
 * This is an opportunistic function. It does nothing, unless *buf
 * contains the bit for heapBlk. Call visibilitymap_pin first to pin
 * the right map page. This function doesn't do any I/O.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
				  Buffer *buf)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
	Page		page;
	char	   *map;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	/* Check that we have the right page pinned */
	if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != mapBlock)
		return;

	page = BufferGetPage(*buf);
	map = PageGetContents(page);
	LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);

	if (!(map[mapByte] & (1 << mapBit)))
	{
		map[mapByte] |= (1 << mapBit);

		if (XLByteLT(PageGetLSN(page), recptr))
			PageSetLSN(page, recptr);
		PageSetTLI(page, ThisTimeLineID);
		MarkBufferDirty(*buf);
	}

	LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
}
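
The HEAPBLK_TO_* macros used throughout the visibility-map examples pack one all-visible bit per heap block into the map page body returned by PageGetContents. Their definitions in visibilitymap.c are roughly the following (quoted from memory for the pre-9.6 one-bit-per-block layout, so treat this as a sketch):

/* usable bytes per visibility map page */
#define MAPSIZE					(BLCKSZ - MAXALIGN(SizeOfPageHeaderData))

/* heap blocks covered by one byte and by one map page */
#define HEAPBLOCKS_PER_BYTE		8
#define HEAPBLOCKS_PER_PAGE		(MAPSIZE * HEAPBLOCKS_PER_BYTE)

/* map block, byte within the block, and bit within the byte for a heap block */
#define HEAPBLK_TO_MAPBLOCK(x)	((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x)	(((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_MAPBIT(x)	((x) % HEAPBLOCKS_PER_BYTE)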
Example #10
/*
 *	visibilitymap_clear - clear a bit in visibility map
 *
 * Clear a bit in the visibility map, marking that not all tuples are
 * visible to all transactions anymore.
 */
void
visibilitymap_clear(Relation rel, BlockNumber heapBlk)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	int			mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	int			mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
	uint8		mask = 1 << mapBit;
	Buffer		mapBuffer;
	char	   *map;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	mapBuffer = vm_readbuf(rel, mapBlock, false);
	if (!BufferIsValid(mapBuffer))
		return;					/* nothing to do */

	LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
	map = PageGetContents(BufferGetPage(mapBuffer));

	if (map[mapByte] & mask)
	{
		map[mapByte] &= ~mask;

		MarkBufferDirty(mapBuffer);
	}

	UnlockReleaseBuffer(mapBuffer);
}
Example #11
/*
 * Reconstructs the upper levels of a page. Returns true if the page
 * was modified.
 */
bool
fsm_rebuild_page(Page page)
{
	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
	bool		changed = false;
	int			nodeno;

	/*
	 * Start from the lowest non-leaf level, at last node, working our way
	 * backwards, through all non-leaf nodes at all levels, up to the root.
	 */
	for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--)
	{
		int			lchild = leftchild(nodeno);
		int			rchild = lchild + 1;
		uint8		newvalue = 0;

		/* The first few nodes we examine might have zero or one child. */
		if (lchild < NodesPerPage)
			newvalue = fsmpage->fp_nodes[lchild];

		if (rchild < NodesPerPage)
			newvalue = Max(newvalue,
						   fsmpage->fp_nodes[rchild]);

		if (fsmpage->fp_nodes[nodeno] != newvalue)
		{
			fsmpage->fp_nodes[nodeno] = newvalue;
			changed = true;
		}
	}

	return changed;
}
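
The leftchild()/parentof() helpers used here address fp_nodes as an implicit binary heap rooted at index 0, so parent/child positions are pure arithmetic. Their definitions in PostgreSQL's fsmpage.c are essentially the following (reproduced for reference, not part of the original listing):

#define leftchild(x)	(2 * (x) + 1)
#define rightchild(x)	(2 * (x) + 2)
#define parentof(x)		(((x) - 1) / 2)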
Example #12
/*
 * Returns the value of given slot on page.
 *
 * Since this is just a read-only access of a single byte, the page doesn't
 * need to be locked.
 */
uint8
fsm_get_avail(Page page, int slot)
{
	FSMPage		fsmpage = (FSMPage) PageGetContents(page);

	Assert(slot < LeafNodesPerPage);

	return fsmpage->fp_nodes[NonLeafNodesPerPage + slot];
}
Example #13
/*
 * Sets the value of a slot on page. Returns true if the page was modified.
 *
 * The caller must hold an exclusive lock on the page.
 */
bool
fsm_set_avail(Page page, int slot, uint8 value)
{
	int			nodeno = NonLeafNodesPerPage + slot;
	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
	uint8		oldvalue;

	Assert(slot < LeafNodesPerPage);

	oldvalue = fsmpage->fp_nodes[nodeno];

	/* If the value hasn't changed, we don't need to do anything */
	if (oldvalue == value && value <= fsmpage->fp_nodes[0])
		return false;

	fsmpage->fp_nodes[nodeno] = value;

	/*
	 * Propagate up, until we hit the root or a node that doesn't need to be
	 * updated.
	 */
	do
	{
		uint8		newvalue = 0;
		int			lchild;
		int			rchild;

		nodeno = parentof(nodeno);
		lchild = leftchild(nodeno);
		rchild = lchild + 1;

		newvalue = fsmpage->fp_nodes[lchild];
		if (rchild < NodesPerPage)
			newvalue = Max(newvalue,
						   fsmpage->fp_nodes[rchild]);

		oldvalue = fsmpage->fp_nodes[nodeno];
		if (oldvalue == newvalue)
			break;

		fsmpage->fp_nodes[nodeno] = newvalue;
	} while (nodeno > 0);

	/*
	 * sanity check: if the new value is (still) higher than the value at the
	 * top, the tree is corrupt.  If so, rebuild.
	 */
	if (value > fsmpage->fp_nodes[0])
		fsm_rebuild_page(page);

	return true;
}
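
Since fsm_set_avail requires an exclusive lock and merely reports whether anything changed, callers typically wrap it as below. This hypothetical helper mirrors the lock-then-dirty pattern visible in the fsm_vacuum_page example further down; MarkBufferDirtyHint is used because FSM updates are not WAL-logged:

static void
set_slot_avail(Buffer buf, int slot, uint8 value)
{
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	/* only dirty the buffer if the page actually changed */
	if (fsm_set_avail(BufferGetPage(buf), slot, value))
		MarkBufferDirtyHint(buf, false);

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}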
Example #14
/*
 * _bitmap_log_lovmetapage() -- log the lov meta page.
 */
void
_bitmap_log_lovmetapage(Relation rel, Buffer lovMetaBuffer, uint8 numOfAttrs)
{
	Page			lovMetapage;
	BMLOVMetaItem	metaItems;

	lovMetapage = BufferGetPage(lovMetaBuffer);
	metaItems = (BMLOVMetaItem)PageGetContents(lovMetapage);

	/* XLOG stuff */
	START_CRIT_SECTION();

	if (!(rel->rd_istemp))
	{
		BMLOVMetaItem	copyMetaItems;
		XLogRecPtr		recptr;
		XLogRecData		rdata[1];


		xl_bm_lovmetapage* xlLovMeta;

#ifdef BM_DEBUG
		elog(LOG, "call _bitmap_log_lovmetapage: numOfAttrs=%d", numOfAttrs);
#endif

		xlLovMeta = (xl_bm_lovmetapage*)
			palloc(sizeof(xl_bm_lovmetapage)+
					numOfAttrs*sizeof(BMLOVMetaItemData));

		xlLovMeta->bm_node = rel->rd_node;
		xlLovMeta->bm_num_of_attrs = numOfAttrs;

		copyMetaItems = (BMLOVMetaItem)
			(((char*)xlLovMeta) + sizeof(xl_bm_lovmetapage)); 
		memcpy(copyMetaItems, metaItems, numOfAttrs*sizeof(BMLOVMetaItemData));

		rdata[0].buffer = InvalidBuffer;
		rdata[0].data = (char*)xlLovMeta;
		rdata[0].len = 
			sizeof(xl_bm_lovmetapage) + numOfAttrs*sizeof(BMLOVMetaItemData);
		rdata[0].next = NULL;

		recptr = XLogInsert(RM_BITMAP_ID, 
							XLOG_BITMAP_INSERT_LOVMETA, rdata);

		PageSetLSN(lovMetapage, recptr);
		PageSetTLI(lovMetapage, ThisTimeLineID);
		pfree(xlLovMeta);
	}

	END_CRIT_SECTION();
}
Example #15
/*
 * Return the TID array stored in a BRIN revmap page
 */
Datum
brin_revmap_data(PG_FUNCTION_ARGS)
{
    struct
    {
        ItemPointerData *tids;
        int			idx;
    }		   *state;
    FuncCallContext *fctx;

    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 (errmsg("must be superuser to use raw page functions"))));

    if (SRF_IS_FIRSTCALL())
    {
        bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
        MemoryContext mctx;
        Page		page;

        /* minimally verify the page we got */
        page = verify_brin_page(raw_page, BRIN_PAGETYPE_REVMAP, "revmap");

        /* create a function context for cross-call persistence */
        fctx = SRF_FIRSTCALL_INIT();

        /* switch to memory context appropriate for multiple function calls */
        mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);

        state = palloc(sizeof(*state));
        state->tids = ((RevmapContents *) PageGetContents(page))->rm_tids;
        state->idx = 0;

        fctx->user_fctx = state;

        MemoryContextSwitchTo(mctx);
    }

    fctx = SRF_PERCALL_SETUP();
    state = fctx->user_fctx;

    if (state->idx < REVMAP_PAGE_MAXITEMS)
        SRF_RETURN_NEXT(fctx, PointerGetDatum(&state->tids[state->idx++]));

    SRF_RETURN_DONE(fctx);
}
Example #16
File: brin.c  Project: eubide/postgres
/*
 * Fetch index's statistical data into *stats
 */
void
brinGetStats(Relation index, BrinStatsData *stats)
{
	Buffer		metabuffer;
	Page		metapage;
	BrinMetaPageData *metadata;

	metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
	LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
	metapage = BufferGetPage(metabuffer);
	metadata = (BrinMetaPageData *) PageGetContents(metapage);

	stats->pagesPerRange = metadata->pagesPerRange;
	stats->revmapNumPages = metadata->lastRevmapPage - 1;

	UnlockReleaseBuffer(metabuffer);
}
Example #17
/*
 *	visibilitymap_set - set a bit on a previously pinned page
 *
 * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
 * or InvalidXLogRecPtr in normal running.  The page LSN is advanced to the
 * one provided; in normal running, we generate a new XLOG record and set the
 * page LSN to that value.  cutoff_xid is the largest xmin on the page being
 * marked all-visible; it is needed for Hot Standby, and can be
 * InvalidTransactionId if the page contains no tuples.
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
				  Buffer buf, TransactionId cutoff_xid)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
	Page		page;
	char	   *map;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	Assert(InRecovery || XLogRecPtrIsInvalid(recptr));

	/* Check that we have the right page pinned */
	if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
		elog(ERROR, "wrong buffer passed to visibilitymap_set");

	page = BufferGetPage(buf);
	map = PageGetContents(page);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	if (!(map[mapByte] & (1 << mapBit)))
	{
		START_CRIT_SECTION();

		map[mapByte] |= (1 << mapBit);
		MarkBufferDirty(buf);

		if (RelationNeedsWAL(rel))
		{
			if (XLogRecPtrIsInvalid(recptr))
				recptr = log_heap_visible(rel->rd_node, heapBlk, buf,
										  cutoff_xid);
			PageSetLSN(page, recptr);
			PageSetTLI(page, ThisTimeLineID);
		}

		END_CRIT_SECTION();
	}

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
Example #18
File: brin_revmap.c  Project: c2j/postgres
/*
 * In the given revmap buffer (locked appropriately by caller), which is used
 * in a BRIN index of pagesPerRange pages per range, set the element
 * corresponding to heap block number heapBlk to the given TID.
 *
 * Once the operation is complete, the caller must update the LSN on the
 * returned buffer.
 *
 * This is used both in regular operation and during WAL replay.
 */
void
brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
						BlockNumber heapBlk, ItemPointerData tid)
{
	RevmapContents *contents;
	ItemPointerData *iptr;
	Page		page;

	/* The correct page should already be pinned and locked */
	page = BufferGetPage(buf);
	contents = (RevmapContents *) PageGetContents(page);
	iptr = (ItemPointerData *) contents->rm_tids;
	iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);

	ItemPointerSet(iptr,
				   ItemPointerGetBlockNumber(&tid),
				   ItemPointerGetOffsetNumber(&tid));
}
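
HEAPBLK_TO_REVMAP_INDEX converts a range's first heap block number into a slot in the rm_tids array of the covering revmap page. Its definition in brin_revmap.c is approximately the following (quoted from memory, so treat it as an assumption):

#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \
	((heapBlk / pagesPerRange) / REVMAP_PAGE_MAXITEMS)
#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
	((heapBlk / pagesPerRange) % REVMAP_PAGE_MAXITEMS)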
Example #19
/*
 * Initialize a new BRIN index' metapage.
 */
void
brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
{
	BrinMetaPageData *metadata;

	brin_page_init(page, BRIN_PAGETYPE_META);

	metadata = (BrinMetaPageData *) PageGetContents(page);

	metadata->brinMagic = BRIN_META_MAGIC;
	metadata->brinVersion = version;
	metadata->pagesPerRange = pagesPerRange;

	/*
	 * Note we cheat here a little.  0 is not a valid revmap block number
	 * (because it's the metapage buffer), but doing this enables the first
	 * revmap page to be created when the index is.
	 */
	metadata->lastRevmapPage = 0;
}
Example #20
/*
 *	visibilitymap_test - test if a bit is set
 *
 * Are all tuples on heapBlk visible to all, according to the visibility map?
 *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
 * earlier call to visibilitymap_pin or visibilitymap_test on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 * releasing *buf after it's done testing and setting bits.
 *
 * NOTE: This function is typically called without a lock on the heap page,
 * so somebody else could change the bit just after we look at it.  In fact,
 * since we don't lock the visibility map page either, it's even possible that
 * someone else could have changed the bit just before we look at it, but yet
 * we might see the old value.  It is the caller's responsibility to deal with
 * all concurrency issues!
 */
bool
visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
	bool		result;
	char	   *map;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	/* Reuse the old pinned buffer if possible */
	if (BufferIsValid(*buf))
	{
		if (BufferGetBlockNumber(*buf) != mapBlock)
		{
			ReleaseBuffer(*buf);
			*buf = InvalidBuffer;
		}
	}

	if (!BufferIsValid(*buf))
	{
		*buf = vm_readbuf(rel, mapBlock, false);
		if (!BufferIsValid(*buf))
			return false;
	}

	map = PageGetContents(BufferGetPage(*buf));

	/*
	 * A single-bit read is atomic.  There could be memory-ordering effects
	 * here, but for performance reasons we make it the caller's job to worry
	 * about that.
	 */
	result = (map[mapByte] & (1 << mapBit)) ? true : false;

	return result;
}
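
The caller-side protocol from the header comment, in compressed form: start with InvalidBuffer, let visibilitymap_test swap in the right map page, and release the pin afterwards. A hypothetical helper, not part of the original listing:

static bool
block_is_all_visible(Relation rel, BlockNumber heapBlk)
{
	Buffer		vmbuffer = InvalidBuffer;
	bool		all_visible;

	all_visible = visibilitymap_test(rel, heapBlk, &vmbuffer);

	if (BufferIsValid(vmbuffer))
		ReleaseBuffer(vmbuffer);

	return all_visible;
}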
Example #21
/*
 *	visibilitymap_test - test if a bit is set
 *
 * Are all tuples on heapBlk visible to all, according to the visibility map?
 *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
 * earlier call to visibilitymap_pin or visibilitymap_test on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 * releasing *buf after it's done testing and setting bits.
 */
bool
visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
	bool		result;
	char	   *map;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	/* Reuse the old pinned buffer if possible */
	if (BufferIsValid(*buf))
	{
		if (BufferGetBlockNumber(*buf) != mapBlock)
		{
			ReleaseBuffer(*buf);
			*buf = InvalidBuffer;
		}
	}

	if (!BufferIsValid(*buf))
	{
		*buf = vm_readbuf(rel, mapBlock, false);
		if (!BufferIsValid(*buf))
			return false;
	}

	map = PageGetContents(BufferGetPage(*buf));

	/*
	 * We don't need to lock the page, as we're only looking at a single bit.
	 */
	result = (map[mapByte] & (1 << mapBit)) ? true : false;

	return result;
}
Example #22
/*
 *	visibilitymap_count	 - count number of bits set in visibility map
 *
 * Note: we ignore the possibility of race conditions when the table is being
 * extended concurrently with the call.  New pages added to the table aren't
 * going to be marked all-visible, so they won't affect the result.
 */
BlockNumber
visibilitymap_count(Relation rel)
{
	BlockNumber result = 0;
	BlockNumber mapBlock;

	for (mapBlock = 0; ; mapBlock++)
	{
		Buffer		mapBuffer;
		unsigned char *map;
		int			i;

		/*
		 * Read till we fall off the end of the map.  We assume that any
		 * extra bytes in the last page are zeroed, so we don't bother
		 * excluding them from the count.
		 */
		mapBuffer = vm_readbuf(rel, mapBlock, false);
		if (!BufferIsValid(mapBuffer))
			break;

		/*
		 * We choose not to lock the page, since the result is going to be
		 * immediately stale anyway if anyone is concurrently setting or
		 * clearing bits, and we only really need an approximate value.
		 */
		map = (unsigned char *) PageGetContents(BufferGetPage(mapBuffer));

		for (i = 0; i < MAPSIZE; i++)
		{
			result += number_of_ones[map[i]];
		}

		ReleaseBuffer(mapBuffer);
	}

	return result;
}
Example #23
File: brin_revmap.c  Project: c2j/postgres
/*
 * Initialize an access object for a range map.  This must be freed by
 * brinRevmapTerminate when caller is done with it.
 */
BrinRevmap *
brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
{
	BrinRevmap *revmap;
	Buffer		meta;
	BrinMetaPageData *metadata;

	meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO);
	LockBuffer(meta, BUFFER_LOCK_SHARE);
	metadata = (BrinMetaPageData *) PageGetContents(BufferGetPage(meta));

	revmap = palloc(sizeof(BrinRevmap));
	revmap->rm_irel = idxrel;
	revmap->rm_pagesPerRange = metadata->pagesPerRange;
	revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
	revmap->rm_metaBuf = meta;
	revmap->rm_currBuf = InvalidBuffer;

	*pagesPerRange = metadata->pagesPerRange;

	LockBuffer(meta, BUFFER_LOCK_UNLOCK);

	return revmap;
}
Example #24
/*
 * Sets the available space to zero for all slots numbered >= nslots.
 * Returns true if the page was modified.
 */
bool
fsm_truncate_avail(Page page, int nslots)
{
	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
	uint8	   *ptr;
	bool		changed = false;

	Assert(nslots >= 0 && nslots < LeafNodesPerPage);

	/* Clear all truncated leaf nodes */
	ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots];
	for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++)
	{
		if (*ptr != 0)
			changed = true;
		*ptr = 0;
	}

	/* Fix upper nodes. */
	if (changed)
		fsm_rebuild_page(page);

	return changed;
}
Example #25
/*
 *	visibilitymap_truncate - truncate the visibility map
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the VM again.
 *
 * nheapblocks is the new size of the heap.
 */
void
visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
{
	BlockNumber newnblocks;

	/* last remaining block, byte, and bit */
	BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
	uint32		truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
	uint8		truncBit = HEAPBLK_TO_MAPBIT(nheapblocks);

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
#endif

	RelationOpenSmgr(rel);

	/*
	 * If no visibility map has been created yet for this relation, there's
	 * nothing to truncate.
	 */
	if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
		return;

	/*
	 * Unless the new size is exactly at a visibility map page boundary, the
	 * tail bits in the last remaining map page, representing truncated heap
	 * blocks, need to be cleared. This is not only tidy, but also necessary
	 * because we don't get a chance to clear the bits if the heap is extended
	 * again.
	 */
	if (truncByte != 0 || truncBit != 0)
	{
		Buffer		mapBuffer;
		Page		page;
		char	   *map;

		newnblocks = truncBlock + 1;

		mapBuffer = vm_readbuf(rel, truncBlock, false);
		if (!BufferIsValid(mapBuffer))
		{
			/* nothing to do, the file was already smaller */
			return;
		}

		page = BufferGetPage(mapBuffer);
		map = PageGetContents(page);

		LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);

		/* Clear out the unwanted bytes. */
		MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));

		/*
		 * Mask out the unwanted bits of the last remaining byte.
		 *
		 * ((1 << 0) - 1) = 00000000 ((1 << 1) - 1) = 00000001 ... ((1 << 6) -
		 * 1) = 00111111 ((1 << 7) - 1) = 01111111
		 */
		map[truncByte] &= (1 << truncBit) - 1;

		MarkBufferDirty(mapBuffer);
		UnlockReleaseBuffer(mapBuffer);
	}
	else
		newnblocks = truncBlock;

	if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
	{
		/* nothing to do, the file was already smaller than requested size */
		return;
	}

	/* Truncate the unused VM pages, and send smgr inval message */
	smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);

	/*
	 * We might as well update the local smgr_vm_nblocks setting. smgrtruncate
	 * sent an smgr cache inval message, which will cause other backends to
	 * invalidate their copy of smgr_vm_nblocks, and this one too at the next
	 * command boundary.  But this ensures it isn't outright wrong until then.
	 */
	if (rel->rd_smgr)
		rel->rd_smgr->smgr_vm_nblocks = newnblocks;
}
Example #26
/*
 * Recursive guts of FreeSpaceMapVacuum
 */
static uint8
fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
{
	Buffer		buf;
	Page		page;
	uint8		max_avail;

	/* Read the page if it exists, or return EOF */
	buf = fsm_readbuf(rel, addr, false);
	if (!BufferIsValid(buf))
	{
		*eof_p = true;
		return 0;
	}
	else
		*eof_p = false;

	page = BufferGetPage(buf);

	/*
	 * Recurse into children, and fix the information stored about them at
	 * this level.
	 */
	if (addr.level > FSM_BOTTOM_LEVEL)
	{
		int			slot;
		bool		eof = false;

		for (slot = 0; slot < SlotsPerFSMPage; slot++)
		{
			int			child_avail;

			CHECK_FOR_INTERRUPTS();

			/* After we hit end-of-file, just clear the rest of the slots */
			if (!eof)
				child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof);
			else
				child_avail = 0;

			/* Update information about the child */
			if (fsm_get_avail(page, slot) != child_avail)
			{
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				fsm_set_avail(BufferGetPage(buf), slot, child_avail);
				MarkBufferDirtyHint(buf, false);
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			}
		}
	}

	max_avail = fsm_get_max_avail(BufferGetPage(buf));

	/*
	 * Reset the next slot pointer. This encourages the use of low-numbered
	 * pages, increasing the chances that a later vacuum can truncate the
	 * relation.
	 */
	((FSMPage) PageGetContents(page))->fp_next_slot = 0;

	ReleaseBuffer(buf);

	return max_avail;
}
Example #27
File: brin_revmap.c  Project: c2j/postgres
/*
 * Try to extend the revmap by one page.  This might not happen for a number of
 * reasons; caller is expected to retry until the expected outcome is obtained.
 */
static void
revmap_physical_extend(BrinRevmap *revmap)
{
	Buffer		buf;
	Page		page;
	Page		metapage;
	BrinMetaPageData *metadata;
	BlockNumber mapBlk;
	BlockNumber nblocks;
	Relation	irel = revmap->rm_irel;
	bool		needLock = !RELATION_IS_LOCAL(irel);

	/*
	 * Lock the metapage. This locks out concurrent extensions of the revmap,
	 * but note that we still need to grab the relation extension lock because
	 * another backend can extend the index with regular BRIN pages.
	 */
	LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE);
	metapage = BufferGetPage(revmap->rm_metaBuf);
	metadata = (BrinMetaPageData *) PageGetContents(metapage);

	/*
	 * Check that our cached lastRevmapPage value was up-to-date; if it
	 * wasn't, update the cached copy and have caller start over.
	 */
	if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage)
	{
		revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
		LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
		return;
	}
	mapBlk = metadata->lastRevmapPage + 1;

	nblocks = RelationGetNumberOfBlocks(irel);
	if (mapBlk < nblocks)
	{
		buf = ReadBuffer(irel, mapBlk);
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);
	}
	else
	{
		if (needLock)
			LockRelationForExtension(irel, ExclusiveLock);

		buf = ReadBuffer(irel, P_NEW);
		if (BufferGetBlockNumber(buf) != mapBlk)
		{
			/*
			 * Very rare corner case: somebody extended the relation
			 * concurrently after we read its length.  If this happens, give
			 * up and have caller start over.  We will have to evacuate that
			 * page from under whoever is using it.
			 */
			if (needLock)
				UnlockRelationForExtension(irel, ExclusiveLock);
			LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
			return;
		}
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);

		if (needLock)
			UnlockRelationForExtension(irel, ExclusiveLock);
	}

	/* Check that it's a regular block (or an empty page) */
	if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page))
		ereport(ERROR,
				(errcode(ERRCODE_INDEX_CORRUPTED),
				 errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u",
						BRIN_PAGE_TYPE(page),
						RelationGetRelationName(irel),
						BufferGetBlockNumber(buf))));

	/* If the page is in use, evacuate it and restart */
	if (brin_start_evacuating_page(irel, buf))
	{
		LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
		brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf);

		/* have caller start over */
		return;
	}

	/*
	 * Ok, we have now locked the metapage and the target block. Re-initialize
	 * it as a revmap page.
	 */
	START_CRIT_SECTION();

	/* the rm_tids array is initialized to all invalid by PageInit */
	brin_page_init(page, BRIN_PAGETYPE_REVMAP);
	MarkBufferDirty(buf);

	metadata->lastRevmapPage = mapBlk;
	MarkBufferDirty(revmap->rm_metaBuf);

	if (RelationNeedsWAL(revmap->rm_irel))
	{
		xl_brin_revmap_extend xlrec;
		XLogRecPtr	recptr;
		XLogRecData rdata[2];

		xlrec.node = revmap->rm_irel->rd_node;
		xlrec.targetBlk = mapBlk;
		rdata[0].data = (char *) &xlrec;
		rdata[0].len = SizeOfBrinRevmapExtend;
		rdata[0].buffer = InvalidBuffer;
		rdata[0].buffer_std = false;
		rdata[0].next = &(rdata[1]);

		rdata[1].data = (char *) NULL;
		rdata[1].len = 0;
		rdata[1].buffer = revmap->rm_metaBuf;
		rdata[1].buffer_std = false;
		rdata[1].next = NULL;

		recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND, rdata);
		PageSetLSN(metapage, recptr);
		PageSetLSN(page, recptr);
	}

	END_CRIT_SECTION();

	LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);

	UnlockReleaseBuffer(buf);
}
Example #28
File: brin_revmap.c  Project: c2j/postgres
/*
 * Fetch the BrinTuple for a given heap block.
 *
 * The buffer containing the tuple is locked, and returned in *buf. As an
 * optimization, the caller can pass a pinned buffer *buf on entry, which will
 * avoid a pin-unpin cycle when the next tuple is on the same page as a
 * previous one.
 *
 * If no tuple is found for the given heap range, returns NULL. In that case,
 * *buf might still be updated, but it's not locked.
 *
 * The output tuple offset within the buffer is returned in *off, and its size
 * is returned in *size.
 */
BrinTuple *
brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
						 Buffer *buf, OffsetNumber *off, Size *size, int mode)
{
	Relation	idxRel = revmap->rm_irel;
	BlockNumber mapBlk;
	RevmapContents *contents;
	ItemPointerData *iptr;
	BlockNumber blk;
	Page		page;
	ItemId		lp;
	BrinTuple  *tup;
	ItemPointerData previptr;

	/* normalize the heap block number to be the first page in the range */
	heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;

	/* Compute the revmap page number we need */
	mapBlk = revmap_get_blkno(revmap, heapBlk);
	if (mapBlk == InvalidBlockNumber)
	{
		*off = InvalidOffsetNumber;
		return NULL;
	}

	ItemPointerSetInvalid(&previptr);
	for (;;)
	{
		CHECK_FOR_INTERRUPTS();

		if (revmap->rm_currBuf == InvalidBuffer ||
			BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk)
		{
			if (revmap->rm_currBuf != InvalidBuffer)
				ReleaseBuffer(revmap->rm_currBuf);

			Assert(mapBlk != InvalidBlockNumber);
			revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
		}

		LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE);

		contents = (RevmapContents *)
			PageGetContents(BufferGetPage(revmap->rm_currBuf));
		iptr = contents->rm_tids;
		iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);

		if (!ItemPointerIsValid(iptr))
		{
			LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
			return NULL;
		}

		/*
		 * Check the TID we got in a previous iteration, if any, and save the
		 * current TID we got from the revmap; if we loop, we can sanity-check
		 * that the next one we get is different.  Otherwise we might be stuck
		 * looping forever if the revmap is somehow badly broken.
		 */
		if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr))
			ereport(ERROR,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("corrupted BRIN index: inconsistent range map")));
		previptr = *iptr;

		blk = ItemPointerGetBlockNumber(iptr);
		*off = ItemPointerGetOffsetNumber(iptr);

		LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);

		/* Ok, got a pointer to where the BrinTuple should be. Fetch it. */
		if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk)
		{
			if (BufferIsValid(*buf))
				ReleaseBuffer(*buf);
			*buf = ReadBuffer(idxRel, blk);
		}
		LockBuffer(*buf, mode);
		page = BufferGetPage(*buf);

		/* If we land on a revmap page, start over */
		if (BRIN_IS_REGULAR_PAGE(page))
		{
			lp = PageGetItemId(page, *off);
			if (ItemIdIsUsed(lp))
			{
				tup = (BrinTuple *) PageGetItem(page, lp);

				if (tup->bt_blkno == heapBlk)
				{
					if (size)
						*size = ItemIdGetLength(lp);
					/* found it! */
					return tup;
				}
			}
		}

		/*
		 * No luck. Assume that the revmap was updated concurrently.
		 */
		LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
	}
	/* not reached, but keep compiler quiet */
	return NULL;
}
Example #29
/*
 *	visibilitymap_set - set a bit on a previously pinned page
 *
 * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
 * or InvalidXLogRecPtr in normal running.  The page LSN is advanced to the
 * one provided; in normal running, we generate a new XLOG record and set the
 * page LSN to that value.  cutoff_xid is the largest xmin on the page being
 * marked all-visible; it is needed for Hot Standby, and can be
 * InvalidTransactionId if the page contains no tuples.
 *
 * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
 * this function. Except in recovery, caller should also pass the heap
 * buffer. When checksums are enabled and we're not in recovery, we must add
 * the heap buffer to the WAL chain to protect it from being torn.
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
				  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid)
{
	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
	uint32		mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
	uint8		mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
	Page		page;
	char	   *map;

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

	Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
	Assert(InRecovery || BufferIsValid(heapBuf));

	/* Check that we have the right heap page pinned, if present */
	if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
		elog(ERROR, "wrong heap buffer passed to visibilitymap_set");

	/* Check that we have the right VM page pinned */
	if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
		elog(ERROR, "wrong VM buffer passed to visibilitymap_set");

	page = BufferGetPage(vmBuf);
	map = PageGetContents(page);
	LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);

	if (!(map[mapByte] & (1 << mapBit)))
	{
		START_CRIT_SECTION();

		map[mapByte] |= (1 << mapBit);
		MarkBufferDirty(vmBuf);

		if (RelationNeedsWAL(rel))
		{
			if (XLogRecPtrIsInvalid(recptr))
			{
				Assert(!InRecovery);
				recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
										  cutoff_xid);

				/*
				 * If data checksums are enabled (or wal_log_hints=on), we
				 * need to protect the heap page from being torn.
				 */
				if (XLogHintBitIsNeeded())
				{
					Page		heapPage = BufferGetPage(heapBuf);

					/* caller is expected to set PD_ALL_VISIBLE first */
					Assert(PageIsAllVisible(heapPage));
					PageSetLSN(heapPage, recptr);
				}
			}
			PageSetLSN(page, recptr);
		}

		END_CRIT_SECTION();
	}

	LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
}
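
Per the header comment, the caller sets PD_ALL_VISIBLE on the (exclusively locked) heap page first and passes both buffers in. A condensed sketch of that calling sequence, loosely modeled on the vacuum code path; the helper, its parameters, and the assumption that vmBuf was pinned earlier with visibilitymap_pin are all illustrative:

static void
mark_page_all_visible(Relation rel, Buffer heapBuf, Buffer vmBuf,
					  BlockNumber heapBlk, TransactionId cutoff_xid)
{
	Page		heapPage = BufferGetPage(heapBuf);

	if (!PageIsAllVisible(heapPage))
	{
		PageSetAllVisible(heapPage);	/* caller's responsibility, per the comment */
		MarkBufferDirty(heapBuf);
		visibilitymap_set(rel, heapBlk, heapBuf, InvalidXLogRecPtr,
						  vmBuf, cutoff_xid);
	}
}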
Example #30
/*
 * Searches for a slot with category at least minvalue.
 * Returns slot number, or -1 if none found.
 *
 * The caller must hold at least a shared lock on the page, and this
 * function can unlock and lock the page again in exclusive mode if it
 * needs to be updated. exclusive_lock_held should be set to true if the
 * caller is already holding an exclusive lock, to avoid extra work.
 *
 * If advancenext is false, fp_next_slot is set to point to the returned
 * slot, and if it's true, to the slot after the returned slot.
 */
int
fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext,
				 bool exclusive_lock_held)
{
	Page		page = BufferGetPage(buf);
	FSMPage		fsmpage = (FSMPage) PageGetContents(page);
	int			nodeno;
	int			target;
	uint16		slot;

restart:

	/*
	 * Check the root first, and exit quickly if there's no leaf with enough
	 * free space
	 */
	if (fsmpage->fp_nodes[0] < minvalue)
		return -1;

	/*
	 * Start search using fp_next_slot.  It's just a hint, so check that it's
	 * sane.  (This also handles wrapping around when the prior call returned
	 * the last slot on the page.)
	 */
	target = fsmpage->fp_next_slot;
	if (target < 0 || target >= LeafNodesPerPage)
		target = 0;
	target += NonLeafNodesPerPage;

	/*----------
	 * Start the search from the target slot.  At every step, move one
	 * node to the right, then climb up to the parent.	Stop when we reach
	 * a node with enough free space (as we must, since the root has enough
	 * space).
	 *
	 * The idea is to gradually expand our "search triangle", that is, all
	 * nodes covered by the current node, and to be sure we search to the
	 * right from the start point.	At the first step, only the target slot
	 * is examined.  When we move up from a left child to its parent, we are
	 * adding the right-hand subtree of that parent to the search triangle.
	 * When we move right then up from a right child, we are dropping the
	 * current search triangle (which we know doesn't contain any suitable
	 * page) and instead looking at the next-larger-size triangle to its
	 * right.  So we never look left from our original start point, and at
	 * each step the size of the search triangle doubles, ensuring it takes
	 * only log2(N) work to search N pages.
	 *
	 * The "move right" operation will wrap around if it hits the right edge
	 * of the tree, so the behavior is still good if we start near the right.
	 * Note also that the move-and-climb behavior ensures that we can't end
	 * up on one of the missing nodes at the right of the leaf level.
	 *
	 * For example, consider this tree:
	 *
	 *		   7
	 *	   7	   6
	 *	 5	 7	 6	 5
	 *	4 5 5 7 2 6 5 2
	 *				T
	 *
	 * Assume that the target node is the node indicated by the letter T,
	 * and we're searching for a node with value of 6 or higher. The search
	 * begins at T. At the first iteration, we move to the right, then to the
	 * parent, arriving at the rightmost 5. At the second iteration, we move
	 * to the right, wrapping around, then climb up, arriving at the 7 on the
	 * third level.  7 satisfies our search, so we descend down to the bottom,
	 * following the path of sevens.  This is in fact the first suitable page
	 * to the right of (allowing for wraparound) our start point.
	 *----------
	 */
	nodeno = target;
	while (nodeno > 0)
	{
		if (fsmpage->fp_nodes[nodeno] >= minvalue)
			break;

		/*
		 * Move to the right, wrapping around on same level if necessary, then
		 * climb up.
		 */
		nodeno = parentof(rightneighbor(nodeno));
	}

	/*
	 * We're now at a node with enough free space, somewhere in the middle of
	 * the tree. Descend to the bottom, following a path with enough free
	 * space, preferring to move left if there's a choice.
	 */
	while (nodeno < NonLeafNodesPerPage)
	{
		int			childnodeno = leftchild(nodeno);

		if (childnodeno < NodesPerPage &&
			fsmpage->fp_nodes[childnodeno] >= minvalue)
		{
			nodeno = childnodeno;
			continue;
		}
		childnodeno++;			/* point to right child */
		if (childnodeno < NodesPerPage &&
			fsmpage->fp_nodes[childnodeno] >= minvalue)
		{
			nodeno = childnodeno;
		}
		else
		{
			/*
			 * Oops. The parent node promised that either left or right child
			 * has enough space, but neither actually did. This can happen in
			 * case of a "torn page", IOW if we crashed earlier while writing
			 * the page to disk, and only part of the page made it to disk.
			 *
			 * Fix the corruption and restart.
			 */
			RelFileNode rnode;
			ForkNumber	forknum;
			BlockNumber blknum;

			BufferGetTag(buf, &rnode, &forknum, &blknum);
			elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
				 blknum, rnode.spcNode, rnode.dbNode, rnode.relNode);

			/* make sure we hold an exclusive lock */
			if (!exclusive_lock_held)
			{
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
				LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
				exclusive_lock_held = true;
			}
			fsm_rebuild_page(page);
			MarkBufferDirty(buf);
			goto restart;
		}
	}

	/* We're now at the bottom level, at a node with enough space. */
	slot = nodeno - NonLeafNodesPerPage;

	/*
	 * Update the next-target pointer. Note that we do this even if we're only
	 * holding a shared lock, on the grounds that it's better to use a shared
	 * lock and get a garbled next pointer every now and then, than take the
	 * concurrency hit of an exclusive lock.
	 *
	 * Wrap-around is handled at the beginning of this function.
	 */
	fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0);

	return slot;
}