/*
 * _bitmap_log_metapage() -- log the changes to the metapage
 */
void
_bitmap_log_metapage(Relation rel, Page page)
{
    BMMetaPage  metapage = (BMMetaPage) PageGetContents(page);
    xl_bm_metapage *xlMeta;
    XLogRecPtr  recptr;
    XLogRecData rdata[1];

    // Fetch gp_persistent_relation_node information that will be added to XLOG record.
    RelationFetchGpRelationNodeForXLog(rel);

    xlMeta = (xl_bm_metapage *) palloc(MAXALIGN(sizeof(xl_bm_metapage)));
    xlMeta->bm_node = rel->rd_node;
    xlMeta->bm_persistentTid = rel->rd_relationnodeinfo.persistentTid;
    xlMeta->bm_persistentSerialNum = rel->rd_relationnodeinfo.persistentSerialNum;
    xlMeta->bm_lov_heapId = metapage->bm_lov_heapId;
    xlMeta->bm_lov_indexId = metapage->bm_lov_indexId;
    xlMeta->bm_lov_lastpage = metapage->bm_lov_lastpage;

    rdata[0].buffer = InvalidBuffer;
    rdata[0].data = (char *) xlMeta;
    rdata[0].len = MAXALIGN(sizeof(xl_bm_metapage));
    rdata[0].next = NULL;

    recptr = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_INSERT_META, rdata);

    PageSetLSN(page, recptr);
    PageSetTLI(page, ThisTimeLineID);
    pfree(xlMeta);
}
/*
 * Returns the value at the root of a page.
 *
 * Since this is just a read-only access of a single byte, the page doesn't
 * need to be locked.
 */
uint8
fsm_get_max_avail(Page page)
{
    FSMPage     fsmpage = (FSMPage) PageGetContents(page);

    return fsmpage->fp_nodes[0];
}
/*
 * visibilitymap_clear - clear a bit in visibility map
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 */
void
visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    int         mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    int         mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    uint8       mask = 1 << mapBit;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
        elog(ERROR, "wrong buffer passed to visibilitymap_clear");

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    map = PageGetContents(BufferGetPage(buf));

    if (map[mapByte] & mask)
    {
        map[mapByte] &= ~mask;

        MarkBufferDirty(buf);
    }

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
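The HEAPBLK_TO_MAPBLOCK/MAPBYTE/MAPBIT macros used throughout the visibility-map routines in this listing are not shown here. In the single-bit-per-heap-page map they reduce to simple division and modulo arithmetic over the usable area of a map page; the definitions below are a sketch for orientation, not copied from this listing.

/*
 * Sketch (assumed, not taken from the listing) of the address-translation
 * macros used above: one visibility bit per heap block, packed into the
 * content area of each map page.
 */
#define MAPSIZE                 (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
#define HEAPBLOCKS_PER_BYTE     8
#define HEAPBLOCKS_PER_PAGE     (MAPSIZE * HEAPBLOCKS_PER_BYTE)

#define HEAPBLK_TO_MAPBLOCK(x)  ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x)   (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_MAPBIT(x)    ((x) % HEAPBLOCKS_PER_BYTE)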
Datum
brin_metapage_info(PG_FUNCTION_ARGS)
{
    bytea      *raw_page = PG_GETARG_BYTEA_P(0);
    Page        page;
    BrinMetaPageData *meta;
    TupleDesc   tupdesc;
    Datum       values[4];
    bool        nulls[4];
    HeapTuple   htup;

    page = verify_brin_page(raw_page, BRIN_PAGETYPE_META, "metapage");

    /* Build a tuple descriptor for our result type */
    if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
        elog(ERROR, "return type must be a row type");
    tupdesc = BlessTupleDesc(tupdesc);

    /* Extract values from the metapage */
    meta = (BrinMetaPageData *) PageGetContents(page);
    MemSet(nulls, 0, sizeof(nulls));

    values[0] = CStringGetTextDatum(psprintf("0x%08X", meta->brinMagic));
    values[1] = Int32GetDatum(meta->brinVersion);
    values[2] = Int32GetDatum(meta->pagesPerRange);
    values[3] = Int64GetDatum(meta->lastRevmapPage);

    htup = heap_form_tuple(tupdesc, values, nulls);

    PG_RETURN_DATUM(HeapTupleGetDatum(htup));
}
/*
 * _bitmap_get_metapage_data() -- return the metadata info stored
 * in the given metapage buffer.
 */
BMMetaPage
_bitmap_get_metapage_data(Relation rel, Buffer metabuf)
{
    Page        page;
    BMMetaPage  metapage;

    page = BufferGetPage(metabuf);
    metapage = (BMMetaPage) PageGetContents(page);

    /*
     * If this metapage is from a pre-3.4 version of the bitmap index,
     * print a message telling the user to reindex, and error out.
     */
    if (metapage->bm_version != BITMAP_VERSION)
    {
        ereport(ERROR,
                (0, errmsg("The disk format for %s is not valid for this version of "
                           "Greenplum Database. Use REINDEX %s to update this index",
                           RelationGetRelationName(rel),
                           RelationGetRelationName(rel))));
    }

    return metapage;
}
/*
 * PageGetTempPage
 *      Get a temporary page in local memory for special processing
 */
Page
PageGetTempPage(Page page, Size specialSize)
{
    Size        pageSize;
    Size        size;
    Page        temp;
    PageHeader  thdr;

    pageSize = PageGetPageSize(page);
    temp = (Page) palloc(pageSize);
    thdr = (PageHeader) temp;

    /* copy old page in */
    memcpy(temp, page, pageSize);

    /* clear out the middle */
    size = pageSize - SizeOfPageHeaderData;
    size -= MAXALIGN(specialSize);
    MemSet(PageGetContents(thdr), 0, size);

    /* set high, low water marks */
    thdr->pd_lower = SizeOfPageHeaderData;
    thdr->pd_upper = pageSize - MAXALIGN(specialSize);

    return temp;
}
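This variant of PageGetTempPage returns a scrubbed copy of the page, which supports a copy-modify-restore pattern. A hypothetical caller sketch, assuming the usual PageRestoreTempPage()/PageGetSpecialSize() helpers; the item-filtering step and the function name are placeholders:

/*
 * Hypothetical usage sketch: build a scrubbed copy of a page, re-add the
 * items we want to keep, then copy the result back over the original.
 * PageRestoreTempPage() is assumed to copy the temp page back and pfree it;
 * the PageAddItem() step is only indicated as a comment.
 */
static void
compact_page(Buffer buf)
{
    Page        page = BufferGetPage(buf);
    Page        temp = PageGetTempPage(page, PageGetSpecialSize(page));

    /* ... PageAddItem() the surviving tuples into temp here ... */

    PageRestoreTempPage(temp, page);
}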
/*
 * Replay a revmap page extension
 */
static void
brin_xlog_revmap_extend(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_brin_revmap_extend *xlrec;
    Buffer      metabuf;
    Buffer      buf;
    Page        page;
    BlockNumber targetBlk;
    XLogRedoAction action;

    xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record);

    XLogRecGetBlockTag(record, 1, NULL, NULL, &targetBlk);
    Assert(xlrec->targetBlk == targetBlk);

    /* Update the metapage */
    action = XLogReadBufferForRedo(record, 0, &metabuf);
    if (action == BLK_NEEDS_REDO)
    {
        Page        metapg;
        BrinMetaPageData *metadata;

        metapg = BufferGetPage(metabuf);
        metadata = (BrinMetaPageData *) PageGetContents(metapg);

        Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1);
        metadata->lastRevmapPage = xlrec->targetBlk;

        PageSetLSN(metapg, lsn);

        /*
         * Set pd_lower just past the end of the metadata. This is essential,
         * because without doing so, metadata will be lost if xlog.c
         * compresses the page. (We must do this here because pre-v11
         * versions of PG did not set the metapage's pd_lower correctly, so a
         * pg_upgraded index might contain the wrong value.)
         */
        ((PageHeader) metapg)->pd_lower =
            ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapg;

        MarkBufferDirty(metabuf);
    }

    /*
     * Re-init the target block as a revmap page. There's never a full-page
     * image here.
     */
    buf = XLogInitBufferForRedo(record, 1);
    page = (Page) BufferGetPage(buf);
    brin_page_init(page, BRIN_PAGETYPE_REVMAP);

    PageSetLSN(page, lsn);
    MarkBufferDirty(buf);

    UnlockReleaseBuffer(buf);
    if (BufferIsValid(metabuf))
        UnlockReleaseBuffer(metabuf);
}
Datum
fsm_page_contents(PG_FUNCTION_ARGS)
{
    bytea      *raw_page = PG_GETARG_BYTEA_P(0);
    StringInfoData sinfo;
    FSMPage     fsmpage;
    int         i;

    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 (errmsg("must be superuser to use raw page functions"))));

    fsmpage = (FSMPage) PageGetContents(VARDATA(raw_page));

    initStringInfo(&sinfo);

    for (i = 0; i < NodesPerPage; i++)
    {
        if (fsmpage->fp_nodes[i] != 0)
            appendStringInfo(&sinfo, "%d: %d\n", i, fsmpage->fp_nodes[i]);
    }
    appendStringInfo(&sinfo, "fp_next_slot: %d\n", fsmpage->fp_next_slot);

    PG_RETURN_TEXT_P(cstring_to_text_with_len(sinfo.data, sinfo.len));
}
/*
 * visibilitymap_set - set a bit on a previously pinned page
 *
 * recptr is the LSN of the heap page. The LSN of the visibility map page is
 * advanced to that, to make sure that the visibility map doesn't get flushed
 * to disk before the update to the heap page that made all tuples visible.
 *
 * This is an opportunistic function. It does nothing, unless *buf
 * contains the bit for heapBlk. Call visibilitymap_pin first to pin
 * the right map page. This function doesn't do any I/O.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
                  Buffer *buf)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    uint32      mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    uint8       mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    Page        page;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    /* Check that we have the right page pinned */
    if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != mapBlock)
        return;

    page = BufferGetPage(*buf);
    map = PageGetContents(page);
    LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);

    if (!(map[mapByte] & (1 << mapBit)))
    {
        map[mapByte] |= (1 << mapBit);

        if (XLByteLT(PageGetLSN(page), recptr))
            PageSetLSN(page, recptr);
        PageSetTLI(page, ThisTimeLineID);
        MarkBufferDirty(*buf);
    }

    LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
}
/*
 * visibilitymap_clear - clear a bit in visibility map
 *
 * Clear a bit in the visibility map, marking that not all tuples are
 * visible to all transactions anymore.
 */
void
visibilitymap_clear(Relation rel, BlockNumber heapBlk)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    int         mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    int         mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    uint8       mask = 1 << mapBit;
    Buffer      mapBuffer;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    mapBuffer = vm_readbuf(rel, mapBlock, false);
    if (!BufferIsValid(mapBuffer))
        return;                 /* nothing to do */

    LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
    map = PageGetContents(BufferGetPage(mapBuffer));

    if (map[mapByte] & mask)
    {
        map[mapByte] &= ~mask;

        MarkBufferDirty(mapBuffer);
    }

    UnlockReleaseBuffer(mapBuffer);
}
/*
 * Reconstructs the upper levels of a page. Returns true if the page
 * was modified.
 */
bool
fsm_rebuild_page(Page page)
{
    FSMPage     fsmpage = (FSMPage) PageGetContents(page);
    bool        changed = false;
    int         nodeno;

    /*
     * Start from the lowest non-leaf level, at last node, working our way
     * backwards, through all non-leaf nodes at all levels, up to the root.
     */
    for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--)
    {
        int         lchild = leftchild(nodeno);
        int         rchild = lchild + 1;
        uint8       newvalue = 0;

        /* The first few nodes we examine might have zero or one child. */
        if (lchild < NodesPerPage)
            newvalue = fsmpage->fp_nodes[lchild];

        if (rchild < NodesPerPage)
            newvalue = Max(newvalue, fsmpage->fp_nodes[rchild]);

        if (fsmpage->fp_nodes[nodeno] != newvalue)
        {
            fsmpage->fp_nodes[nodeno] = newvalue;
            changed = true;
        }
    }

    return changed;
}
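The tree-navigation helpers used by the FSM page code in this listing (leftchild, parentof, rightneighbor, used here and in fsm_set_avail and fsm_search_avail below) are plain binary-heap index arithmetic over the fp_nodes array, with the root at index 0. The definitions below are an illustrative sketch under that assumption, not copied from this listing.

/*
 * Sketch of the binary-heap index helpers assumed by the FSM page routines:
 * node 0 is the root, a node at index x has children at 2x+1 and 2x+2, and
 * the leaf nodes occupy the tail of fp_nodes. Illustrative only.
 */
#define leftchild(x)    (2 * (x) + 1)
#define rightchild(x)   (2 * (x) + 2)
#define parentof(x)     (((x) - 1) / 2)

/*
 * Step to the node immediately to the right on the same level, wrapping
 * around to the leftmost node of the level when we walk off its right edge
 * (illustrative sketch).
 */
static int
rightneighbor(int x)
{
    x++;                        /* move right; may step onto the next level */

    /*
     * The leftmost node of each level has an index of the form 2^n - 1, so
     * if (x + 1) is now a power of two we walked off the right edge; wrap
     * back to the leftmost node of the original level.
     */
    if (((x + 1) & x) == 0)
        x /= 2;

    return x;
}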
/*
 * Returns the value of given slot on page.
 *
 * Since this is just a read-only access of a single byte, the page doesn't
 * need to be locked.
 */
uint8
fsm_get_avail(Page page, int slot)
{
    FSMPage     fsmpage = (FSMPage) PageGetContents(page);

    Assert(slot < LeafNodesPerPage);

    return fsmpage->fp_nodes[NonLeafNodesPerPage + slot];
}
/*
 * Sets the value of a slot on page. Returns true if the page was modified.
 *
 * The caller must hold an exclusive lock on the page.
 */
bool
fsm_set_avail(Page page, int slot, uint8 value)
{
    int         nodeno = NonLeafNodesPerPage + slot;
    FSMPage     fsmpage = (FSMPage) PageGetContents(page);
    uint8       oldvalue;

    Assert(slot < LeafNodesPerPage);

    oldvalue = fsmpage->fp_nodes[nodeno];

    /* If the value hasn't changed, we don't need to do anything */
    if (oldvalue == value && value <= fsmpage->fp_nodes[0])
        return false;

    fsmpage->fp_nodes[nodeno] = value;

    /*
     * Propagate up, until we hit the root or a node that doesn't need to be
     * updated.
     */
    do
    {
        uint8       newvalue = 0;
        int         lchild;
        int         rchild;

        nodeno = parentof(nodeno);
        lchild = leftchild(nodeno);
        rchild = lchild + 1;

        newvalue = fsmpage->fp_nodes[lchild];
        if (rchild < NodesPerPage)
            newvalue = Max(newvalue, fsmpage->fp_nodes[rchild]);

        oldvalue = fsmpage->fp_nodes[nodeno];
        if (oldvalue == newvalue)
            break;

        fsmpage->fp_nodes[nodeno] = newvalue;
    } while (nodeno > 0);

    /*
     * sanity check: if the new value is (still) higher than the value at the
     * top, the tree is corrupt. If so, rebuild.
     */
    if (value > fsmpage->fp_nodes[0])
        fsm_rebuild_page(page);

    return true;
}
/*
 * _bitmap_log_lovmetapage() -- log the lov meta page.
 */
void
_bitmap_log_lovmetapage(Relation rel, Buffer lovMetaBuffer, uint8 numOfAttrs)
{
    Page            lovMetapage;
    BMLOVMetaItem   metaItems;

    lovMetapage = BufferGetPage(lovMetaBuffer);
    metaItems = (BMLOVMetaItem) PageGetContents(lovMetapage);

    /* XLOG stuff */
    START_CRIT_SECTION();

    if (!(rel->rd_istemp))
    {
        BMLOVMetaItem   copyMetaItems;
        XLogRecPtr      recptr;
        XLogRecData     rdata[1];
        xl_bm_lovmetapage *xlLovMeta;

#ifdef BM_DEBUG
        elog(LOG, "call _bitmap_log_lovmetapage: numOfAttrs=%d", numOfAttrs);
#endif

        xlLovMeta = (xl_bm_lovmetapage *)
            palloc(sizeof(xl_bm_lovmetapage) +
                   numOfAttrs * sizeof(BMLOVMetaItemData));

        xlLovMeta->bm_node = rel->rd_node;
        xlLovMeta->bm_num_of_attrs = numOfAttrs;

        copyMetaItems = (BMLOVMetaItem)
            (((char *) xlLovMeta) + sizeof(xl_bm_lovmetapage));
        memcpy(copyMetaItems, metaItems, numOfAttrs * sizeof(BMLOVMetaItemData));

        rdata[0].buffer = InvalidBuffer;
        rdata[0].data = (char *) xlLovMeta;
        rdata[0].len = sizeof(xl_bm_lovmetapage) +
            numOfAttrs * sizeof(BMLOVMetaItemData);
        rdata[0].next = NULL;

        recptr = XLogInsert(RM_BITMAP_ID, XLOG_BITMAP_INSERT_LOVMETA, rdata);

        PageSetLSN(lovMetapage, recptr);
        PageSetTLI(lovMetapage, ThisTimeLineID);
        pfree(xlLovMeta);
    }

    END_CRIT_SECTION();
}
/*
 * Return the TID array stored in a BRIN revmap page
 */
Datum
brin_revmap_data(PG_FUNCTION_ARGS)
{
    struct
    {
        ItemPointerData *tids;
        int         idx;
    }          *state;
    FuncCallContext *fctx;

    if (!superuser())
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                 (errmsg("must be superuser to use raw page functions"))));

    if (SRF_IS_FIRSTCALL())
    {
        bytea      *raw_page = PG_GETARG_BYTEA_P(0);
        MemoryContext mctx;
        Page        page;

        /* minimally verify the page we got */
        page = verify_brin_page(raw_page, BRIN_PAGETYPE_REVMAP, "revmap");

        /* create a function context for cross-call persistence */
        fctx = SRF_FIRSTCALL_INIT();

        /* switch to memory context appropriate for multiple function calls */
        mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);

        state = palloc(sizeof(*state));
        state->tids = ((RevmapContents *) PageGetContents(page))->rm_tids;
        state->idx = 0;

        fctx->user_fctx = state;

        MemoryContextSwitchTo(mctx);
    }

    fctx = SRF_PERCALL_SETUP();
    state = fctx->user_fctx;

    if (state->idx < REVMAP_PAGE_MAXITEMS)
        SRF_RETURN_NEXT(fctx, PointerGetDatum(&state->tids[state->idx++]));

    SRF_RETURN_DONE(fctx);
}
/*
 * Fetch index's statistical data into *stats
 */
void
brinGetStats(Relation index, BrinStatsData *stats)
{
    Buffer      metabuffer;
    Page        metapage;
    BrinMetaPageData *metadata;

    metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
    LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
    metapage = BufferGetPage(metabuffer);
    metadata = (BrinMetaPageData *) PageGetContents(metapage);

    stats->pagesPerRange = metadata->pagesPerRange;
    stats->revmapNumPages = metadata->lastRevmapPage - 1;

    UnlockReleaseBuffer(metabuffer);
}
/*
 * visibilitymap_set - set a bit on a previously pinned page
 *
 * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
 * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the
 * one provided; in normal running, we generate a new XLOG record and set the
 * page LSN to that value. cutoff_xid is the largest xmin on the page being
 * marked all-visible; it is needed for Hot Standby, and can be
 * InvalidTransactionId if the page contains no tuples.
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
                  Buffer buf, TransactionId cutoff_xid)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    uint32      mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    uint8       mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    Page        page;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    Assert(InRecovery || XLogRecPtrIsInvalid(recptr));

    /* Check that we have the right page pinned */
    if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
        elog(ERROR, "wrong buffer passed to visibilitymap_set");

    page = BufferGetPage(buf);
    map = PageGetContents(page);
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    if (!(map[mapByte] & (1 << mapBit)))
    {
        START_CRIT_SECTION();

        map[mapByte] |= (1 << mapBit);
        MarkBufferDirty(buf);

        if (RelationNeedsWAL(rel))
        {
            if (XLogRecPtrIsInvalid(recptr))
                recptr = log_heap_visible(rel->rd_node, heapBlk, buf,
                                          cutoff_xid);
            PageSetLSN(page, recptr);
            PageSetTLI(page, ThisTimeLineID);
        }

        END_CRIT_SECTION();
    }

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
/*
 * In the given revmap buffer (locked appropriately by caller), which is used
 * in a BRIN index of pagesPerRange pages per range, set the element
 * corresponding to heap block number heapBlk to the given TID.
 *
 * Once the operation is complete, the caller must update the LSN on the
 * returned buffer.
 *
 * This is used both in regular operation and during WAL replay.
 */
void
brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
                        BlockNumber heapBlk, ItemPointerData tid)
{
    RevmapContents *contents;
    ItemPointerData *iptr;
    Page        page;

    /* The correct page should already be pinned and locked */
    page = BufferGetPage(buf);
    contents = (RevmapContents *) PageGetContents(page);
    iptr = (ItemPointerData *) contents->rm_tids;
    iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);

    ItemPointerSet(iptr,
                   ItemPointerGetBlockNumber(&tid),
                   ItemPointerGetOffsetNumber(&tid));
}
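HEAPBLK_TO_REVMAP_INDEX, used here and in brinGetTupleForHeapBlock below, maps a heap block to a slot within a revmap page: the revmap is logically a flat array of TIDs indexed by range number, split across pages that each hold REVMAP_PAGE_MAXITEMS entries. A plausible definition is sketched below as an assumption, not copied from this listing.

/*
 * Sketch of the revmap addressing assumed above (illustrative only):
 * range number = heapBlk / pagesPerRange, which is then split into a revmap
 * page number and an index within that page.
 */
#define HEAPBLK_TO_REVMAP_BLK(pagesPerRange, heapBlk) \
    (((heapBlk) / (pagesPerRange)) / REVMAP_PAGE_MAXITEMS)
#define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
    (((heapBlk) / (pagesPerRange)) % REVMAP_PAGE_MAXITEMS)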
/*
 * Initialize a new BRIN index's metapage.
 */
void
brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
{
    BrinMetaPageData *metadata;

    brin_page_init(page, BRIN_PAGETYPE_META);

    metadata = (BrinMetaPageData *) PageGetContents(page);

    metadata->brinMagic = BRIN_META_MAGIC;
    metadata->brinVersion = version;
    metadata->pagesPerRange = pagesPerRange;

    /*
     * Note we cheat here a little. 0 is not a valid revmap block number
     * (because it's the metapage buffer), but doing this enables the first
     * revmap page to be created when the index is.
     */
    metadata->lastRevmapPage = 0;
}
/*
 * visibilitymap_test - test if a bit is set
 *
 * Are all tuples on heapBlk visible to all, according to the visibility map?
 *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
 * earlier call to visibilitymap_pin or visibilitymap_test on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 * releasing *buf after it's done testing and setting bits.
 *
 * NOTE: This function is typically called without a lock on the heap page,
 * so somebody else could change the bit just after we look at it. In fact,
 * since we don't lock the visibility map page either, it's even possible that
 * someone else could have changed the bit just before we look at it, but yet
 * we might see the old value. It is the caller's responsibility to deal with
 * all concurrency issues!
 */
bool
visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    uint32      mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    uint8       mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    bool        result;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    /* Reuse the old pinned buffer if possible */
    if (BufferIsValid(*buf))
    {
        if (BufferGetBlockNumber(*buf) != mapBlock)
        {
            ReleaseBuffer(*buf);
            *buf = InvalidBuffer;
        }
    }

    if (!BufferIsValid(*buf))
    {
        *buf = vm_readbuf(rel, mapBlock, false);
        if (!BufferIsValid(*buf))
            return false;
    }

    map = PageGetContents(BufferGetPage(*buf));

    /*
     * A single-bit read is atomic. There could be memory-ordering effects
     * here, but for performance reasons we make it the caller's job to worry
     * about that.
     */
    result = (map[mapByte] & (1 << mapBit)) ? true : false;

    return result;
}
/*
 * visibilitymap_test - test if a bit is set
 *
 * Are all tuples on heapBlk visible to all, according to the visibility map?
 *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
 * earlier call to visibilitymap_pin or visibilitymap_test on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 * releasing *buf after it's done testing and setting bits.
 */
bool
visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    uint32      mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    uint8       mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    bool        result;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    /* Reuse the old pinned buffer if possible */
    if (BufferIsValid(*buf))
    {
        if (BufferGetBlockNumber(*buf) != mapBlock)
        {
            ReleaseBuffer(*buf);
            *buf = InvalidBuffer;
        }
    }

    if (!BufferIsValid(*buf))
    {
        *buf = vm_readbuf(rel, mapBlock, false);
        if (!BufferIsValid(*buf))
            return false;
    }

    map = PageGetContents(BufferGetPage(*buf));

    /*
     * We don't need to lock the page, as we're only looking at a single bit.
     */
    result = (map[mapByte] & (1 << mapBit)) ? true : false;

    return result;
}
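As the header comments of both visibilitymap_test variants note, the caller carries the pinned map buffer across calls and releases it itself. A hypothetical caller sketch of that pin/test/release pattern; the function name and loop are illustrative only:

/*
 * Hypothetical caller sketch: probe the visibility map for a range of heap
 * blocks, reusing one pinned map buffer, and release it when done.
 */
static void
report_all_visible(Relation rel, BlockNumber nblocks)
{
    Buffer      vmbuf = InvalidBuffer;
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        if (visibilitymap_test(rel, blkno, &vmbuf))
            elog(DEBUG1, "block %u of \"%s\" is all-visible",
                 blkno, RelationGetRelationName(rel));
    }

    if (BufferIsValid(vmbuf))
        ReleaseBuffer(vmbuf);
}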
/*
 * visibilitymap_count - count number of bits set in visibility map
 *
 * Note: we ignore the possibility of race conditions when the table is being
 * extended concurrently with the call. New pages added to the table aren't
 * going to be marked all-visible, so they won't affect the result.
 */
BlockNumber
visibilitymap_count(Relation rel)
{
    BlockNumber result = 0;
    BlockNumber mapBlock;

    for (mapBlock = 0;; mapBlock++)
    {
        Buffer      mapBuffer;
        unsigned char *map;
        int         i;

        /*
         * Read till we fall off the end of the map. We assume that any
         * extra bytes in the last page are zeroed, so we don't bother
         * excluding them from the count.
         */
        mapBuffer = vm_readbuf(rel, mapBlock, false);
        if (!BufferIsValid(mapBuffer))
            break;

        /*
         * We choose not to lock the page, since the result is going to be
         * immediately stale anyway if anyone is concurrently setting or
         * clearing bits, and we only really need an approximate value.
         */
        map = (unsigned char *) PageGetContents(BufferGetPage(mapBuffer));

        for (i = 0; i < MAPSIZE; i++)
        {
            result += number_of_ones[map[i]];
        }

        ReleaseBuffer(mapBuffer);
    }

    return result;
}
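number_of_ones above is a per-byte population-count lookup table; it is kept as a precomputed constant array, but its contents can be described by the equivalent runtime construction below, shown purely for illustration with assumed names.

/*
 * Illustrative stand-in for the number_of_ones table used by
 * visibilitymap_count: popcount_table[b] holds the number of set bits in
 * byte value b. Shown only to document the table's contents.
 */
static uint8 popcount_table[256];

static void
popcount_table_init(void)
{
    int         b;

    for (b = 0; b < 256; b++)
    {
        int         v = b;
        uint8       n = 0;

        while (v != 0)
        {
            n += v & 1;
            v >>= 1;
        }
        popcount_table[b] = n;
    }
}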
/*
 * Initialize an access object for a range map. This must be freed by
 * brinRevmapTerminate when caller is done with it.
 */
BrinRevmap *
brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
{
    BrinRevmap *revmap;
    Buffer      meta;
    BrinMetaPageData *metadata;

    meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO);
    LockBuffer(meta, BUFFER_LOCK_SHARE);
    metadata = (BrinMetaPageData *) PageGetContents(BufferGetPage(meta));

    revmap = palloc(sizeof(BrinRevmap));
    revmap->rm_irel = idxrel;
    revmap->rm_pagesPerRange = metadata->pagesPerRange;
    revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
    revmap->rm_metaBuf = meta;
    revmap->rm_currBuf = InvalidBuffer;

    *pagesPerRange = metadata->pagesPerRange;

    LockBuffer(meta, BUFFER_LOCK_UNLOCK);

    return revmap;
}
/*
 * Sets the available space to zero for all slots numbered >= nslots.
 * Returns true if the page was modified.
 */
bool
fsm_truncate_avail(Page page, int nslots)
{
    FSMPage     fsmpage = (FSMPage) PageGetContents(page);
    uint8      *ptr;
    bool        changed = false;

    Assert(nslots >= 0 && nslots < LeafNodesPerPage);

    /* Clear all truncated leaf nodes */
    ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots];
    for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++)
    {
        if (*ptr != 0)
            changed = true;
        *ptr = 0;
    }

    /* Fix upper nodes. */
    if (changed)
        fsm_rebuild_page(page);

    return changed;
}
/*
 * visibilitymap_truncate - truncate the visibility map
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the VM again.
 *
 * nheapblocks is the new size of the heap.
 */
void
visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
{
    BlockNumber newnblocks;

    /* last remaining block, byte, and bit */
    BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
    uint32      truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
    uint8       truncBit = HEAPBLK_TO_MAPBIT(nheapblocks);

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
#endif

    RelationOpenSmgr(rel);

    /*
     * If no visibility map has been created yet for this relation, there's
     * nothing to truncate.
     */
    if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
        return;

    /*
     * Unless the new size is exactly at a visibility map page boundary, the
     * tail bits in the last remaining map page, representing truncated heap
     * blocks, need to be cleared. This is not only tidy, but also necessary
     * because we don't get a chance to clear the bits if the heap is extended
     * again.
     */
    if (truncByte != 0 || truncBit != 0)
    {
        Buffer      mapBuffer;
        Page        page;
        char       *map;

        newnblocks = truncBlock + 1;

        mapBuffer = vm_readbuf(rel, truncBlock, false);
        if (!BufferIsValid(mapBuffer))
        {
            /* nothing to do, the file was already smaller */
            return;
        }

        page = BufferGetPage(mapBuffer);
        map = PageGetContents(page);

        LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);

        /* Clear out the unwanted bytes. */
        MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));

        /*----
         * Mask out the unwanted bits of the last remaining byte.
         *
         *   ((1 << 0) - 1) = 00000000
         *   ((1 << 1) - 1) = 00000001
         *   ...
         *   ((1 << 6) - 1) = 00111111
         *   ((1 << 7) - 1) = 01111111
         *----
         */
        map[truncByte] &= (1 << truncBit) - 1;

        MarkBufferDirty(mapBuffer);
        UnlockReleaseBuffer(mapBuffer);
    }
    else
        newnblocks = truncBlock;

    if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
    {
        /* nothing to do, the file was already smaller than requested size */
        return;
    }

    /* Truncate the unused VM pages, and send smgr inval message */
    smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);

    /*
     * We might as well update the local smgr_vm_nblocks setting. smgrtruncate
     * sent an smgr cache inval message, which will cause other backends to
     * invalidate their copy of smgr_vm_nblocks, and this one too at the next
     * command boundary. But this ensures it isn't outright wrong until then.
     */
    if (rel->rd_smgr)
        rel->rd_smgr->smgr_vm_nblocks = newnblocks;
}
/*
 * Recursive guts of FreeSpaceMapVacuum
 */
static uint8
fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
{
    Buffer      buf;
    Page        page;
    uint8       max_avail;

    /* Read the page if it exists, or return EOF */
    buf = fsm_readbuf(rel, addr, false);
    if (!BufferIsValid(buf))
    {
        *eof_p = true;
        return 0;
    }
    else
        *eof_p = false;

    page = BufferGetPage(buf);

    /*
     * Recurse into children, and fix the information stored about them at
     * this level.
     */
    if (addr.level > FSM_BOTTOM_LEVEL)
    {
        int         slot;
        bool        eof = false;

        for (slot = 0; slot < SlotsPerFSMPage; slot++)
        {
            int         child_avail;

            CHECK_FOR_INTERRUPTS();

            /* After we hit end-of-file, just clear the rest of the slots */
            if (!eof)
                child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
                                              &eof);
            else
                child_avail = 0;

            /* Update information about the child */
            if (fsm_get_avail(page, slot) != child_avail)
            {
                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                fsm_set_avail(BufferGetPage(buf), slot, child_avail);
                MarkBufferDirtyHint(buf, false);
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
            }
        }
    }

    max_avail = fsm_get_max_avail(BufferGetPage(buf));

    /*
     * Reset the next slot pointer. This encourages the use of low-numbered
     * pages, increasing the chances that a later vacuum can truncate the
     * relation.
     */
    ((FSMPage) PageGetContents(page))->fp_next_slot = 0;

    ReleaseBuffer(buf);

    return max_avail;
}
/*
 * Try to extend the revmap by one page. This might not happen for a number of
 * reasons; caller is expected to retry until the expected outcome is obtained.
 */
static void
revmap_physical_extend(BrinRevmap *revmap)
{
    Buffer      buf;
    Page        page;
    Page        metapage;
    BrinMetaPageData *metadata;
    BlockNumber mapBlk;
    BlockNumber nblocks;
    Relation    irel = revmap->rm_irel;
    bool        needLock = !RELATION_IS_LOCAL(irel);

    /*
     * Lock the metapage. This locks out concurrent extensions of the revmap,
     * but note that we still need to grab the relation extension lock because
     * another backend can extend the index with regular BRIN pages.
     */
    LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_EXCLUSIVE);
    metapage = BufferGetPage(revmap->rm_metaBuf);
    metadata = (BrinMetaPageData *) PageGetContents(metapage);

    /*
     * Check that our cached lastRevmapPage value was up-to-date; if it
     * wasn't, update the cached copy and have caller start over.
     */
    if (metadata->lastRevmapPage != revmap->rm_lastRevmapPage)
    {
        revmap->rm_lastRevmapPage = metadata->lastRevmapPage;
        LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
        return;
    }
    mapBlk = metadata->lastRevmapPage + 1;

    nblocks = RelationGetNumberOfBlocks(irel);
    if (mapBlk < nblocks)
    {
        buf = ReadBuffer(irel, mapBlk);
        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
        page = BufferGetPage(buf);
    }
    else
    {
        if (needLock)
            LockRelationForExtension(irel, ExclusiveLock);

        buf = ReadBuffer(irel, P_NEW);
        if (BufferGetBlockNumber(buf) != mapBlk)
        {
            /*
             * Very rare corner case: somebody extended the relation
             * concurrently after we read its length. If this happens, give
             * up and have caller start over. We will have to evacuate that
             * page from under whoever is using it.
             */
            if (needLock)
                UnlockRelationForExtension(irel, ExclusiveLock);
            LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
            return;
        }
        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
        page = BufferGetPage(buf);

        if (needLock)
            UnlockRelationForExtension(irel, ExclusiveLock);
    }

    /* Check that it's a regular block (or an empty page) */
    if (!PageIsNew(page) && !BRIN_IS_REGULAR_PAGE(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("unexpected page type 0x%04X in BRIN index \"%s\" block %u",
                        BRIN_PAGE_TYPE(page),
                        RelationGetRelationName(irel),
                        BufferGetBlockNumber(buf))));

    /* If the page is in use, evacuate it and restart */
    if (brin_start_evacuating_page(irel, buf))
    {
        LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
        brin_evacuate_page(irel, revmap->rm_pagesPerRange, revmap, buf);

        /* have caller start over */
        return;
    }

    /*
     * Ok, we have now locked the metapage and the target block.
     * Re-initialize it as a revmap page.
     */
    START_CRIT_SECTION();

    /* the rm_tids array is initialized to all invalid by PageInit */
    brin_page_init(page, BRIN_PAGETYPE_REVMAP);
    MarkBufferDirty(buf);

    metadata->lastRevmapPage = mapBlk;
    MarkBufferDirty(revmap->rm_metaBuf);

    if (RelationNeedsWAL(revmap->rm_irel))
    {
        xl_brin_revmap_extend xlrec;
        XLogRecPtr  recptr;
        XLogRecData rdata[2];

        xlrec.node = revmap->rm_irel->rd_node;
        xlrec.targetBlk = mapBlk;

        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfBrinRevmapExtend;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].buffer_std = false;
        rdata[0].next = &(rdata[1]);

        rdata[1].data = (char *) NULL;
        rdata[1].len = 0;
        rdata[1].buffer = revmap->rm_metaBuf;
        rdata[1].buffer_std = false;
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_REVMAP_EXTEND, rdata);
        PageSetLSN(metapage, recptr);
        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();

    LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK);
    UnlockReleaseBuffer(buf);
}
/*
 * Fetch the BrinTuple for a given heap block.
 *
 * The buffer containing the tuple is locked, and returned in *buf. As an
 * optimization, the caller can pass a pinned buffer *buf on entry, which will
 * avoid a pin-unpin cycle when the next tuple is on the same page as a
 * previous one.
 *
 * If no tuple is found for the given heap range, returns NULL. In that case,
 * *buf might still be updated, but it's not locked.
 *
 * The output tuple offset within the buffer is returned in *off, and its size
 * is returned in *size.
 */
BrinTuple *
brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
                         Buffer *buf, OffsetNumber *off, Size *size, int mode)
{
    Relation    idxRel = revmap->rm_irel;
    BlockNumber mapBlk;
    RevmapContents *contents;
    ItemPointerData *iptr;
    BlockNumber blk;
    Page        page;
    ItemId      lp;
    BrinTuple  *tup;
    ItemPointerData previptr;

    /* normalize the heap block number to be the first page in the range */
    heapBlk = (heapBlk / revmap->rm_pagesPerRange) * revmap->rm_pagesPerRange;

    /* Compute the revmap page number we need */
    mapBlk = revmap_get_blkno(revmap, heapBlk);
    if (mapBlk == InvalidBlockNumber)
    {
        *off = InvalidOffsetNumber;
        return NULL;
    }

    ItemPointerSetInvalid(&previptr);
    for (;;)
    {
        CHECK_FOR_INTERRUPTS();

        if (revmap->rm_currBuf == InvalidBuffer ||
            BufferGetBlockNumber(revmap->rm_currBuf) != mapBlk)
        {
            if (revmap->rm_currBuf != InvalidBuffer)
                ReleaseBuffer(revmap->rm_currBuf);

            Assert(mapBlk != InvalidBlockNumber);
            revmap->rm_currBuf = ReadBuffer(revmap->rm_irel, mapBlk);
        }

        LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_SHARE);

        contents = (RevmapContents *)
            PageGetContents(BufferGetPage(revmap->rm_currBuf));
        iptr = contents->rm_tids;
        iptr += HEAPBLK_TO_REVMAP_INDEX(revmap->rm_pagesPerRange, heapBlk);

        if (!ItemPointerIsValid(iptr))
        {
            LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);
            return NULL;
        }

        /*
         * Check the TID we got in a previous iteration, if any, and save the
         * current TID we got from the revmap; if we loop, we can sanity-check
         * that the next one we get is different. Otherwise we might be stuck
         * looping forever if the revmap is somehow badly broken.
         */
        if (ItemPointerIsValid(&previptr) && ItemPointerEquals(&previptr, iptr))
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg_internal("corrupted BRIN index: inconsistent range map")));
        previptr = *iptr;

        blk = ItemPointerGetBlockNumber(iptr);
        *off = ItemPointerGetOffsetNumber(iptr);

        LockBuffer(revmap->rm_currBuf, BUFFER_LOCK_UNLOCK);

        /* Ok, got a pointer to where the BrinTuple should be. Fetch it. */
        if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != blk)
        {
            if (BufferIsValid(*buf))
                ReleaseBuffer(*buf);
            *buf = ReadBuffer(idxRel, blk);
        }
        LockBuffer(*buf, mode);
        page = BufferGetPage(*buf);

        /* If we land on a revmap page, start over */
        if (BRIN_IS_REGULAR_PAGE(page))
        {
            lp = PageGetItemId(page, *off);
            if (ItemIdIsUsed(lp))
            {
                tup = (BrinTuple *) PageGetItem(page, lp);

                if (tup->bt_blkno == heapBlk)
                {
                    if (size)
                        *size = ItemIdGetLength(lp);
                    /* found it! */
                    return tup;
                }
            }
        }

        /*
         * No luck. Assume that the revmap was updated concurrently.
         */
        LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
    }

    /* not reached, but keep compiler quiet */
    return NULL;
}
/*
 * visibilitymap_set - set a bit on a previously pinned page
 *
 * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
 * or InvalidXLogRecPtr in normal running. The page LSN is advanced to the
 * one provided; in normal running, we generate a new XLOG record and set the
 * page LSN to that value. cutoff_xid is the largest xmin on the page being
 * marked all-visible; it is needed for Hot Standby, and can be
 * InvalidTransactionId if the page contains no tuples.
 *
 * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
 * this function. Except in recovery, caller should also pass the heap
 * buffer. When checksums are enabled and we're not in recovery, we must add
 * the heap buffer to the WAL chain to protect it from being torn.
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
                  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    uint32      mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    uint8       mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    Page        page;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
    Assert(InRecovery || BufferIsValid(heapBuf));

    /* Check that we have the right heap page pinned, if present */
    if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
        elog(ERROR, "wrong heap buffer passed to visibilitymap_set");

    /* Check that we have the right VM page pinned */
    if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
        elog(ERROR, "wrong VM buffer passed to visibilitymap_set");

    page = BufferGetPage(vmBuf);
    map = PageGetContents(page);
    LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);

    if (!(map[mapByte] & (1 << mapBit)))
    {
        START_CRIT_SECTION();

        map[mapByte] |= (1 << mapBit);
        MarkBufferDirty(vmBuf);

        if (RelationNeedsWAL(rel))
        {
            if (XLogRecPtrIsInvalid(recptr))
            {
                Assert(!InRecovery);
                recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
                                          cutoff_xid);

                /*
                 * If data checksums are enabled (or wal_log_hints=on), we
                 * need to protect the heap page from being torn.
                 */
                if (XLogHintBitIsNeeded())
                {
                    Page        heapPage = BufferGetPage(heapBuf);

                    /* caller is expected to set PD_ALL_VISIBLE first */
                    Assert(PageIsAllVisible(heapPage));
                    PageSetLSN(heapPage, recptr);
                }
            }
            PageSetLSN(page, recptr);
        }

        END_CRIT_SECTION();
    }

    LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
}
/*
 * Searches for a slot with category at least minvalue.
 * Returns slot number, or -1 if none found.
 *
 * The caller must hold at least a shared lock on the page, and this
 * function can unlock and lock the page again in exclusive mode if it
 * needs to be updated. exclusive_lock_held should be set to true if the
 * caller is already holding an exclusive lock, to avoid extra work.
 *
 * If advancenext is false, fp_next_slot is set to point to the returned
 * slot, and if it's true, to the slot after the returned slot.
 */
int
fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext,
                 bool exclusive_lock_held)
{
    Page        page = BufferGetPage(buf);
    FSMPage     fsmpage = (FSMPage) PageGetContents(page);
    int         nodeno;
    int         target;
    uint16      slot;

restart:

    /*
     * Check the root first, and exit quickly if there's no leaf with enough
     * free space
     */
    if (fsmpage->fp_nodes[0] < minvalue)
        return -1;

    /*
     * Start search using fp_next_slot. It's just a hint, so check that it's
     * sane. (This also handles wrapping around when the prior call returned
     * the last slot on the page.)
     */
    target = fsmpage->fp_next_slot;
    if (target < 0 || target >= LeafNodesPerPage)
        target = 0;
    target += NonLeafNodesPerPage;

    /*----------
     * Start the search from the target slot. At every step, move one
     * node to the right, then climb up to the parent. Stop when we reach
     * a node with enough free space (as we must, since the root has enough
     * space).
     *
     * The idea is to gradually expand our "search triangle", that is, all
     * nodes covered by the current node, and to be sure we search to the
     * right from the start point. At the first step, only the target slot
     * is examined. When we move up from a left child to its parent, we are
     * adding the right-hand subtree of that parent to the search triangle.
     * When we move right then up from a right child, we are dropping the
     * current search triangle (which we know doesn't contain any suitable
     * page) and instead looking at the next-larger-size triangle to its
     * right. So we never look left from our original start point, and at
     * each step the size of the search triangle doubles, ensuring it takes
     * only log2(N) work to search N pages.
     *
     * The "move right" operation will wrap around if it hits the right edge
     * of the tree, so the behavior is still good if we start near the right.
     * Note also that the move-and-climb behavior ensures that we can't end
     * up on one of the missing nodes at the right of the leaf level.
     *
     * For example, consider this tree:
     *
     *                7
     *        7               6
     *    5       7       6       5
     *  4   5   5   7   2   6   5   2
     *                          T
     *
     * Assume that the target node is the node indicated by the letter T,
     * and we're searching for a node with value of 6 or higher. The search
     * begins at T. At the first iteration, we move to the right, then to the
     * parent, arriving at the rightmost 5. At the second iteration, we move
     * to the right, wrapping around, then climb up, arriving at the 7 on the
     * third level. 7 satisfies our search, so we descend down to the bottom,
     * following the path of sevens. This is in fact the first suitable page
     * to the right of (allowing for wraparound) our start point.
     *----------
     */
    nodeno = target;
    while (nodeno > 0)
    {
        if (fsmpage->fp_nodes[nodeno] >= minvalue)
            break;

        /*
         * Move to the right, wrapping around on same level if necessary,
         * then climb up.
         */
        nodeno = parentof(rightneighbor(nodeno));
    }

    /*
     * We're now at a node with enough free space, somewhere in the middle of
     * the tree. Descend to the bottom, following a path with enough free
     * space, preferring to move left if there's a choice.
     */
    while (nodeno < NonLeafNodesPerPage)
    {
        int         childnodeno = leftchild(nodeno);

        if (childnodeno < NodesPerPage &&
            fsmpage->fp_nodes[childnodeno] >= minvalue)
        {
            nodeno = childnodeno;
            continue;
        }
        childnodeno++;          /* point to right child */
        if (childnodeno < NodesPerPage &&
            fsmpage->fp_nodes[childnodeno] >= minvalue)
        {
            nodeno = childnodeno;
        }
        else
        {
            /*
             * Oops. The parent node promised that either left or right child
             * has enough space, but neither actually did. This can happen in
             * case of a "torn page", IOW if we crashed earlier while writing
             * the page to disk, and only part of the page made it to disk.
             *
             * Fix the corruption and restart.
             */
            RelFileNode rnode;
            ForkNumber  forknum;
            BlockNumber blknum;

            BufferGetTag(buf, &rnode, &forknum, &blknum);
            elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
                 blknum, rnode.spcNode, rnode.dbNode, rnode.relNode);

            /* make sure we hold an exclusive lock */
            if (!exclusive_lock_held)
            {
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                exclusive_lock_held = true;
            }
            fsm_rebuild_page(page);
            MarkBufferDirty(buf);
            goto restart;
        }
    }

    /* We're now at the bottom level, at a node with enough space. */
    slot = nodeno - NonLeafNodesPerPage;

    /*
     * Update the next-target pointer. Note that we do this even if we're
     * only holding a shared lock, on the grounds that it's better to use a
     * shared lock and get a garbled next pointer every now and then, than
     * take the concurrency hit of an exclusive lock.
     *
     * Wrap-around is handled at the beginning of this function.
     */
    fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0);

    return slot;
}