/* * XLogReadBufferForRedoExtended * Like XLogReadBufferForRedo, but with extra options. * * If mode is RBM_ZERO or RBM_ZERO_ON_ERROR, if the page doesn't exist, the * relation is extended with all-zeroes pages up to the referenced block * number. In RBM_ZERO mode, the return value is always BLK_NEEDS_REDO. * * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer * using LockBufferForCleanup(), instead of a regular exclusive lock. */ XLogRedoAction XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record, int block_index, RelFileNode rnode, ForkNumber forkno, BlockNumber blkno, ReadBufferMode mode, bool get_cleanup_lock, Buffer *buf) { if (record->xl_info & XLR_BKP_BLOCK(block_index)) { *buf = RestoreBackupBlock(lsn, record, block_index, get_cleanup_lock, true); return BLK_RESTORED; } else { *buf = XLogReadBufferExtended(rnode, forkno, blkno, mode); if (BufferIsValid(*buf)) { LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); if (lsn <= PageGetLSN(BufferGetPage(*buf))) return BLK_DONE; else return BLK_NEEDS_REDO; } else return BLK_NOTFOUND; } }
/* * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in * WAL replay */ void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, Size spaceAvail) { int new_cat = fsm_space_avail_to_cat(spaceAvail); FSMAddress addr; uint16 slot; BlockNumber blkno; Buffer buf; Page page; /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(heapBlk, &slot); blkno = fsm_logical_to_physical(addr); /* If the page doesn't exist already, extend */ buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); if (PageIsNew(page)) PageInit(page, BLCKSZ, 0); if (fsm_set_avail(page, slot, new_cat)) MarkBufferDirtyHint(buf, false); UnlockReleaseBuffer(buf); }
/* * XLogReadBuffer * Read a page during XLOG replay. * * This is a shorthand of XLogReadBufferExtended() followed by * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), for reading from the main * fork. * * (Getting the buffer lock is not really necessary during single-process * crash recovery, but some subroutines such as MarkBufferDirty will complain * if we don't have the lock. In hot standby mode it's definitely necessary.) * * The returned buffer is exclusively-locked. * * For historical reasons, instead of a ReadBufferMode argument, this only * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes. */ Buffer XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init) { Buffer buf; buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, init ? RBM_ZERO : RBM_NORMAL); if (BufferIsValid(buf)) LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); return buf; }
/* * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in * WAL replay */ void XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk, Size spaceAvail) { int new_cat = fsm_space_avail_to_cat(spaceAvail); FSMAddress addr; uint16 slot; BlockNumber blkno; Buffer buf; Page page; bool write_to_fsm; /* This is meant to mirror the logic in fsm_allow_writes() */ if (heapBlk >= HEAP_FSM_CREATION_THRESHOLD) write_to_fsm = true; else { /* Open the relation at smgr level */ SMgrRelation smgr = smgropen(rnode, InvalidBackendId); if (smgrexists(smgr, FSM_FORKNUM)) write_to_fsm = true; else { BlockNumber heap_nblocks = smgrnblocks(smgr, MAIN_FORKNUM); if (heap_nblocks > HEAP_FSM_CREATION_THRESHOLD) write_to_fsm = true; else write_to_fsm = false; } } if (!write_to_fsm) return; /* Get the location of the FSM byte representing the heap block */ addr = fsm_get_location(heapBlk, &slot); blkno = fsm_logical_to_physical(addr); /* If the page doesn't exist already, extend */ buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(buf); if (PageIsNew(page)) PageInit(page, BLCKSZ, 0); if (fsm_set_avail(page, slot, new_cat)) MarkBufferDirtyHint(buf, false); UnlockReleaseBuffer(buf); }
/* * Get the latestRemovedXid from the heap pages pointed at by the index * tuples being deleted. This puts the work for calculating latestRemovedXid * into the recovery path rather than the primary path. * * It's possible that this generates a fair amount of I/O, since an index * block may have hundreds of tuples being deleted. Repeat accesses to the * same heap blocks are common, though are not yet optimised. * * XXX optimise later with something like XLogPrefetchBuffer() */ static TransactionId btree_xlog_delete_get_latestRemovedXid(XLogReaderState *record) { xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); OffsetNumber *unused; Buffer ibuffer, hbuffer; Page ipage, hpage; RelFileNode rnode; BlockNumber blkno; ItemId iitemid, hitemid; IndexTuple itup; HeapTupleHeader htuphdr; BlockNumber hblkno; OffsetNumber hoffnum; TransactionId latestRemovedXid = InvalidTransactionId; int i; /* * If there's nothing running on the standby we don't need to derive a * full latestRemovedXid value, so use a fast path out of here. This * returns InvalidTransactionId, and so will conflict with all HS * transactions; but since we just worked out that that's zero people, * it's OK. * * XXX There is a race condition here, which is that a new backend might * start just after we look. If so, it cannot need to conflict, but this * coding will result in throwing a conflict anyway. */ if (CountDBBackends(InvalidOid) == 0) return latestRemovedXid; /* * In what follows, we have to examine the previous state of the index * page, as well as the heap page(s) it points to. This is only valid if * WAL replay has reached a consistent database state; which means that * the preceding check is not just an optimization, but is *necessary*. We * won't have let in any user sessions before we reach consistency. */ if (!reachedConsistency) elog(PANIC, "btree_xlog_delete_get_latestRemovedXid: cannot operate with inconsistent data"); /* * Get index page. If the DB is consistent, this should not fail, nor * should any of the heap page fetches below. If one does, we return * InvalidTransactionId to cancel all HS transactions. That's probably * overkill, but it's safe, and certainly better than panicking here. */ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL); if (!BufferIsValid(ibuffer)) return InvalidTransactionId; LockBuffer(ibuffer, BT_READ); ipage = (Page) BufferGetPage(ibuffer); /* * Loop through the deleted index items to obtain the TransactionId from * the heap items they point to. */ unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete); for (i = 0; i < xlrec->nitems; i++) { /* * Identify the index tuple about to be deleted */ iitemid = PageGetItemId(ipage, unused[i]); itup = (IndexTuple) PageGetItem(ipage, iitemid); /* * Locate the heap page that the index tuple points at */ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL); if (!BufferIsValid(hbuffer)) { UnlockReleaseBuffer(ibuffer); return InvalidTransactionId; } LockBuffer(hbuffer, BUFFER_LOCK_SHARE); hpage = (Page) BufferGetPage(hbuffer); /* * Look up the heap tuple header that the index tuple points at by * using the heap node supplied with the xlrec. We can't use * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer. * Note that we are not looking at tuple data here, just headers. */ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid)); hitemid = PageGetItemId(hpage, hoffnum); /* * Follow any redirections until we find something useful. */ while (ItemIdIsRedirected(hitemid)) { hoffnum = ItemIdGetRedirect(hitemid); hitemid = PageGetItemId(hpage, hoffnum); CHECK_FOR_INTERRUPTS(); } /* * If the heap item has storage, then read the header and use that to * set latestRemovedXid. * * Some LP_DEAD items may not be accessible, so we ignore them. */ if (ItemIdHasStorage(hitemid)) { htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid); HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid); } else if (ItemIdIsDead(hitemid)) { /* * Conjecture: if hitemid is dead then it had xids before the xids * marked on LP_NORMAL items. So we just ignore this item and move * onto the next, for the purposes of calculating * latestRemovedxids. */ } else Assert(!ItemIdIsUsed(hitemid)); UnlockReleaseBuffer(hbuffer); } UnlockReleaseBuffer(ibuffer); /* * If all heap tuples were LP_DEAD then we will be returning * InvalidTransactionId here, which avoids conflicts. This matches * existing logic which assumes that LP_DEAD tuples must already be older * than the latestRemovedXid on the cleanup record that set them as * LP_DEAD, hence must already have generated a conflict. */ return latestRemovedXid; }
static void btree_xlog_vacuum(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque opaque; /* * If queries might be active then we need to ensure every leaf page is * unpinned between the lastBlockVacuumed and the current block, if there * are any. This prevents replay of the VACUUM from reaching the stage of * removing heap tuples while there could still be indexscans "in flight" * to those particular tuples (see nbtree/README). * * It might be worth checking if there are actually any backends running; * if not, we could just skip this. * * Since VACUUM can visit leaf pages out-of-order, it might issue records * with lastBlockVacuumed >= block; that's not an error, it just means * nothing to do now. * * Note: since we touch all pages in the range, we will lock non-leaf * pages, and also any empty (all-zero) pages that may be in the index. It * doesn't seem worth the complexity to avoid that. But it's important * that HotStandbyActiveInReplay() will not return true if the database * isn't yet consistent; so we need not fear reading still-corrupt blocks * here during crash recovery. */ if (HotStandbyActiveInReplay()) { RelFileNode thisrnode; BlockNumber thisblkno; BlockNumber blkno; XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno); for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) { /* * We use RBM_NORMAL_NO_LOG mode because it's not an error * condition to see all-zero pages. The original btvacuumpage * scan would have skipped over all-zero pages, noting them in FSM * but not bothering to initialize them just yet; so we mustn't * throw an error here. (We could skip acquiring the cleanup lock * if PageIsNew, but it's probably not worth the cycles to test.) * * XXX we don't actually need to read the block, we just need to * confirm it is unpinned. If we had a special call into the * buffer manager we could optimise this so that if the block is * not in shared_buffers we confirm it as unpinned. */ buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, RBM_NORMAL_NO_LOG); if (BufferIsValid(buffer)) { LockBufferForCleanup(buffer); UnlockReleaseBuffer(buffer); } } } /* * Like in btvacuumpage(), we need to take a cleanup lock on every leaf * page. See nbtree/README for details. */ if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 0, &len); page = (Page) BufferGetPage(buffer); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items --- see comments * in _bt_delitems_vacuum(). */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * XLogReadBufferForRedoExtended * Like XLogReadBufferForRedo, but with extra options. * * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended * with all-zeroes pages up to the referenced block number. In * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value * is always BLK_NEEDS_REDO. * * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock * parameter. Do not use an inconsistent combination!) * * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer * using LockBufferForCleanup(), instead of a regular exclusive lock. */ XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, uint8 block_id, ReadBufferMode mode, bool get_cleanup_lock, Buffer *buf) { XLogRecPtr lsn = record->EndRecPtr; RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; Page page; bool zeromode; bool willinit; if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) { /* Caller specified a bogus block_id */ elog(PANIC, "failed to locate backup block with ID %d", block_id); } /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. */ zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0; if (willinit && !zeromode) elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); if (!willinit && zeromode) elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record"); /* If it's a full-page image, restore it. */ if (XLogRecHasBlockImage(record, block_id)) { *buf = XLogReadBufferExtended(rnode, forknum, blkno, get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); page = BufferGetPage(*buf); if (!RestoreBlockImage(record, block_id, page)) elog(ERROR, "failed to restore block image"); /* * The page may be uninitialized. If so, we can't set the LSN because * that would corrupt the page. */ if (!PageIsNew(page)) { PageSetLSN(page, lsn); } MarkBufferDirty(*buf); /* * At the end of crash recovery the init forks of unlogged relations * are copied, without going through shared buffers. So we need to * force the on-disk state of init forks to always be in sync with the * state in shared buffers. */ if (forknum == INIT_FORKNUM) FlushOneBuffer(*buf); return BLK_RESTORED; } else { *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode); if (BufferIsValid(*buf)) { if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) { if (get_cleanup_lock) LockBufferForCleanup(*buf); else LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); } if (lsn <= PageGetLSN(BufferGetPage(*buf))) return BLK_DONE; else return BLK_NEEDS_REDO; } else return BLK_NOTFOUND; } }