/*
 * smgr_redo -- WAL replay for storage-manager records (file create/truncate).
 *
 * 'record' is the WAL record being replayed; only XLOG_SMGR_CREATE and
 * XLOG_SMGR_TRUNCATE opcodes are expected here.
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	/* Backup blocks are not used in smgr records */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode);
		/* isRedo = true: creation is a no-op if the file already exists */
		smgrcreate(reln, MAIN_FORKNUM, true);
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation reln;
		Relation	rel;

		reln = smgropen(xlrec->rnode);

		/*
		 * Forcibly create relation if it doesn't exist (which suggests that
		 * it was dropped somewhere later in the WAL sequence).  As in
		 * XLogOpenRelation, we prefer to recreate the rel and replay the log
		 * as best we can until the drop is seen.
		 */
		smgrcreate(reln, MAIN_FORKNUM, true);

		smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno, false);

		/* Also tell xlogutils.c about it */
		XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);

		/*
		 * Truncate FSM and VM too.  There is no live relcache entry during
		 * recovery, so use a fake one just good enough for these calls.
		 */
		rel = CreateFakeRelcacheEntry(xlrec->rnode);

		if (smgrexists(reln, FSM_FORKNUM))
			FreeSpaceMapTruncateRel(rel, xlrec->blkno);
		if (smgrexists(reln, VISIBILITYMAP_FORKNUM))
			visibilitymap_truncate(rel, xlrec->blkno);

		FreeFakeRelcacheEntry(rel);
	}
	else
		elog(PANIC, "smgr_redo: unknown op code %u", info);
}
/*
 * vm_extend - make sure the visibility map fork covers at least vm_nblocks
 * pages, appending zero-initialized pages as needed.
 */
static void
vm_extend(Relation rel, BlockNumber vm_nblocks)
{
	BlockNumber cur_nblocks;
	Page		zeropage = (Page) palloc(BLCKSZ);

	PageInit(zeropage, BLCKSZ, 0);

	/*
	 * Take the relation extension lock so that no other backend extends the
	 * visibility map under us.  This also blocks extension of the main fork,
	 * which is overkill, but map extension is rare enough that a dedicated
	 * lock tag type doesn't seem worthwhile.
	 *
	 * Another backend may already have created or extended the fork by the
	 * time the lock is granted.
	 */
	LockRelationForExtension(rel, ExclusiveLock);

	/* A cache flush may have closed the smgr relation; reopen it. */
	RelationOpenSmgr(rel);

	/*
	 * Create the fork if it isn't there yet.  A positive cached size proves
	 * it exists, so the smgrexists probe is needed only otherwise.
	 */
	if ((rel->rd_smgr->smgr_vm_nblocks == 0 ||
		 rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber) &&
		!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
		smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);

	/* Append zeroed pages until the fork is long enough. */
	for (cur_nblocks = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
		 cur_nblocks < vm_nblocks;
		 cur_nblocks++)
		smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, cur_nblocks,
				   (char *) zeropage, false);

	/*
	 * Broadcast a shared-inval message to force other backends to close any
	 * smgr references they may have for this rel, which we are about to
	 * change.  This is a useful optimization because it means that backends
	 * don't have to keep checking for creation or extension of the file,
	 * which happens infrequently.
	 */
	CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode);

	/* Remember the new size in the local smgr cache. */
	rel->rd_smgr->smgr_vm_nblocks = cur_nblocks;

	UnlockRelationForExtension(rel, ExclusiveLock);

	pfree(zeropage);
}
/*
 * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the FSM again.
 *
 * nblocks is the new size of the heap.
 */
void
FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
{
	BlockNumber new_nfsmblocks;
	FSMAddress	first_removed_address;
	uint16		first_removed_slot;
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If no FSM has been created yet for this relation, there's nothing to
	 * truncate.
	 */
	if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
		return;

	/* Get the location in the FSM of the first removed heap block */
	first_removed_address = fsm_get_location(nblocks, &first_removed_slot);

	/*
	 * Zero out the tail of the last remaining FSM page. If the slot
	 * representing the first removed heap block is at a page boundary, as the
	 * first slot on the FSM page that first_removed_address points to, we can
	 * just truncate that page altogether.
	 */
	if (first_removed_slot > 0)
	{
		buf = fsm_readbuf(rel, first_removed_address, false);
		if (!BufferIsValid(buf))
			return;				/* nothing to do; the FSM was already smaller */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);

		/*
		 * NOTE(review): a hint-style dirty (no WAL) is used here; presumably
		 * acceptable because FSM contents are reconstructible -- confirm this
		 * survives the crash-recovery expectations of callers.
		 */
		MarkBufferDirtyHint(buf, false);
		UnlockReleaseBuffer(buf);

		new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
	}
	else
	{
		new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
			return;				/* nothing to do; the FSM was already smaller */
	}

	/* Truncate the unused FSM pages, and send smgr inval message */
	smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks);

	/*
	 * We might as well update the local smgr_fsm_nblocks setting.
	 * smgrtruncate sent an smgr cache inval message, which will cause other
	 * backends to invalidate their copy of smgr_fsm_nblocks, and this one too
	 * at the next command boundary.  But this ensures it isn't outright wrong
	 * until then.
	 */
	if (rel->rd_smgr)
		rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks;
}
/*
 * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
 *		WAL replay
 */
void
XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
							Size spaceAvail)
{
	int			new_cat = fsm_space_avail_to_cat(spaceAvail);
	FSMAddress	addr;
	uint16		slot;
	BlockNumber blkno;
	Buffer		buf;
	Page		page;

	/*
	 * Decide whether the FSM should be written at all; this mirrors the
	 * logic in fsm_allow_writes().  For a low heap block of a small heap
	 * with no FSM fork on disk, we skip the update entirely rather than
	 * create the fork.
	 */
	if (heapBlk < HEAP_FSM_CREATION_THRESHOLD)
	{
		/* Open the relation at smgr level */
		SMgrRelation smgr = smgropen(rnode, InvalidBackendId);

		if (!smgrexists(smgr, FSM_FORKNUM) &&
			smgrnblocks(smgr, MAIN_FORKNUM) <= HEAP_FSM_CREATION_THRESHOLD)
			return;
	}

	/* Get the location of the FSM byte representing the heap block */
	addr = fsm_get_location(heapBlk, &slot);
	blkno = fsm_logical_to_physical(addr);

	/* If the page doesn't exist already, extend */
	buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno,
								 RBM_ZERO_ON_ERROR);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	page = BufferGetPage(buf);
	if (PageIsNew(page))
		PageInit(page, BLCKSZ, 0);

	/* Only dirty the buffer when the slot actually changed. */
	if (fsm_set_avail(page, slot, new_cat))
		MarkBufferDirtyHint(buf, false);
	UnlockReleaseBuffer(buf);
}
/*
 * Ensure that the FSM fork is at least fsm_nblocks long, extending
 * it if necessary with empty pages. And by empty, I mean pages filled
 * with zeros, meaning there's no free space.
 */
static void
fsm_extend(Relation rel, BlockNumber fsm_nblocks)
{
	BlockNumber fsm_nblocks_now;
	Page		pg;

	pg = (Page) palloc(BLCKSZ);
	PageInit(pg, BLCKSZ, 0);

	/*
	 * We use the relation extension lock to lock out other backends trying to
	 * extend the FSM at the same time. It also locks out extension of the
	 * main fork, unnecessarily, but extending the FSM happens seldom enough
	 * that it doesn't seem worthwhile to have a separate lock tag type for
	 * it.
	 *
	 * Note that another backend might have extended or created the relation
	 * by the time we get the lock.
	 */
	LockRelationForExtension(rel, ExclusiveLock);

	/* Might have to re-open if a cache flush happened */
	RelationOpenSmgr(rel);

	/*
	 * Create the FSM file first if it doesn't exist. If smgr_fsm_nblocks is
	 * positive then it must exist, no need for an smgrexists call.
	 */
	if ((rel->rd_smgr->smgr_fsm_nblocks == 0 ||
		 rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber) &&
		!smgrexists(rel->rd_smgr, FSM_FORKNUM))
		smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);

	fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);

	/* Append zeroed pages one at a time until the fork is long enough. */
	while (fsm_nblocks_now < fsm_nblocks)
	{
		/*
		 * Recompute the checksum for each block written (presumably because
		 * the checksum incorporates the block number -- TODO confirm).
		 */
		PageSetChecksumInplace(pg, fsm_nblocks_now);

		smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
				   (char *) pg, false);
		fsm_nblocks_now++;
	}

	/* Update local cache with the up-to-date size */
	rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now;

	UnlockRelationForExtension(rel, ExclusiveLock);

	pfree(pg);
}
/*
 * For heaps, we prevent creation of the FSM unless the number of pages
 * exceeds HEAP_FSM_CREATION_THRESHOLD. For tables that don't already have
 * a FSM, this will save an inode and a few kB of space.
 *
 * XXX The API is a little awkward -- if the caller passes a valid nblocks
 * value, it can avoid invoking a system call. If the caller passes
 * InvalidBlockNumber and receives a false return value, it can get an
 * up-to-date relation size from get_nblocks. This saves a few cycles in
 * the caller, which would otherwise need to get the relation size by itself.
 */
static bool
fsm_allow_writes(Relation rel, BlockNumber heapblk,
				 BlockNumber nblocks, BlockNumber *get_nblocks)
{
	bool		nblocks_known = (nblocks != InvalidBlockNumber);

	/* High heap blocks always allow an FSM. */
	if (heapblk >= HEAP_FSM_CREATION_THRESHOLD)
		return true;

	/* Non-heap rels can always create a FSM. */
	if (rel->rd_rel->relkind != RELKIND_RELATION &&
		rel->rd_rel->relkind != RELKIND_TOASTVALUE)
		return true;

	/*
	 * If the caller knows nblocks, we can avoid a system call later. If it
	 * doesn't, maybe we have relpages from a previous VACUUM. Since the
	 * table may have extended since then, we still have to count the pages
	 * later if we can't return now.
	 */
	if (nblocks_known)
	{
		if (nblocks > HEAP_FSM_CREATION_THRESHOLD)
			return true;
	}
	else if (rel->rd_rel->relpages != InvalidBlockNumber &&
			 rel->rd_rel->relpages > HEAP_FSM_CREATION_THRESHOLD)
		return true;

	/* An FSM fork that already exists on disk may always be written. */
	RelationOpenSmgr(rel);
	if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
		return true;

	/* Caller-supplied size was small and no fork exists: disallow. */
	if (nblocks_known)
		return false;

	/* Last resort: measure the relation and report the size back. */
	*get_nblocks = RelationGetNumberOfBlocks(rel);
	return *get_nblocks > HEAP_FSM_CREATION_THRESHOLD;
}
/*
 *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
 *
 * This also runs when aborting a subxact; we want to clean up a failed
 * subxact immediately.
 *
 * isCommit: true when committing, false when aborting; only pending entries
 * whose atCommit flag matches are actually unlinked.
 */
void
smgrDoPendingDeletes(bool isCommit)
{
	int			nestLevel = GetCurrentTransactionNestLevel();
	PendingRelDelete *pending;
	PendingRelDelete *prev;
	PendingRelDelete *next;

	prev = NULL;
	/* Walk the global pendingDeletes list, unlinking entries as we go. */
	for (pending = pendingDeletes; pending != NULL; pending = next)
	{
		next = pending->next;
		if (pending->nestLevel < nestLevel)
		{
			/* outer-level entries should not be processed yet */
			prev = pending;
		}
		else
		{
			/* unlink list entry first, so we don't retry on failure */
			if (prev)
				prev->next = next;
			else
				pendingDeletes = next;
			/* do deletion if called for */
			if (pending->atCommit == isCommit)
			{
				int			i;

				/* schedule unlinking old files */
				SMgrRelation srel;

				srel = smgropen(pending->relnode);
				/* unlink every fork that exists on disk */
				for (i = 0; i <= MAX_FORKNUM; i++)
				{
					if (smgrexists(srel, i))
						smgrdounlink(srel, i, pending->isTemp, false);
				}
				smgrclose(srel);
			}
			/* must explicitly free the list entry */
			pfree(pending);
			/* prev does not change */
		}
	}
}
/*
 * Read a visibility map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the visibility map file is extended.
 */
static Buffer
vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
{
	Buffer		buf;

	/*
	 * We might not have opened the relation at the smgr level yet, or we
	 * might have been forced to close it by a sinval message. The code below
	 * won't necessarily notice relation extension immediately when extend =
	 * false, so we rely on sinval messages to ensure that our ideas about the
	 * size of the map aren't too far out of date.
	 */
	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the visibility map fork yet, check it
	 * first.
	 */
	if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber)
	{
		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
			rel->rd_smgr->smgr_vm_nblocks =
				smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
		else
			rel->rd_smgr->smgr_vm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_smgr->smgr_vm_nblocks)
	{
		if (extend)
			vm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
	 * always safe to clear bits, so it's better to clear corrupt pages than
	 * error out.
	 */
	buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
		PageInit(BufferGetPage(buf), BLCKSZ, 0);
	return buf;
}
/*
 * Ensure that the visibility map fork is at least vm_nblocks long, extending
 * it if necessary with zeroed pages.
 *
 * NOTE(review): this variant does not call RelationOpenSmgr, so it assumes
 * the caller already opened rel at the smgr level -- confirm against callers.
 */
static void
vm_extend(Relation rel, BlockNumber vm_nblocks)
{
	BlockNumber vm_nblocks_now;
	Page		pg;

	pg = (Page) palloc(BLCKSZ);
	PageInit(pg, BLCKSZ, 0);

	/*
	 * We use the relation extension lock to lock out other backends trying to
	 * extend the visibility map at the same time. It also locks out extension
	 * of the main fork, unnecessarily, but extending the visibility map
	 * happens seldom enough that it doesn't seem worthwhile to have a
	 * separate lock tag type for it.
	 *
	 * Note that another backend might have extended or created the relation
	 * before we get the lock.
	 */
	LockRelationForExtension(rel, ExclusiveLock);

	/* Create the file first if it doesn't exist */
	if ((rel->rd_vm_nblocks == 0 || rel->rd_vm_nblocks == InvalidBlockNumber)
		&& !smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
	{
		smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);
		vm_nblocks_now = 0;
	}
	else
		vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);

	/* Append zeroed pages until the fork is long enough. */
	while (vm_nblocks_now < vm_nblocks)
	{
		smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
				   (char *) pg, rel->rd_istemp);
		vm_nblocks_now++;
	}

	UnlockRelationForExtension(rel, ExclusiveLock);

	pfree(pg);

	/* Update the relcache with the up-to-date size */
	if (!InRecovery)
		CacheInvalidateRelcache(rel);
	rel->rd_vm_nblocks = vm_nblocks_now;
}
/*
 * Read a FSM page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the FSM file is extended.
 */
static Buffer
fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
{
	BlockNumber blkno = fsm_logical_to_physical(addr);
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the FSM yet, check it first. Also
	 * recheck if the requested block seems to be past end, since our cached
	 * value might be stale. (We send smgr inval messages on truncation, but
	 * not on extension.)
	 */
	if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber ||
		blkno >= rel->rd_smgr->smgr_fsm_nblocks)
	{
		if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
			rel->rd_smgr->smgr_fsm_nblocks =
				smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
		else
			rel->rd_smgr->smgr_fsm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_smgr->smgr_fsm_nblocks)
	{
		if (extend)
			fsm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
	 * information is not accurate anyway, so it's better to clear corrupt
	 * pages than error out. Since the FSM changes are not WAL-logged, the
	 * so-called torn page problem on crash can lead to pages with corrupt
	 * headers, for example.
	 */
	buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
		PageInit(BufferGetPage(buf), BLCKSZ, 0);
	return buf;
}
/*
 * Read a visibility map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the visibility map file is extended.
 */
static Buffer
vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
{
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the visibility map fork yet, check it
	 * first. Also recheck if the requested block seems to be past end, since
	 * our cached value might be stale. (We send smgr inval messages on
	 * truncation, but not on extension.)
	 */
	if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber ||
		blkno >= rel->rd_smgr->smgr_vm_nblocks)
	{
		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
			rel->rd_smgr->smgr_vm_nblocks =
				smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
		else
			rel->rd_smgr->smgr_vm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_smgr->smgr_vm_nblocks)
	{
		if (extend)
			vm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
	 * always safe to clear bits, so it's better to clear corrupt pages than
	 * error out.
	 */
	buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
		PageInit(BufferGetPage(buf), BLCKSZ, 0);
	return buf;
}
/*
 * Read a visibility map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the visibility map file is extended.
 */
static Buffer
vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
{
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * The current size of the visibility map fork is kept in relcache, to
	 * avoid reading beyond EOF. If we haven't cached the size of the map yet,
	 * do that first.
	 */
	if (rel->rd_vm_nblocks == InvalidBlockNumber)
	{
		if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
			rel->rd_vm_nblocks = smgrnblocks(rel->rd_smgr,
											 VISIBILITYMAP_FORKNUM);
		else
			rel->rd_vm_nblocks = 0;
	}

	/* Handle requests beyond EOF */
	if (blkno >= rel->rd_vm_nblocks)
	{
		if (extend)
			vm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
	 * always safe to clear bits, so it's better to clear corrupt pages than
	 * error out.
	 */
	buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
		PageInit(BufferGetPage(buf), BLCKSZ, 0);
	return buf;
}
/*
 * visibilitymap_truncate - truncate the visibility map
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the VM again.
 *
 * nheapblocks is the new size of the heap.
 */
void
visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
{
	BlockNumber newnblocks;

	/* last remaining block, byte, and bit */
	BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
	uint32		truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
	uint8		truncBit = HEAPBLK_TO_MAPBIT(nheapblocks);

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
#endif

	RelationOpenSmgr(rel);

	/*
	 * If no visibility map has been created yet for this relation, there's
	 * nothing to truncate.
	 */
	if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
		return;

	/*
	 * Unless the new size is exactly at a visibility map page boundary, the
	 * tail bits in the last remaining map page, representing truncated heap
	 * blocks, need to be cleared. This is not only tidy, but also necessary
	 * because we don't get a chance to clear the bits if the heap is extended
	 * again.
	 */
	if (truncByte != 0 || truncBit != 0)
	{
		Buffer		mapBuffer;
		Page		page;
		char	   *map;

		newnblocks = truncBlock + 1;

		mapBuffer = vm_readbuf(rel, truncBlock, false);
		if (!BufferIsValid(mapBuffer))
		{
			/* nothing to do, the file was already smaller */
			return;
		}
		page = BufferGetPage(mapBuffer);
		map = PageGetContents(page);

		LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);

		/* Clear out the unwanted bytes. */
		MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));

		/*
		 * Mask out the unwanted bits of the last remaining byte.
		 *
		 *	((1 << 0) - 1) = 00000000
		 *	((1 << 1) - 1) = 00000001
		 *	...
		 *	((1 << 6) - 1) = 00111111
		 *	((1 << 7) - 1) = 01111111
		 */
		map[truncByte] &= (1 << truncBit) - 1;

		MarkBufferDirty(mapBuffer);
		UnlockReleaseBuffer(mapBuffer);
	}
	else
		newnblocks = truncBlock;

	if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
	{
		/* nothing to do, the file was already smaller than requested size */
		return;
	}

	/* Truncate the unused VM pages, and send smgr inval message */
	smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);

	/*
	 * We might as well update the local smgr_vm_nblocks setting. smgrtruncate
	 * sent an smgr cache inval message, which will cause other backends to
	 * invalidate their copy of smgr_vm_nblocks, and this one too at the next
	 * command boundary. But this ensures it isn't outright wrong until then.
	 */
	if (rel->rd_smgr)
		rel->rd_smgr->smgr_vm_nblocks = newnblocks;
}
/*
 * visibilitymap_truncate - truncate the visibility map
 *
 * nheapblocks is the new size of the heap.
 *
 * NOTE(review): this variant does not call RelationOpenSmgr, so it assumes
 * the caller already opened rel at the smgr level -- confirm against callers.
 */
void
visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
{
	BlockNumber newnblocks;

	/* last remaining block, byte, and bit */
	BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
	uint32		truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
	uint8		truncBit = HEAPBLK_TO_MAPBIT(nheapblocks);

#ifdef TRACE_VISIBILITYMAP
	elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
#endif

	/*
	 * If no visibility map has been created yet for this relation, there's
	 * nothing to truncate.
	 */
	if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
		return;

	/*
	 * Unless the new size is exactly at a visibility map page boundary, the
	 * tail bits in the last remaining map page, representing truncated heap
	 * blocks, need to be cleared. This is not only tidy, but also necessary
	 * because we don't get a chance to clear the bits if the heap is extended
	 * again.
	 */
	if (truncByte != 0 || truncBit != 0)
	{
		Buffer		mapBuffer;
		Page		page;
		char	   *map;

		newnblocks = truncBlock + 1;

		mapBuffer = vm_readbuf(rel, truncBlock, false);
		if (!BufferIsValid(mapBuffer))
		{
			/* nothing to do, the file was already smaller */
			return;
		}
		page = BufferGetPage(mapBuffer);
		map = PageGetContents(page);

		LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);

		/* Clear out the unwanted bytes. */
		MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));

		/*
		 * Mask out the unwanted bits of the last remaining byte.
		 *
		 *	((1 << 0) - 1) = 00000000
		 *	((1 << 1) - 1) = 00000001
		 *	...
		 *	((1 << 6) - 1) = 00111111
		 *	((1 << 7) - 1) = 01111111
		 */
		map[truncByte] &= (1 << truncBit) - 1;

		MarkBufferDirty(mapBuffer);
		UnlockReleaseBuffer(mapBuffer);
	}
	else
		newnblocks = truncBlock;

	/*
	 * NOTE(review): the comparison here is '<' (strictly smaller), so a file
	 * already exactly newnblocks long still goes through a no-op truncate;
	 * a sibling version of this function uses '<=' -- verify which is
	 * intended here.
	 */
	if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) < newnblocks)
	{
		/* nothing to do, the file was already smaller than requested size */
		return;
	}

	smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks,
				 rel->rd_istemp);

	/*
	 * Need to invalidate the relcache entry, because rd_vm_nblocks seen by
	 * other backends is no longer valid.
	 */
	if (!InRecovery)
		CacheInvalidateRelcache(rel);
	rel->rd_vm_nblocks = newnblocks;
}
/*
 * FinishPreparedTransaction: execute COMMIT PREPARED or ROLLBACK PREPARED
 *
 * gid identifies the prepared transaction; isCommit selects commit vs abort.
 */
void
FinishPreparedTransaction(const char *gid, bool isCommit)
{
	GlobalTransaction gxact;
	TransactionId xid;
	char	   *buf;
	char	   *bufptr;
	TwoPhaseFileHeader *hdr;
	TransactionId latestXid;
	TransactionId *children;
	RelFileNode *commitrels;
	RelFileNode *abortrels;
	RelFileNode *delrels;
	int			ndelrels;
	int			i;

	/*
	 * Validate the GID, and lock the GXACT to ensure that two backends do not
	 * try to commit the same GID at once.
	 */
	gxact = LockGXact(gid, GetUserId());
	xid = gxact->proc.xid;

	/*
	 * Read and validate the state file
	 */
	buf = ReadTwoPhaseFile(xid);
	if (buf == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("two-phase state file for transaction %u is corrupt",
						xid)));

	/*
	 * Disassemble the header area: the file is laid out as header, subxact
	 * XIDs, commit-rels, abort-rels, then callback records (at bufptr).
	 */
	hdr = (TwoPhaseFileHeader *) buf;
	Assert(TransactionIdEquals(hdr->xid, xid));
	bufptr = buf + MAXALIGN(sizeof(TwoPhaseFileHeader));
	children = (TransactionId *) bufptr;
	bufptr += MAXALIGN(hdr->nsubxacts * sizeof(TransactionId));
	commitrels = (RelFileNode *) bufptr;
	bufptr += MAXALIGN(hdr->ncommitrels * sizeof(RelFileNode));
	abortrels = (RelFileNode *) bufptr;
	bufptr += MAXALIGN(hdr->nabortrels * sizeof(RelFileNode));

	/* compute latestXid among all children */
	latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children);

	/*
	 * The order of operations here is critical: make the XLOG entry for
	 * commit or abort, then mark the transaction committed or aborted in
	 * pg_clog, then remove its PGPROC from the global ProcArray (which means
	 * TransactionIdIsInProgress will stop saying the prepared xact is in
	 * progress), then run the post-commit or post-abort callbacks. The
	 * callbacks will release the locks the transaction held.
	 */
	if (isCommit)
		RecordTransactionCommitPrepared(xid,
										hdr->nsubxacts, children,
										hdr->ncommitrels, commitrels);
	else
		RecordTransactionAbortPrepared(xid,
									   hdr->nsubxacts, children,
									   hdr->nabortrels, abortrels);

	ProcArrayRemove(&gxact->proc, latestXid);

	/*
	 * In case we fail while running the callbacks, mark the gxact invalid so
	 * no one else will try to commit/rollback, and so it can be recycled
	 * properly later. It is still locked by our XID so it won't go away yet.
	 *
	 * (We assume it's safe to do this without taking TwoPhaseStateLock.)
	 */
	gxact->valid = false;

	/*
	 * We have to remove any files that were supposed to be dropped. For
	 * consistency with the regular xact.c code paths, must do this before
	 * releasing locks, so do it before running the callbacks.
	 *
	 * NB: this code knows that we couldn't be dropping any temp rels ...
	 */
	if (isCommit)
	{
		delrels = commitrels;
		ndelrels = hdr->ncommitrels;
	}
	else
	{
		delrels = abortrels;
		ndelrels = hdr->nabortrels;
	}
	for (i = 0; i < ndelrels; i++)
	{
		SMgrRelation srel = smgropen(delrels[i]);
		ForkNumber	fork;

		/* unlink every fork that exists on disk */
		for (fork = 0; fork <= MAX_FORKNUM; fork++)
		{
			if (smgrexists(srel, fork))
				smgrdounlink(srel, fork, false, false);
		}
		smgrclose(srel);
	}

	/* And now do the callbacks */
	if (isCommit)
		ProcessRecords(bufptr, xid, twophase_postcommit_callbacks);
	else
		ProcessRecords(bufptr, xid, twophase_postabort_callbacks);

	/* Count the prepared xact as committed or aborted */
	AtEOXact_PgStat(isCommit);

	/*
	 * And now we can clean up our mess.
	 */
	RemoveTwoPhaseFile(xid, true);

	RemoveGXact(gxact);

	pfree(buf);
}
/*
 * pg_prewarm(regclass, mode text, fork text,
 *			  first_block int8, last_block int8)
 *
 * The first argument is the relation to be prewarmed; the second controls
 * how prewarming is done; legal options are 'prefetch', 'read', and 'buffer'.
 * The third is the name of the relation fork to be prewarmed. The fourth
 * and fifth arguments specify the first and last block to be prewarmed. If
 * the fourth argument is NULL, it will be taken as 0; if the fifth argument
 * is NULL, it will be taken as the number of blocks in the relation. The
 * return value is the number of blocks successfully prewarmed.
 */
Datum
pg_prewarm(PG_FUNCTION_ARGS)
{
	Oid			relOid;
	text	   *forkName;
	text	   *type;
	int64		first_block;
	int64		last_block;
	int64		nblocks;
	int64		blocks_done = 0;
	int64		block;
	Relation	rel;
	ForkNumber	forkNumber;
	char	   *forkString;
	char	   *ttype;
	PrewarmType ptype;
	AclResult	aclresult;

	/* Basic sanity checking. */
	if (PG_ARGISNULL(0))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("relation cannot be null")));
	relOid = PG_GETARG_OID(0);
	if (PG_ARGISNULL(1))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 (errmsg("prewarm type cannot be null"))));
	type = PG_GETARG_TEXT_P(1);
	ttype = text_to_cstring(type);
	if (strcmp(ttype, "prefetch") == 0)
		ptype = PREWARM_PREFETCH;
	else if (strcmp(ttype, "read") == 0)
		ptype = PREWARM_READ;
	else if (strcmp(ttype, "buffer") == 0)
		ptype = PREWARM_BUFFER;
	else
	{
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid prewarm type"),
				 errhint("Valid prewarm types are \"prefetch\", \"read\", and \"buffer\".")));
		PG_RETURN_INT64(0);		/* Placate compiler. */
	}
	if (PG_ARGISNULL(2))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 (errmsg("relation fork cannot be null"))));
	forkName = PG_GETARG_TEXT_P(2);
	forkString = text_to_cstring(forkName);
	forkNumber = forkname_to_number(forkString);

	/* Open relation and check privileges. */
	rel = relation_open(relOid, AccessShareLock);
	aclresult = pg_class_aclcheck(relOid, GetUserId(), ACL_SELECT);
	if (aclresult != ACLCHECK_OK)
		aclcheck_error(aclresult, ACL_KIND_CLASS, get_rel_name(relOid));

	/* Check that the fork exists. */
	RelationOpenSmgr(rel);
	if (!smgrexists(rel->rd_smgr, forkNumber))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("fork \"%s\" does not exist for this relation",
						forkString)));

	/* Validate block numbers, or handle nulls. */
	nblocks = RelationGetNumberOfBlocksInFork(rel, forkNumber);
	if (PG_ARGISNULL(3))
		first_block = 0;
	else
	{
		first_block = PG_GETARG_INT64(3);
		if (first_block < 0 || first_block >= nblocks)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("starting block number must be between 0 and " INT64_FORMAT,
							nblocks - 1)));
	}
	if (PG_ARGISNULL(4))
		last_block = nblocks - 1;
	else
	{
		last_block = PG_GETARG_INT64(4);
		if (last_block < 0 || last_block >= nblocks)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("ending block number must be between 0 and " INT64_FORMAT,
							nblocks - 1)));
	}

	/* Now we're ready to do the real work. */
	if (ptype == PREWARM_PREFETCH)
	{
#ifdef USE_PREFETCH

		/*
		 * In prefetch mode, we just hint the OS to read the blocks, but we
		 * don't know whether it really does it, and we don't wait for it to
		 * finish.
		 *
		 * It would probably be better to pass our prefetch requests in chunks
		 * of a megabyte or maybe even a whole segment at a time, but there's
		 * no practical way to do that at present without a gross modularity
		 * violation, so we just do this.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			CHECK_FOR_INTERRUPTS();
			PrefetchBuffer(rel, forkNumber, block);
			++blocks_done;
		}
#else
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("prefetch is not supported by this build")));
#endif
	}
	else if (ptype == PREWARM_READ)
	{
		/*
		 * In read mode, we actually read the blocks, but not into shared
		 * buffers. This is more portable than prefetch mode (it works
		 * everywhere) and is synchronous.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			CHECK_FOR_INTERRUPTS();
			/*
			 * blockbuffer is declared elsewhere in this file -- presumably a
			 * BLCKSZ-sized static scratch buffer; confirm its definition.
			 */
			smgrread(rel->rd_smgr, forkNumber, block, blockbuffer);
			++blocks_done;
		}
	}
	else if (ptype == PREWARM_BUFFER)
	{
		/*
		 * In buffer mode, we actually pull the data into shared_buffers.
		 */
		for (block = first_block; block <= last_block; ++block)
		{
			Buffer		buf;

			CHECK_FOR_INTERRUPTS();
			buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL);
			ReleaseBuffer(buf);
			++blocks_done;
		}
	}

	/* Close relation, release lock. */
	relation_close(rel, AccessShareLock);

	PG_RETURN_INT64(blocks_done);
}
/*
 * RelationTruncate
 *		Physically truncate a relation to the specified number of blocks.
 *
 * This includes getting rid of any buffers for the blocks that are to be
 * dropped.
 */
void
RelationTruncate(Relation rel, BlockNumber nblocks)
{
	bool		fsm;
	bool		vm;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(rel);

	/*
	 * Make sure smgr_targblock etc aren't pointing somewhere past new end
	 */
	rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
	rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
	rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;

	/* Truncate the FSM first if it exists */
	fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
	if (fsm)
		FreeSpaceMapTruncateRel(rel, nblocks);

	/* Truncate the visibility map too if it exists. */
	vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
	if (vm)
		visibilitymap_truncate(rel, nblocks);

	/*
	 * We WAL-log the truncation before actually truncating, which means
	 * trouble if the truncation fails. If we then crash, the WAL replay
	 * likely isn't going to succeed in the truncation either, and cause a
	 * PANIC. It's tempting to put a critical section here, but that cure
	 * would be worse than the disease. It would turn a usually harmless
	 * failure to truncate, that might spell trouble at WAL replay, into a
	 * certain PANIC.
	 */
	if (!rel->rd_istemp)
	{
		/*
		 * Make an XLOG entry reporting the file truncation.
		 */
		XLogRecPtr	lsn;
		XLogRecData rdata;
		xl_smgr_truncate xlrec;

		xlrec.blkno = nblocks;
		xlrec.rnode = rel->rd_node;

		rdata.data = (char *) &xlrec;
		rdata.len = sizeof(xlrec);
		rdata.buffer = InvalidBuffer;
		rdata.next = NULL;

		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE, &rdata);

		/*
		 * Flush, because otherwise the truncation of the main relation might
		 * hit the disk before the WAL record, and the truncation of the FSM
		 * or visibility map. If we crashed during that window, we'd be left
		 * with a truncated heap, but the FSM or visibility map would still
		 * contain entries for the non-existent heap pages.
		 */
		if (fsm || vm)
			XLogFlush(lsn);
	}

	/* Do the real work */
	smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks, rel->rd_istemp);
}
static void ReadBlocks(int filenum) { FILE *file; char record_type; char *dbname; Oid record_filenode; ForkNumber record_forknum; BlockNumber record_blocknum; BlockNumber record_range; int log_level = DEBUG3; Oid relOid = InvalidOid; Relation rel = NULL; bool skip_relation = false; bool skip_fork = false; bool skip_block = false; BlockNumber nblocks = 0; BlockNumber blocks_restored = 0; const char *filepath; /* * If this condition changes, then this code, and the code in the writer * will need to be changed; especially the format specifiers in log and * error messages. */ StaticAssertStmt(MaxBlockNumber == 0xFFFFFFFE, "Code may need review."); filepath = getSavefileName(filenum); file = fileOpen(filepath, PG_BINARY_R); dbname = readDBName(file, filepath); /* * When restoring global objects, the dbname is zero-length string, and non- * zero length otherwise. And filenum is never expected to be smaller than 1. */ Assert(filenum >= 1); Assert(filenum == 1 ? strlen(dbname) == 0 : strlen(dbname) > 0); /* To restore the global objects, use default database */ BackgroundWorkerInitializeConnection(filenum == 1 ? guc_default_database : dbname, NULL); SetCurrentStatementStartTimestamp(); StartTransactionCommand(); SPI_connect(); PushActiveSnapshot(GetTransactionSnapshot()); pgstat_report_activity(STATE_RUNNING, "restoring buffers"); /* * Note that in case of a read error, we will leak relcache entry that we may * currently have open. In case of EOF, we close the relation after the loop. */ while (fileRead(&record_type, 1, file, true, filepath)) { /* * If we want to process the signals, this seems to be the best place * to do it. Generally the backends refrain from processing config file * while in transaction, but that's more for the fear of allowing GUC * changes to affect expression evaluation, causing different results * for the same expression in a transaction. Since this worker is not * processing any queries, it is okay to process the config file here. 
* * Even though it's okay to process SIGHUP here, doing so doesn't add * any value. The only reason we might want to process config file here * would be to allow the user to interrupt the BlockReader's operation * by changing this extenstion's GUC parameter. But the user can do that * anyway, using SIGTERM or pg_terminate_backend(). */ /* Stop processing the save-file if the Postmaster wants us to die. */ if (got_sigterm) break; ereport(log_level, (errmsg("record type %x - %c", record_type, record_type))); switch (record_type) { case 'r': { /* Close the previous relation, if any. */ if (rel) { relation_close(rel, AccessShareLock); rel = NULL; } record_forknum = InvalidForkNumber; record_blocknum = InvalidBlockNumber; nblocks = 0; fileRead(&record_filenode, sizeof(Oid), file, false, filepath); relOid = GetRelOid(record_filenode); ereport(log_level, (errmsg("processing filenode %u, relation %u", record_filenode, relOid))); /* * If the relation has been rewritten/dropped since we saved it, * just skip it and process the next relation. 
*/ if (relOid == InvalidOid) skip_relation = true; else { skip_relation = false; /* Open the relation */ rel = relation_open(relOid, AccessShareLock); RelationOpenSmgr(rel); } } break; case 'f': { record_blocknum = InvalidBlockNumber; nblocks = 0; fileRead(&record_forknum, sizeof(ForkNumber), file, false, filepath); if (skip_relation) continue; if (rel == NULL) ereport(ERROR, (errmsg("found a fork record without a preceeding relation record"))); ereport(log_level, (errmsg("processing fork %d", record_forknum))); if (!smgrexists(rel->rd_smgr, record_forknum)) skip_fork = true; else { skip_fork = false; nblocks = RelationGetNumberOfBlocksInFork(rel, record_forknum); } } break; case 'b': { if (record_forknum == InvalidForkNumber) ereport(ERROR, (errmsg("found a block record without a preceeding fork record"))); fileRead(&record_blocknum, sizeof(BlockNumber), file, false, filepath); if (skip_relation || skip_fork) continue; /* * Don't try to read past the file; the file may have been shrunk * by a vaccum/truncate operation. 
*/ if (record_blocknum >= nblocks) { ereport(log_level, (errmsg("reader %d skipping block filenode %u forknum %d blocknum %u", filenum, record_filenode, record_forknum, record_blocknum))); skip_block = true; continue; } else { Buffer buf; skip_block = false; ereport(log_level, (errmsg("reader %d reading block filenode %u forknum %d blocknum %u", filenum, record_filenode, record_forknum, record_blocknum))); buf = ReadBufferExtended(rel, record_forknum, record_blocknum, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_restored; } } break; case 'N': { BlockNumber block; Assert(record_blocknum != InvalidBlockNumber); if (record_blocknum == InvalidBlockNumber) ereport(ERROR, (errmsg("found a block range record without a preceeding block record"))); fileRead(&record_range, sizeof(int), file, false, filepath); if (skip_relation || skip_fork || skip_block) continue; ereport(log_level, (errmsg("reader %d reading range filenode %u forknum %d blocknum %u range %u", filenum, record_filenode, record_forknum, record_blocknum, record_range))); for (block = record_blocknum + 1; block <= (record_blocknum + record_range); ++block) { Buffer buf; /* * Don't try to read past the file; the file may have been * shrunk by a vaccum operation. 
*/ if (block >= nblocks) { ereport(log_level, (errmsg("reader %d skipping block range filenode %u forknum %d start %u end %u", filenum, record_filenode, record_forknum, block, record_blocknum + record_range))); break; } buf = ReadBufferExtended(rel, record_forknum, block, RBM_NORMAL, NULL); ReleaseBuffer(buf); ++blocks_restored; } } break; default: { ereport(ERROR, (errmsg("found unexpected save-file marker %x - %c)", record_type, record_type))); Assert(false); } break; } } if (rel) relation_close(rel, AccessShareLock); ereport(LOG, (errmsg("Block Reader %d: restored %u blocks", filenum, blocks_restored))); SPI_finish(); PopActiveSnapshot(); CommitTransactionCommand(); pgstat_report_activity(STATE_IDLE, NULL); fileClose(file, filepath); /* Remove the save-file */ if (remove(filepath) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("error removing file \"%s\" : %m", filepath))); }
/*
 * Read a FSM page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the FSM file is extended.
 */
static Buffer
fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
{
	BlockNumber blkno = fsm_logical_to_physical(addr);
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If we haven't cached the size of the FSM yet, check it first. Also
	 * recheck if the requested block seems to be past end, since our cached
	 * value might be stale. (We send smgr inval messages on truncation, but
	 * not on extension.)
	 */
	if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber ||
		blkno >= rel->rd_smgr->smgr_fsm_nblocks)
	{
		if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
			rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr,
														 FSM_FORKNUM);
		else
			rel->rd_smgr->smgr_fsm_nblocks = 0;
	}

	/*
	 * Handle requests beyond EOF: either grow the fork so the page exists,
	 * or tell the caller there is nothing there.
	 */
	if (blkno >= rel->rd_smgr->smgr_fsm_nblocks)
	{
		if (extend)
			fsm_extend(rel, blkno + 1);
		else
			return InvalidBuffer;
	}

	/*
	 * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
	 * information is not accurate anyway, so it's better to clear corrupt
	 * pages than error out. Since the FSM changes are not WAL-logged, the
	 * so-called torn page problem on crash can lead to pages with corrupt
	 * headers, for example.
	 *
	 * The initialize-the-page part is trickier than it looks, because of the
	 * possibility of multiple backends doing this concurrently, and our
	 * desire to not uselessly take the buffer lock in the normal path where
	 * the page is OK. We must take the lock to initialize the page, so
	 * recheck page newness after we have the lock, in case someone else
	 * already did it. Also, because we initially check PageIsNew with no
	 * lock, it's possible to fall through and return the buffer while
	 * someone else is still initializing the page (i.e., we might see
	 * pd_upper as set but other page header fields are still zeroes). This
	 * is harmless for callers that will take a buffer lock themselves, but
	 * some callers inspect the page without any lock at all. The latter is
	 * OK only so long as it doesn't depend on the page header having correct
	 * contents. Current usage is safe because PageGetContents() does not
	 * require that.
	 */
	buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
	if (PageIsNew(BufferGetPage(buf)))
	{
		/* Double-checked initialization: recheck newness under the lock. */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(BufferGetPage(buf)))
			PageInit(BufferGetPage(buf), BLCKSZ, 0);
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	return buf;
}
/*
 * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function
 * sends before they access the FSM again.
 *
 * nblocks is the new size of the heap.
 */
void
FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
{
	BlockNumber new_nfsmblocks;
	FSMAddress	first_removed_address;
	uint16		first_removed_slot;
	Buffer		buf;

	RelationOpenSmgr(rel);

	/*
	 * If no FSM has been created yet for this relation, there's nothing to
	 * truncate.
	 */
	if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
		return;

	/* Get the location in the FSM of the first removed heap block */
	first_removed_address = fsm_get_location(nblocks, &first_removed_slot);

	/*
	 * Zero out the tail of the last remaining FSM page. If the slot
	 * representing the first removed heap block is at a page boundary, as
	 * the first slot on the FSM page that first_removed_address points to,
	 * we can just truncate that page altogether.
	 */
	if (first_removed_slot > 0)
	{
		buf = fsm_readbuf(rel, first_removed_address, false);
		if (!BufferIsValid(buf))
			return;				/* nothing to do; the FSM was already smaller */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		/* NO EREPORT(ERROR) from here till changes are logged */
		START_CRIT_SECTION();

		fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);

		/*
		 * Truncation of a relation is WAL-logged at a higher-level, and we
		 * will be called at WAL replay. But if checksums are enabled, we
		 * need to still write a WAL record to protect against a torn page,
		 * if the page is flushed to disk before the truncation WAL record.
		 * We cannot use MarkBufferDirtyHint here, because that will not
		 * dirty the page during recovery.
		 */
		MarkBufferDirty(buf);
		if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
			log_newpage_buffer(buf, false);

		END_CRIT_SECTION();

		UnlockReleaseBuffer(buf);

		/* The partially-cleared page is kept; drop everything after it. */
		new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
	}
	else
	{
		new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
		if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
			return;				/* nothing to do; the FSM was already smaller */
	}

	/* Truncate the unused FSM pages, and send smgr inval message */
	smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks);

	/*
	 * We might as well update the local smgr_fsm_nblocks setting.
	 * smgrtruncate sent an smgr cache inval message, which will cause other
	 * backends to invalidate their copy of smgr_fsm_nblocks, and this one
	 * too at the next command boundary. But this ensures it isn't outright
	 * wrong until then.
	 */
	if (rel->rd_smgr)
		rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks;

	/*
	 * Update upper-level FSM pages to account for the truncation. This is
	 * important because the just-truncated pages were likely marked as
	 * all-free, and would be preferentially selected.
	 */
	FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);
}
/*
 * smgr_redo
 *		WAL replay for smgr resource-manager records (file create/truncate).
 */
void
smgr_redo(XLogRecPtr lsn, XLogRecord *record)
{
	uint8		info = record->xl_info & ~XLR_INFO_MASK;

	/* Backup blocks are not used in smgr records */
	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

	if (info == XLOG_SMGR_CREATE)
	{
		xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
		SMgrRelation reln;

		reln = smgropen(xlrec->rnode, InvalidBackendId);
		/* isRedo = true: tolerate the file already existing */
		smgrcreate(reln, xlrec->forkNum, true);
	}
	else if (info == XLOG_SMGR_TRUNCATE)
	{
		xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
		SMgrRelation reln;
		Relation	rel;

		reln = smgropen(xlrec->rnode, InvalidBackendId);

		/*
		 * Forcibly create relation if it doesn't exist (which suggests that
		 * it was dropped somewhere later in the WAL sequence). As in
		 * XLogReadBuffer, we prefer to recreate the rel and replay the log
		 * as best we can until the drop is seen.
		 */
		smgrcreate(reln, MAIN_FORKNUM, true);

		/*
		 * Before we perform the truncation, update minimum recovery point to
		 * cover this WAL record. Once the relation is truncated, there's no
		 * going back. The buffer manager enforces the WAL-first rule for
		 * normal updates to relation files, so that the minimum recovery
		 * point is always updated before the corresponding change in the
		 * data file is flushed to disk. We have to do the same manually
		 * here.
		 *
		 * Doing this before the truncation means that if the truncation
		 * fails for some reason, you cannot start up the system even after
		 * restart, until you fix the underlying situation so that the
		 * truncation will succeed. Alternatively, we could update the
		 * minimum recovery point after truncation, but that would leave a
		 * small window where the WAL-first rule could be violated.
		 */
		XLogFlush(lsn);

		smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);

		/* Also tell xlogutils.c about it */
		XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);

		/* Truncate FSM and VM too */
		rel = CreateFakeRelcacheEntry(xlrec->rnode);

		if (smgrexists(reln, FSM_FORKNUM))
			FreeSpaceMapTruncateRel(rel, xlrec->blkno);
		if (smgrexists(reln, VISIBILITYMAP_FORKNUM))
			visibilitymap_truncate(rel, xlrec->blkno);

		FreeFakeRelcacheEntry(rel);
	}
	else
		elog(PANIC, "smgr_redo: unknown op code %u", info);
}