/*
 * XLogReadBufferForRedo
 *		Read a page during XLOG replay
 *
 * Reads a block referenced by a WAL record into shared buffer cache, and
 * determines what needs to be done to redo the changes to it.  If the WAL
 * record includes a full-page image of the page, it is restored.
 *
 * 'record' is the WAL record being replayed.  Its LSN is compared with the
 * page's LSN to determine if the record has already been replayed; that
 * comparison is made by XLogReadBufferForRedoExtended, not here.
 * (NOTE(review): presumably the LSN used is record->EndRecPtr, which is what
 * the redo routines stamp on pages -- confirm in the Extended variant.)
 * 'block_id' is the ID number the block was registered with, when the WAL
 * record was created.
 * 'buf' is an output parameter that receives the buffer.
 *
 * Returns one of the following:
 *
 *	BLK_NEEDS_REDO	- changes from the WAL record need to be applied
 *	BLK_DONE		- block doesn't need replaying
 *	BLK_RESTORED	- block was restored from a full-page image included in
 *					  the record
 *	BLK_NOTFOUND	- block was not found (because it was truncated away by
 *					  an operation later in the WAL stream)
 *
 * On return, the buffer is locked in exclusive-mode, and returned in *buf.
 * Note that the buffer is locked and returned even if it doesn't need
 * replaying.  (Getting the buffer lock is not really necessary during
 * single-process crash recovery, but some subroutines such as MarkBufferDirty
 * will complain if we don't have the lock.  In hot standby mode it's
 * definitely necessary.)
 *
 * Note: when a backup block is available in XLOG, we restore it
 * unconditionally, even if the page in the database appears newer.  This is
 * to protect ourselves against database pages that were partially or
 * incorrectly written during a crash.  We assume that the XLOG data must be
 * good because it has passed a CRC check, while the database page might not
 * be.  This will force us to replay all subsequent modifications of the page
 * that appear in XLOG, rather than possibly ignoring them as already
 * applied, but that's not a huge drawback.
 *
 * This is a convenience wrapper around XLogReadBufferForRedoExtended that
 * always passes RBM_NORMAL as the read mode and 'false' for the boolean
 * argument (callers that need a cleanup lock call the Extended variant
 * directly with 'true').
 */
XLogRedoAction
XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id,
					  Buffer *buf)
{
	return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL,
										 false, buf);
}
/*
 * XLogInitBufferForRedo
 *		Pin and lock a buffer referenced by a WAL record, for the purpose of
 *		re-initializing it.
 *
 * 'record' is the WAL record being replayed and 'block_id' is the ID the
 * block was registered with in that record.
 *
 * The block is read through XLogReadBufferForRedoExtended in
 * RBM_ZERO_AND_LOCK mode.  The redo-action code that call returns is
 * deliberately discarded; only the buffer is handed back to the caller.
 */
Buffer
XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id)
{
	Buffer		target;

	(void) XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK,
										 false, &target);

	return target;
}
/*
 * XLogReadBufferForRedo
 *		Read a page during XLOG replay
 *
 * Reads a block referenced by a WAL record into shared buffer cache, and
 * determines what needs to be done to redo the changes to it.  If the WAL
 * record includes a full-page image of the page, it is restored.
 *
 * 'lsn' is the LSN of the record being replayed.  It is compared with the
 * page's LSN to determine if the record has already been replayed.
 * 'record' is the WAL record being replayed.
 * 'rnode' and 'blkno' point to the block being replayed (main fork number
 * is implied, use XLogReadBufferForRedoExtended for other forks).
 * 'block_index' identifies the backup block in the record for the page.
 * 'buf' is an output parameter that receives the buffer.
 *
 * Returns one of the following:
 *
 *	BLK_NEEDS_REDO	- changes from the WAL record need to be applied
 *	BLK_DONE		- block doesn't need replaying
 *	BLK_RESTORED	- block was restored from a full-page image included in
 *					  the record
 *	BLK_NOTFOUND	- block was not found (because it was truncated away by
 *					  an operation later in the WAL stream)
 *
 * On return, the buffer is locked in exclusive-mode, and returned in *buf.
 * Note that the buffer is locked and returned even if it doesn't need
 * replaying.  (Getting the buffer lock is not really necessary during
 * single-process crash recovery, but some subroutines such as MarkBufferDirty
 * will complain if we don't have the lock.  In hot standby mode it's
 * definitely necessary.)
 *
 * This is a convenience wrapper: it forwards to
 * XLogReadBufferForRedoExtended with MAIN_FORKNUM, RBM_NORMAL and 'false'
 * for the boolean argument (presumably "get cleanup lock" -- confirm
 * against the Extended variant's declaration).
 *
 * NOTE(review): another definition named XLogReadBufferForRedo, taking an
 * XLogReaderState and a block ID, also appears in this file.  The two
 * signatures cannot coexist in one translation unit; presumably they belong
 * to different revisions -- confirm which one is meant to be compiled.
 */
XLogRedoAction
XLogReadBufferForRedo(XLogRecPtr lsn, XLogRecord *record, int block_index,
					  RelFileNode rnode, BlockNumber blkno,
					  Buffer *buf)
{
	return XLogReadBufferForRedoExtended(lsn, record, block_index,
										 rnode, MAIN_FORKNUM, blkno,
										 RBM_NORMAL, false, buf);
}
/*
 * btree_xlog_vacuum
 *		Replay a btree VACUUM record.
 *
 * 'record' is the WAL record being replayed; block 0 of the record is the
 * page that was vacuumed.
 *
 * Replay has two phases:
 *	1. When hot standby queries may be running, take and immediately release
 *	   a cleanup lock on every block between the record's lastBlockVacuumed
 *	   and the target block, proving none of them is pinned.
 *	2. Take a cleanup lock on the target page and, if the page needs redo,
 *	   delete the listed line pointers, clear BTP_HAS_GARBAGE and stamp the
 *	   page with the record's end LSN.
 */
static void
btree_xlog_vacuum(XLogReaderState *record)
{
	xl_btree_vacuum *vacrec = (xl_btree_vacuum *) XLogRecGetData(record);
	XLogRecPtr	end_lsn = record->EndRecPtr;
	Buffer		buffer;

	/*
	 * If queries might be active, every leaf page lying between
	 * lastBlockVacuumed and the current block must be shown to be unpinned.
	 * Otherwise replay of the VACUUM could go on to remove heap tuples while
	 * index scans "in flight" still point at them (see nbtree/README).
	 *
	 * It might be worth checking whether any backends are actually running;
	 * if there are none, this whole step could be skipped.
	 *
	 * VACUUM may visit leaf pages out of order, so a record can carry
	 * lastBlockVacuumed >= block.  That is not an error; it simply means
	 * there is nothing to do here.
	 *
	 * Because the whole range is touched, non-leaf pages and empty (all-zero)
	 * pages get locked as well.  Avoiding that does not seem worth the
	 * complexity.  What does matter is that HotStandbyActiveInReplay() never
	 * returns true before the database is consistent, so there is no danger
	 * of reading still-corrupt blocks here during crash recovery.
	 */
	if (HotStandbyActiveInReplay())
	{
		RelFileNode target_rnode;
		BlockNumber target_blkno;
		BlockNumber cur_blkno;

		XLogRecGetBlockTag(record, 0, &target_rnode, NULL, &target_blkno);

		cur_blkno = vacrec->lastBlockVacuumed + 1;
		while (cur_blkno < target_blkno)
		{
			/*
			 * RBM_NORMAL_NO_LOG is used because an all-zero page is not an
			 * error condition here: the original btvacuumpage scan skipped
			 * such pages, recording them in the FSM without initializing
			 * them, so replay must not complain about them either.  (The
			 * cleanup lock could be skipped when PageIsNew, but testing for
			 * that is probably not worth the cycles.)
			 *
			 * XXX reading the block is not really required; all we need is
			 * confirmation that it is unpinned.  With a dedicated buffer
			 * manager entry point, a block absent from shared_buffers could
			 * be treated as unpinned without any read.
			 */
			buffer = XLogReadBufferExtended(target_rnode, MAIN_FORKNUM,
											cur_blkno, RBM_NORMAL_NO_LOG);
			if (BufferIsValid(buffer))
			{
				LockBufferForCleanup(buffer);
				UnlockReleaseBuffer(buffer);
			}

			cur_blkno++;
		}
	}

	/*
	 * As in btvacuumpage(), a cleanup lock is required on every leaf page.
	 * See nbtree/README for details.
	 */
	if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
		== BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *blkdata = XLogRecGetBlockData(record, 0, &datalen);
		Page		leafpage = (Page) BufferGetPage(buffer);
		BTPageOpaque leafopaque;

		if (datalen > 0)
		{
			/* the block data is an array of offsets to delete */
			OffsetNumber *first = (OffsetNumber *) blkdata;
			OffsetNumber *past_last = (OffsetNumber *) ((char *) blkdata + datalen);

			if ((past_last - first) > 0)
				PageIndexMultiDelete(leafpage, first, past_last - first);
		}

		/*
		 * The page no longer holds any LP_DEAD items --- see comments in
		 * _bt_delitems_vacuum().
		 */
		leafopaque = (BTPageOpaque) PageGetSpecialPointer(leafpage);
		leafopaque->btpo_flags &= ~BTP_HAS_GARBAGE;

		PageSetLSN(leafpage, end_lsn);
		MarkBufferDirty(buffer);
	}

	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);
}
/*
 * hash_xlog_delete
 *		Replay a delete operation of a hash index.
 *
 * 'record' is the WAL record being replayed.  Block 1 is the page the
 * tuples are deleted from; block 0 is the primary bucket page and is only
 * read when the delete target is not itself the primary bucket page.
 */
static void
hash_xlog_delete(XLogReaderState *record)
{
	xl_hash_delete *delrec = (xl_hash_delete *) XLogRecGetData(record);
	XLogRecPtr	end_lsn = record->EndRecPtr;
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		deletebuf;
	XLogRedoAction action;

	/*
	 * A cleanup lock on the primary bucket page must be held before the
	 * actual replay starts.  That guarantees no scan can begin, and none can
	 * already be in progress, while this operation is replayed; a concurrent
	 * scan could otherwise miss records or return the same record more than
	 * once.
	 */
	if (!delrec->is_primary_bucket_page)
	{
		/*
		 * The return value is irrelevant: bucketbuf is read purely to hold
		 * a cleanup lock on the primary bucket page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
											 &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &deletebuf);
	}
	else
	{
		/* the delete target is the primary bucket page itself */
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true,
											   &deletebuf);
	}

	/* replay the record for deleting entries in bucket page */
	if (action == BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *blkdata = XLogRecGetBlockData(record, 1, &datalen);
		Page		delpage = (Page) BufferGetPage(deletebuf);

		if (datalen > 0)
		{
			/* the block data is an array of offsets to delete */
			OffsetNumber *first = (OffsetNumber *) blkdata;
			OffsetNumber *past_last = (OffsetNumber *) ((char *) blkdata + datalen);

			if ((past_last - first) > 0)
				PageIndexMultiDelete(delpage, first, past_last - first);
		}

		/*
		 * Clear the "has dead tuples" marking only when the record's
		 * clear_dead_marking flag asks for it.  See comments in
		 * hashbucketcleanup() for details.
		 */
		if (delrec->clear_dead_marking)
		{
			HashPageOpaque delopaque = (HashPageOpaque) PageGetSpecialPointer(delpage);

			delopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
		}

		PageSetLSN(delpage, end_lsn);
		MarkBufferDirty(deletebuf);
	}

	if (BufferIsValid(deletebuf))
		UnlockReleaseBuffer(deletebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);
}
/*
 * hash_xlog_squeeze_page
 *		Replay the squeeze page operation of a hash index.
 *
 * 'record' is the WAL record being replayed.  The block references it uses
 * are:
 *
 *	block 0 - primary bucket page; read only to hold a cleanup lock, and only
 *			  when it is not the same page as the write page
 *	block 1 - page that receives the moved tuples ("write" page)
 *	block 2 - overflow page being freed; it is re-initialized
 *	block 3 - page previous to the freed overflow page; touched only when it
 *			  is not the same page as the write page
 *	block 4 - page next to the freed overflow page (optional)
 *	block 5 - bitmap page; the freed page's bit is cleared
 *	block 6 - meta page (optional); hashm_firstfree is updated
 *
 * Each modified page is stamped with the record's end LSN and marked dirty.
 */
static void
hash_xlog_squeeze_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record);
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		writebuf;
	Buffer		ovflbuf;
	Buffer		prevbuf = InvalidBuffer;
	Buffer		mapbuf;
	XLogRedoAction action;

	/*
	 * Ensure we have a cleanup lock on primary bucket page before we start
	 * with the actual replay operation.  This is to ensure that neither a
	 * scan can start nor a scan can be already-in-progress during the replay
	 * of this operation.  If we allow scans during this operation, then they
	 * can miss some records or show the same record multiple times.
	 */
	if (xldata->is_prim_bucket_same_wrt)
		action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf);
	else
	{
		/*
		 * we don't care for return value as the purpose of reading bucketbuf
		 * is to ensure a cleanup lock on primary bucket page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf);

		action = XLogReadBufferForRedo(record, 1, &writebuf);
	}

	/* replay the record for adding entries in overflow buffer */
	if (action == BLK_NEEDS_REDO)
	{
		Page		writepage;
		char	   *begin;
		char	   *data;
		Size		datalen;
		uint16		ninserted = 0;

		data = begin = XLogRecGetBlockData(record, 1, &datalen);

		writepage = (Page) BufferGetPage(writebuf);

		if (xldata->ntups > 0)
		{
			/*
			 * Block data layout: an array of ntups target offsets, followed
			 * by the index tuples themselves, each occupying a MAXALIGN'd
			 * size.
			 */
			OffsetNumber *towrite = (OffsetNumber *) data;

			/* skip past the offsets array to the first tuple */
			data += sizeof(OffsetNumber) * xldata->ntups;

			while (data - begin < datalen)
			{
				IndexTuple	itup = (IndexTuple) data;
				Size		itemsz;
				OffsetNumber l;

				itemsz = IndexTupleDSize(*itup);
				itemsz = MAXALIGN(itemsz);

				/* advance to the next tuple before inserting this one */
				data += itemsz;

				l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false);
				if (l == InvalidOffsetNumber)
					elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes",
						 (int) itemsz);

				ninserted++;
			}
		}

		/*
		 * number of tuples inserted must be same as requested in REDO record.
		 */
		Assert(ninserted == xldata->ntups);

		/*
		 * if the page on which we are adding tuples is the page previous to
		 * the freed overflow page, then update its nextblkno.
		 */
		if (xldata->is_prev_bucket_same_wrt)
		{
			HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage);

			writeopaque->hasho_nextblkno = xldata->nextblkno;
		}

		PageSetLSN(writepage, lsn);
		MarkBufferDirty(writebuf);
	}

	/* replay the record for initializing overflow buffer */
	if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO)
	{
		Page		ovflpage;

		ovflpage = BufferGetPage(ovflbuf);

		_hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));

		PageSetLSN(ovflpage, lsn);
		MarkBufferDirty(ovflbuf);
	}
	if (BufferIsValid(ovflbuf))
		UnlockReleaseBuffer(ovflbuf);

	/*
	 * replay the record for page previous to the freed overflow page.  When
	 * that page is the write page, its link was already updated above and
	 * block 3 is not read at all (prevbuf stays InvalidBuffer).
	 */
	if (!xldata->is_prev_bucket_same_wrt &&
		XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO)
	{
		Page		prevpage = BufferGetPage(prevbuf);
		HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage);

		prevopaque->hasho_nextblkno = xldata->nextblkno;

		PageSetLSN(prevpage, lsn);
		MarkBufferDirty(prevbuf);
	}
	if (BufferIsValid(prevbuf))
		UnlockReleaseBuffer(prevbuf);

	/* replay the record for page next to the freed overflow page */
	if (XLogRecHasBlockRef(record, 4))
	{
		Buffer		nextbuf;

		if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO)
		{
			Page		nextpage = BufferGetPage(nextbuf);
			HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage);

			nextopaque->hasho_prevblkno = xldata->prevblkno;

			PageSetLSN(nextpage, lsn);
			MarkBufferDirty(nextbuf);
		}
		if (BufferIsValid(nextbuf))
			UnlockReleaseBuffer(nextbuf);
	}

	/* the write page and primary bucket page are held until this point */
	if (BufferIsValid(writebuf))
		UnlockReleaseBuffer(writebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);

	/*
	 * Note: in normal operation, we'd update the bitmap and meta page while
	 * still holding lock on the primary bucket page and overflow pages.  But
	 * during replay it's not necessary to hold those locks, since no other
	 * index updates can be happening concurrently.
	 */
	/* replay the record for bitmap page */
	if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO)
	{
		Page		mappage = (Page) BufferGetPage(mapbuf);
		uint32	   *freep = NULL;
		char	   *data;
		uint32	   *bitmap_page_bit;
		Size		datalen;

		freep = HashPageGetBitmap(mappage);

		/* block data holds the number of the bit to clear in the bitmap */
		data = XLogRecGetBlockData(record, 5, &datalen);
		bitmap_page_bit = (uint32 *) data;

		CLRBIT(freep, *bitmap_page_bit);

		PageSetLSN(mappage, lsn);
		MarkBufferDirty(mapbuf);
	}
	if (BufferIsValid(mapbuf))
		UnlockReleaseBuffer(mapbuf);

	/* replay the record for meta page */
	if (XLogRecHasBlockRef(record, 6))
	{
		Buffer		metabuf;

		if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO)
		{
			HashMetaPage metap;
			Page		page;
			char	   *data;
			uint32	   *firstfree_ovflpage;
			Size		datalen;

			/* block data holds the new value for hashm_firstfree */
			data = XLogRecGetBlockData(record, 6, &datalen);
			firstfree_ovflpage = (uint32 *) data;

			page = BufferGetPage(metabuf);
			metap = HashPageGetMeta(page);
			metap->hashm_firstfree = *firstfree_ovflpage;

			PageSetLSN(page, lsn);
			MarkBufferDirty(metabuf);
		}
		if (BufferIsValid(metabuf))
			UnlockReleaseBuffer(metabuf);
	}
}
/*
 * hash_xlog_move_page_contents
 *		Replay the move of page contents for the squeeze operation of a hash
 *		index.
 *
 * 'record' is the WAL record being replayed.  Block 1 is the page that
 * receives the moved tuples, block 2 is the page they are deleted from, and
 * block 0 is the primary bucket page, read only when it is not the same
 * page as the write page.
 */
static void
hash_xlog_move_page_contents(XLogReaderState *record)
{
	xl_hash_move_page_contents *moverec = (xl_hash_move_page_contents *) XLogRecGetData(record);
	XLogRecPtr	end_lsn = record->EndRecPtr;
	Buffer		bucketbuf = InvalidBuffer;
	Buffer		writebuf = InvalidBuffer;
	Buffer		deletebuf = InvalidBuffer;
	XLogRedoAction write_action;

	/*
	 * A cleanup lock on the primary bucket page must be held before the
	 * actual replay starts.  That guarantees no scan can begin, and none can
	 * already be in progress, while this operation is replayed; a concurrent
	 * scan could otherwise miss records or return the same record more than
	 * once.
	 */
	if (!moverec->is_prim_bucket_same_wrt)
	{
		/*
		 * The return value is irrelevant: bucketbuf is read purely to hold
		 * a cleanup lock on the primary bucket page.
		 */
		(void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
											 &bucketbuf);

		write_action = XLogReadBufferForRedo(record, 1, &writebuf);
	}
	else
	{
		/* the write page is the primary bucket page itself */
		write_action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL,
													 true, &writebuf);
	}

	/* replay the record for adding entries in overflow buffer */
	if (write_action == BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *begin = XLogRecGetBlockData(record, 1, &datalen);
		char	   *cursor = begin;
		Page		writepage = (Page) BufferGetPage(writebuf);
		uint16		ninserted = 0;

		if (moverec->ntups > 0)
		{
			/*
			 * Block data layout: an array of ntups target offsets, followed
			 * by the index tuples themselves, each occupying a MAXALIGN'd
			 * size.
			 */
			OffsetNumber *towrite = (OffsetNumber *) cursor;

			cursor += sizeof(OffsetNumber) * moverec->ntups;

			for (; cursor - begin < datalen; ninserted++)
			{
				IndexTuple	itup = (IndexTuple) cursor;
				Size		itemsz = MAXALIGN(IndexTupleDSize(*itup));
				OffsetNumber placed;

				/* step over this tuple before inserting it */
				cursor += itemsz;

				placed = PageAddItem(writepage, (Item) itup, itemsz,
									 towrite[ninserted], false, false);
				if (placed == InvalidOffsetNumber)
					elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes",
						 (int) itemsz);
			}
		}

		/*
		 * The count of tuples inserted has to match what the REDO record
		 * asked for.
		 */
		Assert(ninserted == moverec->ntups);

		PageSetLSN(writepage, end_lsn);
		MarkBufferDirty(writebuf);
	}

	/* replay the record for deleting entries from overflow buffer */
	if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO)
	{
		Size		dellen;
		char	   *deldata = XLogRecGetBlockData(record, 2, &dellen);
		Page		delpage = (Page) BufferGetPage(deletebuf);

		if (dellen > 0)
		{
			/* the block data is an array of offsets to delete */
			OffsetNumber *first = (OffsetNumber *) deldata;
			OffsetNumber *past_last = (OffsetNumber *) ((char *) deldata + dellen);

			if ((past_last - first) > 0)
				PageIndexMultiDelete(delpage, first, past_last - first);
		}

		PageSetLSN(delpage, end_lsn);
		MarkBufferDirty(deletebuf);
	}

	/*
	 * Replay is finished, so the buffers can go.  Locks are dropped only at
	 * the very end so that the primary bucket page stays locked for the
	 * whole operation.  The write buffer could be released as soon as its
	 * part is done when it is not the primary bucket page, but that does not
	 * seem worth the extra code.
	 */
	if (BufferIsValid(deletebuf))
		UnlockReleaseBuffer(deletebuf);

	if (BufferIsValid(writebuf))
		UnlockReleaseBuffer(writebuf);

	if (BufferIsValid(bucketbuf))
		UnlockReleaseBuffer(bucketbuf);
}
/*
 * hash_xlog_split_allocate_page
 *		Replay allocation of page for split operation.
 *
 * 'record' is the WAL record being replayed.  The block references it uses
 * are:
 *
 *	block 0 - old bucket page; its flag and hasho_prevblkno are updated
 *	block 1 - new bucket page; always re-initialized from scratch
 *	block 2 - meta page; hashm_maxbucket is updated and, depending on
 *			  xlrec->flags, the low/high masks and the split-point data
 *
 * The meta page's block data is a packed sequence of uint32 values.  Every
 * value is extracted with memcpy rather than by dereferencing a cast
 * pointer, so no alignment of the block data is assumed.
 */
static void
hash_xlog_split_allocate_page(XLogReaderState *record)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record);
	Buffer		oldbuf;
	Buffer		newbuf;
	Buffer		metabuf;
	Size		datalen PG_USED_FOR_ASSERTS_ONLY;
	char	   *data;
	XLogRedoAction action;

	/*
	 * To be consistent with normal operation, here we take cleanup locks on
	 * both the old and new buckets even though there can't be any concurrent
	 * inserts.
	 */

	/* replay the record for old bucket */
	action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf);

	/*
	 * Note that we still update the page even if it was restored from a full
	 * page image, because the special space is not included in the image.
	 */
	if (action == BLK_NEEDS_REDO || action == BLK_RESTORED)
	{
		Page		oldpage;
		HashPageOpaque oldopaque;

		oldpage = BufferGetPage(oldbuf);
		oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage);

		oldopaque->hasho_flag = xlrec->old_bucket_flag;
		oldopaque->hasho_prevblkno = xlrec->new_bucket;

		PageSetLSN(oldpage, lsn);
		MarkBufferDirty(oldbuf);
	}

	/* replay the record for new bucket */
	newbuf = XLogInitBufferForRedo(record, 1);
	_hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket,
				  xlrec->new_bucket_flag, true);
	if (!IsBufferCleanupOK(newbuf))
		elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock");
	MarkBufferDirty(newbuf);
	PageSetLSN(BufferGetPage(newbuf), lsn);

	/*
	 * We could release the lock on the old bucket earlier as well, but we do
	 * it here to be consistent with normal operation.
	 */
	if (BufferIsValid(oldbuf))
		UnlockReleaseBuffer(oldbuf);
	if (BufferIsValid(newbuf))
		UnlockReleaseBuffer(newbuf);

	/*
	 * Note: in normal operation, we'd update the meta page while still
	 * holding lock on the old and new bucket pages.  But during replay it's
	 * not necessary to hold those locks, since no other bucket splits can be
	 * happening concurrently.
	 */

	/* replay the record for metapage changes */
	if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO)
	{
		Page		page;
		HashMetaPage metap;

		page = BufferGetPage(metabuf);
		metap = HashPageGetMeta(page);
		metap->hashm_maxbucket = xlrec->new_bucket;

		data = XLogRecGetBlockData(record, 2, &datalen);

		if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS)
		{
			uint32		lowmask;
			uint32		highmask;

			/*
			 * extract low and high masks.  Both are copied out with memcpy:
			 * the block data is a plain byte buffer, so reading either word
			 * through a cast uint32 pointer could be a misaligned access.
			 */
			memcpy(&lowmask, data, sizeof(uint32));
			memcpy(&highmask, data + sizeof(uint32), sizeof(uint32));

			/* update metapage */
			metap->hashm_lowmask = lowmask;
			metap->hashm_highmask = highmask;

			/* step over the two mask words to the split-point data, if any */
			data += sizeof(uint32) * 2;
		}

		if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT)
		{
			uint32		ovflpoint;
			uint32		ovflpages;

			/* extract information of overflow pages, again via memcpy. */
			memcpy(&ovflpoint, data, sizeof(uint32));
			memcpy(&ovflpages, data + sizeof(uint32), sizeof(uint32));

			/* update metapage */
			metap->hashm_spares[ovflpoint] = ovflpages;
			metap->hashm_ovflpoint = ovflpoint;
		}

		MarkBufferDirty(metabuf);
		PageSetLSN(BufferGetPage(metabuf), lsn);
	}

	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}
/*
 * hash_xlog_vacuum_one_page
 *		Replay a delete operation in a hash index that removes tuples marked
 *		as DEAD during index tuple insertion.
 *
 * 'record' is the WAL record being replayed.  Block 0 is the page the dead
 * tuples are removed from; block 1 is the meta page, whose tuple count is
 * reduced by the record's ntuples.
 */
static void
hash_xlog_vacuum_one_page(XLogReaderState *record)
{
	xl_hash_vacuum_one_page *vacrec = (xl_hash_vacuum_one_page *) XLogRecGetData(record);
	XLogRecPtr	end_lsn = record->EndRecPtr;
	Buffer		buffer;
	Buffer		metabuf;

	/*
	 * Any conflict processing has to be done before the page is touched.
	 *
	 * Hash index entries that were marked LP_DEAD and are being removed
	 * during hash index tuple insertion can conflict with standby queries.
	 * One might expect vacuum records to conflict too, but those are already
	 * taken care of: XLOG_HEAP2_CLEANUP_INFO records carry the highest xid
	 * cleaned by the heap vacuum, so conflicts are resolved once when that
	 * record arrives.  From then on, individual hash index vacuum records on
	 * that index cannot raise conflicts.
	 */
	if (InHotStandby)
	{
		RelFileNode rnode;
		TransactionId latestRemovedXid;

		latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record);

		XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL);
		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode);
	}

	/* the target page is read with the cleanup-lock argument set to true */
	if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
		== BLK_NEEDS_REDO)
	{
		Size		datalen;
		char	   *blkdata = XLogRecGetBlockData(record, 0, &datalen);
		Page		targetpage = (Page) BufferGetPage(buffer);
		HashPageOpaque targetopaque;

		/*
		 * NOTE(review): this treats block 0's data as the array of offsets
		 * to delete.  That only works if the insert side registers the
		 * offsets as block data rather than main record data; the insert
		 * side is not visible here -- confirm.
		 */
		if (datalen > 0)
		{
			OffsetNumber *first = (OffsetNumber *) blkdata;
			OffsetNumber *past_last = (OffsetNumber *) ((char *) blkdata + datalen);

			if ((past_last - first) > 0)
				PageIndexMultiDelete(targetpage, first, past_last - first);
		}

		/*
		 * The page no longer contains any LP_DEAD items.  See comments in
		 * _hash_vacuum_one_page() for details.
		 */
		targetopaque = (HashPageOpaque) PageGetSpecialPointer(targetpage);
		targetopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;

		PageSetLSN(targetpage, end_lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);

	/* replay the tuple-count adjustment on the meta page */
	if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO)
	{
		Page		metapage = BufferGetPage(metabuf);
		HashMetaPage metap = HashPageGetMeta(metapage);

		metap->hashm_ntuples -= vacrec->ntuples;

		PageSetLSN(metapage, end_lsn);
		MarkBufferDirty(metabuf);
	}
	if (BufferIsValid(metabuf))
		UnlockReleaseBuffer(metabuf);
}