/* * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout * in the record, from wal into proper tuplebufs. * * Updates can possibly contain a new tuple and the old primary key. */ static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { XLogReaderState *r = buf->record; xl_heap_update *xlrec; ReorderBufferChange *change; char *data; RelFileNode target_node; xlrec = (xl_heap_update *) XLogRecGetData(r); /* only interested in our database */ XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL); if (target_node.dbNode != ctx->slot->data.database) return; /* output plugin doesn't look for this origin, no need to queue */ if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) return; change = ReorderBufferGetChange(ctx->reorder); change->action = REORDER_BUFFER_CHANGE_UPDATE; change->origin_id = XLogRecGetOrigin(r); memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) { Size datalen; Size tuplelen; data = XLogRecGetBlockData(r, 0, &datalen); tuplelen = datalen - SizeOfHeapHeader; change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); DecodeXLogTuple(data, datalen, change->data.tp.newtuple); } if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) { Size datalen; Size tuplelen; /* caution, remaining data in record is not aligned */ data = XLogRecGetData(r) + SizeOfHeapUpdate; datalen = XLogRecGetDataLen(r) - SizeOfHeapUpdate; tuplelen = datalen - SizeOfHeapHeader; change->data.tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); } change->data.tp.clear_toast_afterwards = true; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); }
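/*
 * For reference: DecodeUpdate and the other decode routines in this file
 * hand raw record bytes (an xl_heap_header followed by unaligned tuple
 * data) to DecodeXLogTuple.  Below is a minimal sketch of that helper,
 * reconstructed from how it is called here; it assumes the tuple buffer
 * was already sized via ReorderBufferGetTupleBuf(..., tuplelen) and that
 * tuple->tuple.t_data points at the buffer's data area.  The upstream
 * implementation may differ in detail.
 */
static void
DecodeXLogTuple(char *data, Size len, ReorderBufferTupleBuf *tuple)
{
    xl_heap_header xlhdr;
    int         datalen = len - SizeOfHeapHeader;
    HeapTupleHeader header;

    Assert(datalen >= 0);

    tuple->tuple.t_len = datalen + SizeofHeapTupleHeader;
    header = tuple->tuple.t_data;

    /* not a disk based tuple */
    ItemPointerSetInvalid(&tuple->tuple.t_self);

    /* we can only figure this out after reassembling the transactions */
    tuple->tuple.t_tableOid = InvalidOid;

    /* data is not stored aligned, copy the header to aligned storage */
    memcpy((char *) &xlhdr, data, SizeOfHeapHeader);

    memset(header, 0, SizeofHeapTupleHeader);

    memcpy(((char *) tuple->tuple.t_data) + SizeofHeapTupleHeader,
           data + SizeOfHeapHeader,
           datalen);

    header->t_infomask = xlhdr.t_infomask;
    header->t_infomask2 = xlhdr.t_infomask2;
    header->t_hoff = xlhdr.t_hoff;
}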
/*
 * Parse XLOG_HEAP_CONFIRM from wal into a confirmation change.
 *
 * This is pretty trivial, all the state is essentially already set up by
 * the speculative insertion.
 */
static void
DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
    XLogReaderState *r = buf->record;
    ReorderBufferChange *change;
    RelFileNode target_node;

    /* only interested in our database */
    XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL);
    if (target_node.dbNode != ctx->slot->data.database)
        return;

    /* output plugin doesn't look for this origin, no need to queue */
    if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
        return;

    change = ReorderBufferGetChange(ctx->reorder);
    change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM;
    change->origin_id = XLogRecGetOrigin(r);

    memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode));

    change->data.tp.clear_toast_afterwards = true;

    ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change);
}
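/*
 * The FilterByOrigin() checks above short-circuit decoding of changes from
 * replication origins the output plugin has asked to skip.  A plausible
 * minimal implementation, assuming the usual filter_by_origin_cb plugin
 * callback and its wrapper; treat this as a sketch, not necessarily the
 * exact upstream code.
 */
static inline bool
FilterByOrigin(LogicalDecodingContext *ctx, RepOriginId origin_id)
{
    /* without a filter callback, nothing ever gets filtered out */
    if (ctx->callbacks.filter_by_origin_cb == NULL)
        return false;

    return filter_by_origin_cb_wrapper(ctx, origin_id);
}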
static void btree_xlog_delete(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque opaque; /* * If we have any conflict processing to do, it must happen before we * update the page. * * Btree delete records can conflict with standby queries. You might * think that vacuum records would conflict as well, but we've handled * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid * cleaned by the vacuum of the heap and so we can resolve any conflicts * just once when that arrives. After that we know that no conflicts * exist from individual btree vacuum records on that index. */ if (InHotStandby) { TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record); RelFileNode rnode; XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); } /* * We don't need to take a cleanup lock to apply these changes. See * nbtree/README for details. */ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); if (XLogRecGetDataLen(record) > SizeOfBtreeDelete) { OffsetNumber *unused; unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete); PageIndexMultiDelete(page, unused, xlrec->nitems); } /* * Mark the page as not containing any LP_DEAD items --- see comments * in _bt_delitems_delete(). */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
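/*
 * The pointer arithmetic above ("(char *) xlrec + SizeOfBtreeDelete")
 * assumes the target offset numbers are stored directly after the fixed
 * part of the record.  Roughly, the record layout being decoded here is
 * as follows (a sketch inferred from the usage in this file, including the
 * hnode/nitems accesses in btree_xlog_delete_get_latestRemovedXid below;
 * not necessarily the verbatim header definition):
 */
typedef struct xl_btree_delete
{
    RelFileNode hnode;          /* heap relation the index points at; needed
                                 * for the latestRemovedXid lookups */
    int         nitems;         /* number of offset numbers that follow */

    /* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;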
/*
 * Replay a revmap page extension
 */
static void
brin_xlog_revmap_extend(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    xl_brin_revmap_extend *xlrec;
    Buffer      metabuf;
    Buffer      buf;
    Page        page;
    BlockNumber targetBlk;
    XLogRedoAction action;

    xlrec = (xl_brin_revmap_extend *) XLogRecGetData(record);

    XLogRecGetBlockTag(record, 1, NULL, NULL, &targetBlk);
    Assert(xlrec->targetBlk == targetBlk);

    /* Update the metapage */
    action = XLogReadBufferForRedo(record, 0, &metabuf);
    if (action == BLK_NEEDS_REDO)
    {
        Page        metapg;
        BrinMetaPageData *metadata;

        metapg = BufferGetPage(metabuf);
        metadata = (BrinMetaPageData *) PageGetContents(metapg);

        Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1);
        metadata->lastRevmapPage = xlrec->targetBlk;

        PageSetLSN(metapg, lsn);

        /*
         * Set pd_lower just past the end of the metadata.  This is
         * essential, because without doing so, metadata will be lost if
         * xlog.c compresses the page.  (We must do this here because
         * pre-v11 versions of PG did not set the metapage's pd_lower
         * correctly, so a pg_upgraded index might contain the wrong value.)
         */
        ((PageHeader) metapg)->pd_lower =
            ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) metapg;

        MarkBufferDirty(metabuf);
    }

    /*
     * Re-init the target block as a revmap page.  There's never a full-page
     * image here.
     */
    buf = XLogInitBufferForRedo(record, 1);
    page = (Page) BufferGetPage(buf);
    brin_page_init(page, BRIN_PAGETYPE_REVMAP);

    PageSetLSN(page, lsn);
    MarkBufferDirty(buf);

    UnlockReleaseBuffer(buf);
    if (BufferIsValid(metabuf))
        UnlockReleaseBuffer(metabuf);
}
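/*
 * Block reference 0 is the metapage and block reference 1 the page being
 * added to the revmap; the record's only main-data payload is the target
 * block number, which is what lets the Assert above cross-check the block
 * reference.  Sketch of the record struct as implied by this function (not
 * necessarily verbatim):
 */
typedef struct xl_brin_revmap_extend
{
    BlockNumber targetBlk;      /* block number of the new revmap page */
} xl_brin_revmap_extend;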
/* * redo delete on gist index page to remove tuples marked as DEAD during index * tuple insertion */ static void gistRedoDeleteRecord(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record); Buffer buffer; Page page; /* * If we have any conflict processing to do, it must happen before we * update the page. * * GiST delete records can conflict with standby queries. You might think * that vacuum records would conflict as well, but we've handled that * already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid * cleaned by the vacuum of the heap and so we can resolve any conflicts * just once when that arrives. After that we know that no conflicts * exist from individual gist vacuum records on that index. */ if (InHotStandby) { TransactionId latestRemovedXid = gistRedoDeleteRecordGetLatestRemovedXid(record); RelFileNode rnode; XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); } if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete) { OffsetNumber *todelete; todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete); PageIndexMultiDelete(page, todelete, xldata->ntodelete); } GistClearPageHasGarbage(page); GistMarkTuplesDeleted(page); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/*
 * Parse XLOG_HEAP_DELETE from wal into proper tuplebufs.
 *
 * Deletes can possibly contain the old primary key.
 */
static void
DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
    XLogReaderState *r = buf->record;
    xl_heap_delete *xlrec;
    ReorderBufferChange *change;
    RelFileNode target_node;

    xlrec = (xl_heap_delete *) XLogRecGetData(r);

    /* only interested in our database */
    XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL);
    if (target_node.dbNode != ctx->slot->data.database)
        return;

    /*
     * Super deletions are irrelevant for logical decoding; it's driven by
     * the confirmation records.
     */
    if (xlrec->flags & XLH_DELETE_IS_SUPER)
        return;

    /* output plugin doesn't look for this origin, no need to queue */
    if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
        return;

    change = ReorderBufferGetChange(ctx->reorder);
    change->action = REORDER_BUFFER_CHANGE_DELETE;
    change->origin_id = XLogRecGetOrigin(r);

    memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode));

    /* old primary key stored */
    if (xlrec->flags & XLH_DELETE_CONTAINS_OLD)
    {
        Size        datalen = XLogRecGetDataLen(r) - SizeOfHeapDelete;
        Size        tuplelen = datalen - SizeOfHeapHeader;

        Assert(XLogRecGetDataLen(r) > (SizeOfHeapDelete + SizeOfHeapHeader));

        change->data.tp.oldtuple =
            ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);

        DecodeXLogTuple((char *) xlrec + SizeOfHeapDelete,
                        datalen, change->data.tp.oldtuple);
    }

    change->data.tp.clear_toast_afterwards = true;

    ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change);
}
/*
 * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs.
 *
 * Inserts can contain the new tuple.
 */
static void
DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
    Size        datalen;
    char       *tupledata;
    Size        tuplelen;
    XLogReaderState *r = buf->record;
    xl_heap_insert *xlrec;
    ReorderBufferChange *change;
    RelFileNode target_node;

    xlrec = (xl_heap_insert *) XLogRecGetData(r);

    /*
     * Ignore insert records without new tuples (this does happen when
     * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL).
     */
    if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE))
        return;

    /* only interested in our database */
    XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL);
    if (target_node.dbNode != ctx->slot->data.database)
        return;

    /* output plugin doesn't look for this origin, no need to queue */
    if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
        return;

    change = ReorderBufferGetChange(ctx->reorder);
    if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE))
        change->action = REORDER_BUFFER_CHANGE_INSERT;
    else
        change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT;
    change->origin_id = XLogRecGetOrigin(r);

    memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode));

    tupledata = XLogRecGetBlockData(r, 0, &datalen);
    tuplelen = datalen - SizeOfHeapHeader;

    change->data.tp.newtuple =
        ReorderBufferGetTupleBuf(ctx->reorder, tuplelen);

    DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple);

    change->data.tp.clear_toast_afterwards = true;

    ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change);
}
static void gistRedoPageSplitRecord(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); Buffer firstbuffer = InvalidBuffer; Buffer buffer; Page page; int i; bool isrootsplit = false; /* * We must hold lock on the first-listed page throughout the action, * including while updating the left child page (if any). We can unlock * remaining pages in the list as soon as they've been written, because * there is no path for concurrent queries to reach those pages without * first visiting the first-listed page. */ /* loop around all pages */ for (i = 0; i < xldata->npage; i++) { int flags; char *data; Size datalen; int num; BlockNumber blkno; IndexTuple *tuples; XLogRecGetBlockTag(record, i + 1, NULL, NULL, &blkno); if (blkno == GIST_ROOT_BLKNO) { Assert(i == 0); isrootsplit = true; } buffer = XLogInitBufferForRedo(record, i + 1); page = (Page) BufferGetPage(buffer); data = XLogRecGetBlockData(record, i + 1, &datalen); tuples = decodePageSplitRecord(data, datalen, &num); /* ok, clear buffer */ if (xldata->origleaf && blkno != GIST_ROOT_BLKNO) flags = F_LEAF; else flags = 0; GISTInitBuffer(buffer, flags); /* and fill it */ gistfillbuffer(page, tuples, num, FirstOffsetNumber); if (blkno == GIST_ROOT_BLKNO) { GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; GistPageSetNSN(page, xldata->orignsn); GistClearFollowRight(page); } else { if (i < xldata->npage - 1) { BlockNumber nextblkno; XLogRecGetBlockTag(record, i + 2, NULL, NULL, &nextblkno); GistPageGetOpaque(page)->rightlink = nextblkno; } else GistPageGetOpaque(page)->rightlink = xldata->origrlink; GistPageSetNSN(page, xldata->orignsn); if (i < xldata->npage - 1 && !isrootsplit && xldata->markfollowright) GistMarkFollowRight(page); else GistClearFollowRight(page); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); if (i == 0) firstbuffer = buffer; else UnlockReleaseBuffer(buffer); } /* Fix follow-right data on left child page, if any */ if (XLogRecHasBlockRef(record, 0)) gistRedoClearFollowRight(record, 0); /* Finally, release lock on the first page */ UnlockReleaseBuffer(firstbuffer); }
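/*
 * The per-page payload handed to decodePageSplitRecord() above is a tuple
 * count followed by a packed run of index tuples.  A minimal sketch of a
 * decoder consistent with that usage (it returns pointers into the WAL
 * buffer rather than copying; the exact upstream code may differ):
 */
static IndexTuple *
decodePageSplitRecord(char *begin, int len, int *n)
{
    char       *ptr;
    int         i;
    IndexTuple *tuples;

    /* extract the number of tuples */
    memcpy(n, begin, sizeof(int));
    ptr = begin + sizeof(int);

    tuples = palloc(*n * sizeof(IndexTuple));

    for (i = 0; i < *n; i++)
    {
        Assert(ptr - begin < len);
        tuples[i] = (IndexTuple) ptr;
        ptr += IndexTupleSize((IndexTuple) ptr);
    }
    Assert(ptr - begin == len);

    return tuples;
}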
static void spgRedoMoveLeafs(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogMoveLeafs *xldata = (spgxlogMoveLeafs *) ptr; SpGistState state; OffsetNumber *toDelete; OffsetNumber *toInsert; int nInsert; Buffer buffer; Page page; XLogRedoAction action; BlockNumber blknoDst; XLogRecGetBlockTag(record, 1, NULL, NULL, &blknoDst); fillFakeState(&state, xldata->stateSrc); nInsert = xldata->replaceDead ? 1 : xldata->nMoves + 1; ptr += SizeOfSpgxlogMoveLeafs; toDelete = (OffsetNumber *) ptr; ptr += sizeof(OffsetNumber) * xldata->nMoves; toInsert = (OffsetNumber *) ptr; ptr += sizeof(OffsetNumber) * nInsert; /* now ptr points to the list of leaf tuples */ /* * In normal operation we would have all three pages (source, dest, and * parent) locked simultaneously; but in WAL replay it should be safe to * update them one at a time, as long as we do it in the right order. */ /* Insert tuples on the dest page (do first, so redirect is valid) */ if (xldata->newPage) { buffer = XLogInitBufferForRedo(record, 1); SpGistInitBuffer(buffer, SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); action = BLK_NEEDS_REDO; } else action = XLogReadBufferForRedo(record, 1, &buffer); if (action == BLK_NEEDS_REDO) { int i; page = BufferGetPage(buffer); for (i = 0; i < nInsert; i++) { char *leafTuple; SpGistLeafTupleData leafTupleHdr; /* * the tuples are not aligned, so must copy to access the size * field. */ leafTuple = ptr; memcpy(&leafTupleHdr, leafTuple, sizeof(SpGistLeafTupleData)); addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size, toInsert[i]); ptr += leafTupleHdr.size; } PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* Delete tuples from the source page, inserting a redirection pointer */ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); spgPageIndexMultiDelete(&state, page, toDelete, xldata->nMoves, state.isBuild ? SPGIST_PLACEHOLDER : SPGIST_REDIRECT, SPGIST_PLACEHOLDER, blknoDst, toInsert[nInsert - 1]); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* And update the parent downlink */ if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { SpGistInnerTuple tuple; page = BufferGetPage(buffer); tuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(tuple, xldata->nodeI, blknoDst, toInsert[nInsert - 1]); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
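/*
 * addOrReplaceTuple(), used here and in the other SP-GiST redo routines,
 * must cope with the target offset already holding a placeholder tuple.
 * A sketch consistent with how it is called in this file, assuming the
 * page's placeholder count is tracked in the SP-GiST opaque area (not
 * necessarily the verbatim upstream code):
 */
static void
addOrReplaceTuple(Page page, Item tuple, int size, OffsetNumber offset)
{
    if (offset <= PageGetMaxOffsetNumber(page))
    {
        SpGistDeadTuple dt = (SpGistDeadTuple) PageGetItem(page,
                                        PageGetItemId(page, offset));

        if (dt->tupstate != SPGIST_PLACEHOLDER)
            elog(ERROR, "SPGiST tuple to be replaced is not a placeholder");

        Assert(SpGistPageGetOpaque(page)->nPlaceholder > 0);
        SpGistPageGetOpaque(page)->nPlaceholder--;

        PageIndexTupleDelete(page, offset);
    }

    Assert(offset <= PageGetMaxOffsetNumber(page) + 1);

    if (PageAddItem(page, tuple, size, offset, false, false) != offset)
        elog(ERROR, "failed to add item of size %d to SPGiST index page",
             size);
}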
/* * Get the latestRemovedXid from the heap pages pointed at by the index * tuples being deleted. This puts the work for calculating latestRemovedXid * into the recovery path rather than the primary path. * * It's possible that this generates a fair amount of I/O, since an index * block may have hundreds of tuples being deleted. Repeat accesses to the * same heap blocks are common, though are not yet optimised. * * XXX optimise later with something like XLogPrefetchBuffer() */ static TransactionId btree_xlog_delete_get_latestRemovedXid(XLogReaderState *record) { xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record); OffsetNumber *unused; Buffer ibuffer, hbuffer; Page ipage, hpage; RelFileNode rnode; BlockNumber blkno; ItemId iitemid, hitemid; IndexTuple itup; HeapTupleHeader htuphdr; BlockNumber hblkno; OffsetNumber hoffnum; TransactionId latestRemovedXid = InvalidTransactionId; int i; /* * If there's nothing running on the standby we don't need to derive a * full latestRemovedXid value, so use a fast path out of here. This * returns InvalidTransactionId, and so will conflict with all HS * transactions; but since we just worked out that that's zero people, * it's OK. * * XXX There is a race condition here, which is that a new backend might * start just after we look. If so, it cannot need to conflict, but this * coding will result in throwing a conflict anyway. */ if (CountDBBackends(InvalidOid) == 0) return latestRemovedXid; /* * In what follows, we have to examine the previous state of the index * page, as well as the heap page(s) it points to. This is only valid if * WAL replay has reached a consistent database state; which means that * the preceding check is not just an optimization, but is *necessary*. We * won't have let in any user sessions before we reach consistency. */ if (!reachedConsistency) elog(PANIC, "btree_xlog_delete_get_latestRemovedXid: cannot operate with inconsistent data"); /* * Get index page. If the DB is consistent, this should not fail, nor * should any of the heap page fetches below. If one does, we return * InvalidTransactionId to cancel all HS transactions. That's probably * overkill, but it's safe, and certainly better than panicking here. */ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL); if (!BufferIsValid(ibuffer)) return InvalidTransactionId; LockBuffer(ibuffer, BT_READ); ipage = (Page) BufferGetPage(ibuffer); /* * Loop through the deleted index items to obtain the TransactionId from * the heap items they point to. */ unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete); for (i = 0; i < xlrec->nitems; i++) { /* * Identify the index tuple about to be deleted */ iitemid = PageGetItemId(ipage, unused[i]); itup = (IndexTuple) PageGetItem(ipage, iitemid); /* * Locate the heap page that the index tuple points at */ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL); if (!BufferIsValid(hbuffer)) { UnlockReleaseBuffer(ibuffer); return InvalidTransactionId; } LockBuffer(hbuffer, BUFFER_LOCK_SHARE); hpage = (Page) BufferGetPage(hbuffer); /* * Look up the heap tuple header that the index tuple points at by * using the heap node supplied with the xlrec. We can't use * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer. * Note that we are not looking at tuple data here, just headers. 
         */
        hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
        hitemid = PageGetItemId(hpage, hoffnum);

        /*
         * Follow any redirections until we find something useful.
         */
        while (ItemIdIsRedirected(hitemid))
        {
            hoffnum = ItemIdGetRedirect(hitemid);
            hitemid = PageGetItemId(hpage, hoffnum);
            CHECK_FOR_INTERRUPTS();
        }

        /*
         * If the heap item has storage, then read the header and use that to
         * set latestRemovedXid.
         *
         * Some LP_DEAD items may not be accessible, so we ignore them.
         */
        if (ItemIdHasStorage(hitemid))
        {
            htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);

            HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid);
        }
        else if (ItemIdIsDead(hitemid))
        {
            /*
             * Conjecture: if hitemid is dead then it had xids before the
             * xids marked on LP_NORMAL items.  So we just ignore this item
             * and move on to the next, for the purposes of calculating
             * latestRemovedXid.
             */
        }
        else
            Assert(!ItemIdIsUsed(hitemid));

        UnlockReleaseBuffer(hbuffer);
    }

    UnlockReleaseBuffer(ibuffer);

    /*
     * If all heap tuples were LP_DEAD then we will be returning
     * InvalidTransactionId here, which avoids conflicts.  This matches
     * existing logic which assumes that LP_DEAD tuples must already be older
     * than the latestRemovedXid on the cleanup record that set them as
     * LP_DEAD, hence must already have generated a conflict.
     */
    return latestRemovedXid;
}
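/*
 * HeapTupleHeaderAdvanceLatestRemovedXid() only has to push the running
 * maximum forward for each tuple examined.  Conceptually it behaves like
 * the simplified sketch below (the name AdvanceLatestRemovedXid and the
 * body are illustrative; the real routine also has to consider the xvac
 * field and the tuple's infomask hint bits):
 */
static void
AdvanceLatestRemovedXid(HeapTupleHeader tuple, TransactionId *latestRemovedXid)
{
    TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);

    /* the deleting/updating xact is what a standby query could still see */
    if (TransactionIdIsNormal(xmax) &&
        TransactionIdFollows(xmax, *latestRemovedXid))
        *latestRemovedXid = xmax;
}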
static void btree_xlog_vacuum(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque opaque; /* * If queries might be active then we need to ensure every leaf page is * unpinned between the lastBlockVacuumed and the current block, if there * are any. This prevents replay of the VACUUM from reaching the stage of * removing heap tuples while there could still be indexscans "in flight" * to those particular tuples (see nbtree/README). * * It might be worth checking if there are actually any backends running; * if not, we could just skip this. * * Since VACUUM can visit leaf pages out-of-order, it might issue records * with lastBlockVacuumed >= block; that's not an error, it just means * nothing to do now. * * Note: since we touch all pages in the range, we will lock non-leaf * pages, and also any empty (all-zero) pages that may be in the index. It * doesn't seem worth the complexity to avoid that. But it's important * that HotStandbyActiveInReplay() will not return true if the database * isn't yet consistent; so we need not fear reading still-corrupt blocks * here during crash recovery. */ if (HotStandbyActiveInReplay()) { RelFileNode thisrnode; BlockNumber thisblkno; BlockNumber blkno; XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno); for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) { /* * We use RBM_NORMAL_NO_LOG mode because it's not an error * condition to see all-zero pages. The original btvacuumpage * scan would have skipped over all-zero pages, noting them in FSM * but not bothering to initialize them just yet; so we mustn't * throw an error here. (We could skip acquiring the cleanup lock * if PageIsNew, but it's probably not worth the cycles to test.) * * XXX we don't actually need to read the block, we just need to * confirm it is unpinned. If we had a special call into the * buffer manager we could optimise this so that if the block is * not in shared_buffers we confirm it as unpinned. */ buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, RBM_NORMAL_NO_LOG); if (BufferIsValid(buffer)) { LockBufferForCleanup(buffer); UnlockReleaseBuffer(buffer); } } } /* * Like in btvacuumpage(), we need to take a cleanup lock on every leaf * page. See nbtree/README for details. */ if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 0, &len); page = (Page) BufferGetPage(buffer); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items --- see comments * in _bt_delitems_vacuum(). */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
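/*
 * In btree_xlog_vacuum the record's main data holds only lastBlockVacuumed;
 * the offsets to delete travel as block 0's data, which is why the redo
 * code above reads them with XLogRecGetBlockData() and treats them as a
 * bare OffsetNumber array.  Sketch of the record struct as used here (not
 * necessarily verbatim):
 */
typedef struct xl_btree_vacuum
{
    BlockNumber lastBlockVacuumed;  /* highest block already vacuumed */

    /* TARGET OFFSET NUMBERS FOLLOW, as block data */
} xl_btree_vacuum;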
static void btree_xlog_split(bool onleft, bool isroot, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); bool isleaf = (xlrec->level == 0); Buffer lbuf; Buffer rbuf; Page rpage; BTPageOpaque ropaque; char *datapos; Size datalen; Item left_hikey = NULL; Size left_hikeysz = 0; BlockNumber leftsib; BlockNumber rightsib; BlockNumber rnext; XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib); XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib); if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext)) rnext = P_NONE; /* * Clear the incomplete split flag on the left sibling of the child page * this is a downlink for. (Like in btree_xlog_insert, this can be done * before locking the other pages) */ if (!isleaf) _bt_clear_incomplete_split(record, 3); /* Reconstruct right (new) sibling page from scratch */ rbuf = XLogInitBufferForRedo(record, 1); datapos = XLogRecGetBlockData(record, 1, &datalen); rpage = (Page) BufferGetPage(rbuf); _bt_pageinit(rpage, BufferGetPageSize(rbuf)); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); ropaque->btpo_prev = leftsib; ropaque->btpo_next = rnext; ropaque->btpo.level = xlrec->level; ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; _bt_restore_page(rpage, datapos, datalen); /* * On leaf level, the high key of the left page is equal to the first key * on the right page. */ if (isleaf) { ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); left_hikey = PageGetItem(rpage, hiItemId); left_hikeysz = ItemIdGetLength(hiItemId); } PageSetLSN(rpage, lsn); MarkBufferDirty(rbuf); /* don't release the buffer yet; we touch right page's first item below */ /* Now reconstruct left (original) sibling page */ if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO) { /* * To retain the same physical order of the tuples that they had, we * initialize a temporary empty page for the left page and add all the * items to that in item number order. This mirrors how _bt_split() * works. It's not strictly required to retain the same physical * order, as long as the items are in the correct item number order, * but it helps debugging. See also _bt_restore_page(), which does * the same for the right page. 
*/ Page lpage = (Page) BufferGetPage(lbuf); BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; Item newitem = NULL; Size newitemsz = 0; Page newlpage; OffsetNumber leftoff; datapos = XLogRecGetBlockData(record, 0, &datalen); if (onleft) { newitem = (Item) datapos; newitemsz = MAXALIGN(IndexTupleSize(newitem)); datapos += newitemsz; datalen -= newitemsz; } /* Extract left hikey and its size (assuming 16-bit alignment) */ if (!isleaf) { left_hikey = (Item) datapos; left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); datapos += left_hikeysz; datalen -= left_hikeysz; } Assert(datalen == 0); newlpage = PageGetTempPageCopySpecial(lpage); /* Set high key */ leftoff = P_HIKEY; if (PageAddItem(newlpage, left_hikey, left_hikeysz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add high key to left page after split"); leftoff = OffsetNumberNext(leftoff); for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++) { ItemId itemid; Size itemsz; Item item; /* add the new item if it was inserted on left page */ if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } itemid = PageGetItemId(lpage, off); itemsz = ItemIdGetLength(itemid); item = PageGetItem(lpage, itemid); if (PageAddItem(newlpage, item, itemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add old item to left page after split"); leftoff = OffsetNumberNext(leftoff); } /* cope with possibility that newitem goes at the end */ if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } PageRestoreTempPage(newlpage, lpage); /* Fix opaque fields */ lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; if (isleaf) lopaque->btpo_flags |= BTP_LEAF; lopaque->btpo_next = rightsib; lopaque->btpo_cycleid = 0; PageSetLSN(lpage, lsn); MarkBufferDirty(lbuf); } /* We no longer need the buffers */ if (BufferIsValid(lbuf)) UnlockReleaseBuffer(lbuf); UnlockReleaseBuffer(rbuf); /* * Fix left-link of the page to the right of the new right sibling. * * Note: in normal operation, we do this while still holding lock on the * two split pages. However, that's not necessary for correctness in WAL * replay, because no other index update can be in progress, and readers * will cope properly when following an obsolete left-link. */ if (rnext != P_NONE) { Buffer buffer; if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { Page page = (Page) BufferGetPage(buffer); BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = rightsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } }
/* * XLogReadBufferForRedoExtended * Like XLogReadBufferForRedo, but with extra options. * * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended * with all-zeroes pages up to the referenced block number. In * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value * is always BLK_NEEDS_REDO. * * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock * parameter. Do not use an inconsistent combination!) * * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer * using LockBufferForCleanup(), instead of a regular exclusive lock. */ XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, uint8 block_id, ReadBufferMode mode, bool get_cleanup_lock, Buffer *buf) { XLogRecPtr lsn = record->EndRecPtr; RelFileNode rnode; ForkNumber forknum; BlockNumber blkno; Page page; bool zeromode; bool willinit; if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) { /* Caller specified a bogus block_id */ elog(PANIC, "failed to locate backup block with ID %d", block_id); } /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. */ zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0; if (willinit && !zeromode) elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); if (!willinit && zeromode) elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record"); /* If it's a full-page image, restore it. */ if (XLogRecHasBlockImage(record, block_id)) { *buf = XLogReadBufferExtended(rnode, forknum, blkno, get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); page = BufferGetPage(*buf); if (!RestoreBlockImage(record, block_id, page)) elog(ERROR, "failed to restore block image"); /* * The page may be uninitialized. If so, we can't set the LSN because * that would corrupt the page. */ if (!PageIsNew(page)) { PageSetLSN(page, lsn); } MarkBufferDirty(*buf); /* * At the end of crash recovery the init forks of unlogged relations * are copied, without going through shared buffers. So we need to * force the on-disk state of init forks to always be in sync with the * state in shared buffers. */ if (forknum == INIT_FORKNUM) FlushOneBuffer(*buf); return BLK_RESTORED; } else { *buf = XLogReadBufferExtended(rnode, forknum, blkno, mode); if (BufferIsValid(*buf)) { if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) { if (get_cleanup_lock) LockBufferForCleanup(*buf); else LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); } if (lsn <= PageGetLSN(BufferGetPage(*buf))) return BLK_DONE; else return BLK_NEEDS_REDO; } else return BLK_NOTFOUND; } }
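/*
 * The return-code protocol above yields the boilerplate that every redo
 * routine in this file follows.  A hypothetical minimal redo routine
 * (my_xlog_redo_example is an illustrative name, not a real rmgr entry)
 * shows the intended calling pattern:
 */
static void
my_xlog_redo_example(XLogReaderState *record)
{
    XLogRecPtr  lsn = record->EndRecPtr;
    Buffer      buffer;

    if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    {
        Page        page = (Page) BufferGetPage(buffer);

        /* ... apply the logged change to the page here ... */

        PageSetLSN(page, lsn);  /* bump the LSN so replay is idempotent */
        MarkBufferDirty(buffer);
    }

    /* BLK_RESTORED, BLK_DONE and BLK_NOTFOUND all need no page changes */
    if (BufferIsValid(buffer))
        UnlockReleaseBuffer(buffer);
}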
static void spgRedoAddNode(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogAddNode *xldata = (spgxlogAddNode *) ptr; char *innerTuple; SpGistInnerTupleData innerTupleHdr; SpGistState state; Buffer buffer; Page page; XLogRedoAction action; ptr += sizeof(spgxlogAddNode); innerTuple = ptr; /* the tuple is unaligned, so make a copy to access its header */ memcpy(&innerTupleHdr, innerTuple, sizeof(SpGistInnerTupleData)); fillFakeState(&state, xldata->stateSrc); if (!XLogRecHasBlockRef(record, 1)) { /* update in place */ Assert(xldata->parentBlk == -1); if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); PageIndexTupleDelete(page, xldata->offnum); if (PageAddItem(page, (Item) innerTuple, innerTupleHdr.size, xldata->offnum, false, false) != xldata->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", innerTupleHdr.size); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } else { BlockNumber blkno; BlockNumber blknoNew; XLogRecGetBlockTag(record, 0, NULL, NULL, &blkno); XLogRecGetBlockTag(record, 1, NULL, NULL, &blknoNew); /* * In normal operation we would have all three pages (source, dest, * and parent) locked simultaneously; but in WAL replay it should be * safe to update them one at a time, as long as we do it in the right * order. We must insert the new tuple before replacing the old tuple * with the redirect tuple. */ /* Install new tuple first so redirect is valid */ if (xldata->newPage) { /* AddNode is not used for nulls pages */ buffer = XLogInitBufferForRedo(record, 1); SpGistInitBuffer(buffer, 0); action = BLK_NEEDS_REDO; } else action = XLogReadBufferForRedo(record, 1, &buffer); if (action == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); addOrReplaceTuple(page, (Item) innerTuple, innerTupleHdr.size, xldata->offnumNew); /* * If parent is in this same page, update it now. */ if (xldata->parentBlk == 1) { SpGistInnerTuple parentTuple; parentTuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(parentTuple, xldata->nodeI, blknoNew, xldata->offnumNew); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* Delete old tuple, replacing it with redirect or placeholder tuple */ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { SpGistDeadTuple dt; page = BufferGetPage(buffer); if (state.isBuild) dt = spgFormDeadTuple(&state, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); else dt = spgFormDeadTuple(&state, SPGIST_REDIRECT, blknoNew, xldata->offnumNew); PageIndexTupleDelete(page, xldata->offnum); if (PageAddItem(page, (Item) dt, dt->size, xldata->offnum, false, false) != xldata->offnum) elog(ERROR, "failed to add item of size %u to SPGiST index page", dt->size); if (state.isBuild) SpGistPageGetOpaque(page)->nPlaceholder++; else SpGistPageGetOpaque(page)->nRedirection++; /* * If parent is in this same page, update it now. */ if (xldata->parentBlk == 0) { SpGistInnerTuple parentTuple; parentTuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(parentTuple, xldata->nodeI, blknoNew, xldata->offnumNew); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* * Update parent downlink (if we didn't do it as part of the source or * destination page update already). 
*/ if (xldata->parentBlk == 2) { if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { SpGistInnerTuple parentTuple; page = BufferGetPage(buffer); parentTuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(parentTuple, xldata->nodeI, blknoNew, xldata->offnumNew); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } } }
/*
 * Decode XLOG_HEAP2_MULTI_INSERT record into multiple tuplebufs.
 *
 * Currently MULTI_INSERT will always contain the full tuples.
 */
static void
DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
{
    XLogReaderState *r = buf->record;
    xl_heap_multi_insert *xlrec;
    int         i;
    char       *data;
    char       *tupledata;
    Size        tuplelen;
    RelFileNode rnode;

    xlrec = (xl_heap_multi_insert *) XLogRecGetData(r);

    /* only interested in our database */
    XLogRecGetBlockTag(r, 0, &rnode, NULL, NULL);
    if (rnode.dbNode != ctx->slot->data.database)
        return;

    /* output plugin doesn't look for this origin, no need to queue */
    if (FilterByOrigin(ctx, XLogRecGetOrigin(r)))
        return;

    tupledata = XLogRecGetBlockData(r, 0, &tuplelen);

    data = tupledata;
    for (i = 0; i < xlrec->ntuples; i++)
    {
        ReorderBufferChange *change;
        xl_multi_insert_tuple *xlhdr;
        int         datalen;
        ReorderBufferTupleBuf *tuple;

        change = ReorderBufferGetChange(ctx->reorder);
        change->action = REORDER_BUFFER_CHANGE_INSERT;
        change->origin_id = XLogRecGetOrigin(r);

        memcpy(&change->data.tp.relnode, &rnode, sizeof(RelFileNode));

        /*
         * CONTAINS_NEW_TUPLE will always be set currently as multi_insert
         * isn't used for catalogs, but better be future proof.
         *
         * We decode the tuple in pretty much the same way as DecodeXLogTuple,
         * but since the layout is slightly different, we can't use it here.
         */
        if (xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)
        {
            HeapTupleHeader header;

            xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data);
            data = ((char *) xlhdr) + SizeOfMultiInsertTuple;
            datalen = xlhdr->datalen;

            change->data.tp.newtuple =
                ReorderBufferGetTupleBuf(ctx->reorder, datalen);

            tuple = change->data.tp.newtuple;
            header = tuple->tuple.t_data;

            /* not a disk based tuple */
            ItemPointerSetInvalid(&tuple->tuple.t_self);

            /*
             * We can only figure this out after reassembling the
             * transactions.
             */
            tuple->tuple.t_tableOid = InvalidOid;

            tuple->tuple.t_len = datalen + SizeofHeapTupleHeader;

            memset(header, 0, SizeofHeapTupleHeader);

            memcpy((char *) tuple->tuple.t_data + SizeofHeapTupleHeader,
                   (char *) data,
                   datalen);
            data += datalen;

            header->t_infomask = xlhdr->t_infomask;
            header->t_infomask2 = xlhdr->t_infomask2;
            header->t_hoff = xlhdr->t_hoff;
        }

        /*
         * Reset toast reassembly state only after the last row in the last
         * xl_multi_insert_tuple record emitted by one heap_multi_insert()
         * call.
         */
        if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI &&
            (i + 1) == xlrec->ntuples)
            change->data.tp.clear_toast_afterwards = true;
        else
            change->data.tp.clear_toast_afterwards = false;

        ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r),
                                 buf->origptr, change);
    }
    Assert(data == tupledata + tuplelen);
}
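/*
 * Each tuple in the multi-insert payload is preceded by a short per-tuple
 * header; because those headers are only 2-byte aligned within the record,
 * the loop above re-aligns with SHORTALIGN before reading one.  Sketch of
 * that header as used here (not necessarily verbatim):
 */
typedef struct xl_multi_insert_tuple
{
    uint16      datalen;        /* size of tuple data that follows */
    uint16      t_infomask2;
    uint16      t_infomask;
    uint8       t_hoff;
    /* TUPLE DATA FOLLOWS AT END OF STRUCT */
} xl_multi_insert_tuple;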
/* * replay addition of overflow page for hash index */ static void hash_xlog_add_ovfl_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record); Buffer leftbuf; Buffer ovflbuf; Buffer metabuf; BlockNumber leftblk; BlockNumber rightblk; BlockNumber newmapblk = InvalidBlockNumber; Page ovflpage; HashPageOpaque ovflopaque; uint32 *num_bucket; char *data; Size datalen PG_USED_FOR_ASSERTS_ONLY; bool new_bmpage = false; XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); ovflbuf = XLogInitBufferForRedo(record, 0); Assert(BufferIsValid(ovflbuf)); data = XLogRecGetBlockData(record, 0, &datalen); num_bucket = (uint32 *) data; Assert(datalen == sizeof(uint32)); _hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE, true); /* update backlink */ ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = leftblk; PageSetLSN(ovflpage, lsn); MarkBufferDirty(ovflbuf); if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) { Page leftpage; HashPageOpaque leftopaque; leftpage = BufferGetPage(leftbuf); leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); leftopaque->hasho_nextblkno = rightblk; PageSetLSN(leftpage, lsn); MarkBufferDirty(leftbuf); } if (BufferIsValid(leftbuf)) UnlockReleaseBuffer(leftbuf); UnlockReleaseBuffer(ovflbuf); /* * Note: in normal operation, we'd update the bitmap and meta page while * still holding lock on the overflow pages. But during replay it's not * necessary to hold those locks, since no other index updates can be * happening concurrently. */ if (XLogRecHasBlockRef(record, 2)) { Buffer mapbuffer; if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) { Page mappage = (Page) BufferGetPage(mapbuffer); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; freep = HashPageGetBitmap(mappage); data = XLogRecGetBlockData(record, 2, &datalen); bitmap_page_bit = (uint32 *) data; SETBIT(freep, *bitmap_page_bit); PageSetLSN(mappage, lsn); MarkBufferDirty(mapbuffer); } if (BufferIsValid(mapbuffer)) UnlockReleaseBuffer(mapbuffer); } if (XLogRecHasBlockRef(record, 3)) { Buffer newmapbuf; newmapbuf = XLogInitBufferForRedo(record, 3); _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true); new_bmpage = true; newmapblk = BufferGetBlockNumber(newmapbuf); MarkBufferDirty(newmapbuf); PageSetLSN(BufferGetPage(newmapbuf), lsn); UnlockReleaseBuffer(newmapbuf); } if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) { HashMetaPage metap; Page page; uint32 *firstfree_ovflpage; data = XLogRecGetBlockData(record, 4, &datalen); firstfree_ovflpage = (uint32 *) data; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_firstfree = *firstfree_ovflpage; if (!xlrec->bmpage_found) { metap->hashm_spares[metap->hashm_ovflpoint]++; if (new_bmpage) { Assert(BlockNumberIsValid(newmapblk)); metap->hashm_mapp[metap->hashm_nmaps] = newmapblk; metap->hashm_nmaps++; metap->hashm_spares[metap->hashm_ovflpoint]++; } } PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
/* * replay delete operation in hash index to remove * tuples marked as DEAD during index tuple insertion. */ static void hash_xlog_vacuum_one_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_vacuum_one_page *xldata; Buffer buffer; Buffer metabuf; Page page; XLogRedoAction action; HashPageOpaque pageopaque; xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record); /* * If we have any conflict processing to do, it must happen before we * update the page. * * Hash index records that are marked as LP_DEAD and being removed during * hash index tuple insertion can conflict with standby queries. You might * think that vacuum records would conflict as well, but we've handled * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid * cleaned by the vacuum of the heap and so we can resolve any conflicts * just once when that arrives. After that we know that no conflicts * exist from individual hash index vacuum records on that index. */ if (InHotStandby) { TransactionId latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record); RelFileNode rnode; XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); } action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer); if (action == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 0, &len); page = (Page) BufferGetPage(buffer); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items. See comments * in _hash_vacuum_one_page() for details. */ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) { Page metapage; HashMetaPage metap; metapage = BufferGetPage(metabuf); metap = HashPageGetMeta(metapage); metap->hashm_ntuples -= xldata->ntuples; PageSetLSN(metapage, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
static void spgRedoAddLeaf(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogAddLeaf *xldata = (spgxlogAddLeaf *) ptr; char *leafTuple; SpGistLeafTupleData leafTupleHdr; Buffer buffer; Page page; XLogRedoAction action; ptr += sizeof(spgxlogAddLeaf); leafTuple = ptr; /* the leaf tuple is unaligned, so make a copy to access its header */ memcpy(&leafTupleHdr, leafTuple, sizeof(SpGistLeafTupleData)); /* * In normal operation we would have both current and parent pages locked * simultaneously; but in WAL replay it should be safe to update the leaf * page before updating the parent. */ if (xldata->newPage) { buffer = XLogInitBufferForRedo(record, 0); SpGistInitBuffer(buffer, SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); action = BLK_NEEDS_REDO; } else action = XLogReadBufferForRedo(record, 0, &buffer); if (action == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); /* insert new tuple */ if (xldata->offnumLeaf != xldata->offnumHeadLeaf) { /* normal cases, tuple was added by SpGistPageAddNewItem */ addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size, xldata->offnumLeaf); /* update head tuple's chain link if needed */ if (xldata->offnumHeadLeaf != InvalidOffsetNumber) { SpGistLeafTuple head; head = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumHeadLeaf)); Assert(head->nextOffset == leafTupleHdr.nextOffset); head->nextOffset = xldata->offnumLeaf; } } else { /* replacing a DEAD tuple */ PageIndexTupleDelete(page, xldata->offnumLeaf); if (PageAddItem(page, (Item) leafTuple, leafTupleHdr.size, xldata->offnumLeaf, false, false) != xldata->offnumLeaf) elog(ERROR, "failed to add item of size %u to SPGiST index page", leafTupleHdr.size); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* update parent downlink if necessary */ if (xldata->offnumParent != InvalidOffsetNumber) { if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { SpGistInnerTuple tuple; BlockNumber blknoLeaf; XLogRecGetBlockTag(record, 0, NULL, NULL, &blknoLeaf); page = BufferGetPage(buffer); tuple = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(tuple, xldata->nodeI, blknoLeaf, xldata->offnumLeaf); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } }
static void spgRedoPickSplit(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogPickSplit *xldata = (spgxlogPickSplit *) ptr; char *innerTuple; SpGistInnerTupleData innerTupleHdr; SpGistState state; OffsetNumber *toDelete; OffsetNumber *toInsert; uint8 *leafPageSelect; Buffer srcBuffer; Buffer destBuffer; Buffer innerBuffer; Page srcPage; Page destPage; Page page; int i; BlockNumber blknoInner; XLogRedoAction action; XLogRecGetBlockTag(record, 2, NULL, NULL, &blknoInner); fillFakeState(&state, xldata->stateSrc); ptr += SizeOfSpgxlogPickSplit; toDelete = (OffsetNumber *) ptr; ptr += sizeof(OffsetNumber) * xldata->nDelete; toInsert = (OffsetNumber *) ptr; ptr += sizeof(OffsetNumber) * xldata->nInsert; leafPageSelect = (uint8 *) ptr; ptr += sizeof(uint8) * xldata->nInsert; innerTuple = ptr; /* the inner tuple is unaligned, so make a copy to access its header */ memcpy(&innerTupleHdr, innerTuple, sizeof(SpGistInnerTupleData)); ptr += innerTupleHdr.size; /* now ptr points to the list of leaf tuples */ if (xldata->isRootSplit) { /* when splitting root, we touch it only in the guise of new inner */ srcBuffer = InvalidBuffer; srcPage = NULL; } else if (xldata->initSrc) { /* just re-init the source page */ srcBuffer = XLogInitBufferForRedo(record, 0); srcPage = (Page) BufferGetPage(srcBuffer); SpGistInitBuffer(srcBuffer, SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); /* don't update LSN etc till we're done with it */ } else { /* * Delete the specified tuples from source page. (In case we're in * Hot Standby, we need to hold lock on the page till we're done * inserting leaf tuples and the new inner tuple, else the added * redirect tuple will be a dangling link.) */ srcPage = NULL; if (XLogReadBufferForRedo(record, 0, &srcBuffer) == BLK_NEEDS_REDO) { srcPage = BufferGetPage(srcBuffer); /* * We have it a bit easier here than in doPickSplit(), because we * know the inner tuple's location already, so we can inject the * correct redirection tuple now. */ if (!state.isBuild) spgPageIndexMultiDelete(&state, srcPage, toDelete, xldata->nDelete, SPGIST_REDIRECT, SPGIST_PLACEHOLDER, blknoInner, xldata->offnumInner); else spgPageIndexMultiDelete(&state, srcPage, toDelete, xldata->nDelete, SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER, InvalidBlockNumber, InvalidOffsetNumber); /* don't update LSN etc till we're done with it */ } } /* try to access dest page if any */ if (!XLogRecHasBlockRef(record, 1)) { destBuffer = InvalidBuffer; destPage = NULL; } else if (xldata->initDest) { /* just re-init the dest page */ destBuffer = XLogInitBufferForRedo(record, 1); destPage = (Page) BufferGetPage(destBuffer); SpGistInitBuffer(destBuffer, SPGIST_LEAF | (xldata->storesNulls ? SPGIST_NULLS : 0)); /* don't update LSN etc till we're done with it */ } else { /* * We could probably release the page lock immediately in the * full-page-image case, but for safety let's hold it till later. */ if (XLogReadBufferForRedo(record, 1, &destBuffer) == BLK_NEEDS_REDO) destPage = (Page) BufferGetPage(destBuffer); else destPage = NULL; /* don't do any page updates */ } /* restore leaf tuples to src and/or dest page */ for (i = 0; i < xldata->nInsert; i++) { char *leafTuple; SpGistLeafTupleData leafTupleHdr; /* the tuples are not aligned, so must copy to access the size field. */ leafTuple = ptr; memcpy(&leafTupleHdr, leafTuple, sizeof(SpGistLeafTupleData)); ptr += leafTupleHdr.size; page = leafPageSelect[i] ? 
destPage : srcPage; if (page == NULL) continue; /* no need to touch this page */ addOrReplaceTuple(page, (Item) leafTuple, leafTupleHdr.size, toInsert[i]); } /* Now update src and dest page LSNs if needed */ if (srcPage != NULL) { PageSetLSN(srcPage, lsn); MarkBufferDirty(srcBuffer); } if (destPage != NULL) { PageSetLSN(destPage, lsn); MarkBufferDirty(destBuffer); } /* restore new inner tuple */ if (xldata->initInner) { innerBuffer = XLogInitBufferForRedo(record, 2); SpGistInitBuffer(innerBuffer, (xldata->storesNulls ? SPGIST_NULLS : 0)); action = BLK_NEEDS_REDO; } else action = XLogReadBufferForRedo(record, 2, &innerBuffer); if (action == BLK_NEEDS_REDO) { page = BufferGetPage(innerBuffer); addOrReplaceTuple(page, (Item) innerTuple, innerTupleHdr.size, xldata->offnumInner); /* if inner is also parent, update link while we're here */ if (xldata->innerIsParent) { SpGistInnerTuple parent; parent = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(parent, xldata->nodeI, blknoInner, xldata->offnumInner); } PageSetLSN(page, lsn); MarkBufferDirty(innerBuffer); } if (BufferIsValid(innerBuffer)) UnlockReleaseBuffer(innerBuffer); /* * Now we can release the leaf-page locks. It's okay to do this before * updating the parent downlink. */ if (BufferIsValid(srcBuffer)) UnlockReleaseBuffer(srcBuffer); if (BufferIsValid(destBuffer)) UnlockReleaseBuffer(destBuffer); /* update parent downlink, unless we did it above */ if (XLogRecHasBlockRef(record, 3)) { Buffer parentBuffer; if (XLogReadBufferForRedo(record, 3, &parentBuffer) == BLK_NEEDS_REDO) { SpGistInnerTuple parent; page = BufferGetPage(parentBuffer); parent = (SpGistInnerTuple) PageGetItem(page, PageGetItemId(page, xldata->offnumParent)); spgUpdateNodeLink(parent, xldata->nodeI, blknoInner, xldata->offnumInner); PageSetLSN(page, lsn); MarkBufferDirty(parentBuffer); } if (BufferIsValid(parentBuffer)) UnlockReleaseBuffer(parentBuffer); } else Assert(xldata->innerIsParent || xldata->isRootSplit); }
static void spgRedoVacuumRedirect(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; char *ptr = XLogRecGetData(record); spgxlogVacuumRedirect *xldata = (spgxlogVacuumRedirect *) ptr; OffsetNumber *itemToPlaceholder; Buffer buffer; itemToPlaceholder = xldata->offsets; /* * If any redirection tuples are being removed, make sure there are no * live Hot Standby transactions that might need to see them. */ if (InHotStandby) { if (TransactionIdIsValid(xldata->newestRedirectXid)) { RelFileNode node; XLogRecGetBlockTag(record, 0, &node, NULL, NULL); ResolveRecoveryConflictWithSnapshot(xldata->newestRedirectXid, node); } } if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); SpGistPageOpaque opaque = SpGistPageGetOpaque(page); int i; /* Convert redirect pointers to plain placeholders */ for (i = 0; i < xldata->nToPlaceholder; i++) { SpGistDeadTuple dt; dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, itemToPlaceholder[i])); Assert(dt->tupstate == SPGIST_REDIRECT); dt->tupstate = SPGIST_PLACEHOLDER; ItemPointerSetInvalid(&dt->pointer); } Assert(opaque->nRedirection >= xldata->nToPlaceholder); opaque->nRedirection -= xldata->nToPlaceholder; opaque->nPlaceholder += xldata->nToPlaceholder; /* Remove placeholder tuples at end of page */ if (xldata->firstPlaceholder != InvalidOffsetNumber) { int max = PageGetMaxOffsetNumber(page); OffsetNumber *toDelete; toDelete = palloc(sizeof(OffsetNumber) * max); for (i = xldata->firstPlaceholder; i <= max; i++) toDelete[i - xldata->firstPlaceholder] = i; i = max - xldata->firstPlaceholder + 1; Assert(opaque->nPlaceholder >= i); opaque->nPlaceholder -= i; /* The array is sorted, so can use PageIndexMultiDelete */ PageIndexMultiDelete(page, toDelete, i); pfree(toDelete); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
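/*
 * Everything spgRedoVacuumRedirect needs rides in the main record data:
 * the offsets to convert, where the trailing placeholder run starts, and
 * the newest redirect XID used for the conflict check above.  Sketch of
 * the record struct as implied by this function (field order and types
 * inferred from the usage here, not necessarily verbatim):
 */
typedef struct spgxlogVacuumRedirect
{
    uint16      nToPlaceholder;     /* number of redirects to make
                                     * placeholders */
    OffsetNumber firstPlaceholder;  /* first placeholder tuple to remove */
    TransactionId newestRedirectXid;    /* newest XID of removed redirects */

    /* offsets of redirect tuples to make placeholders follow */
    OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
} spgxlogVacuumRedirect;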