/* * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. * * Inserts can contain the new tuple. */ static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { XLogReaderState *r = buf->record; xl_heap_insert *xlrec; ReorderBufferChange *change; RelFileNode target_node; xlrec = (xl_heap_insert *) XLogRecGetData(r); /* only interested in our database */ XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL); if (target_node.dbNode != ctx->slot->data.database) return; change = ReorderBufferGetChange(ctx->reorder); change->action = REORDER_BUFFER_CHANGE_INSERT; memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); if (xlrec->flags & XLOG_HEAP_CONTAINS_NEW_TUPLE) { Size tuplelen; char *tupledata = XLogRecGetBlockData(r, 0, &tuplelen); change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); DecodeXLogTuple(tupledata, tuplelen, change->data.tp.newtuple); } change->data.tp.clear_toast_afterwards = true; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); }
/* * Parse XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE, which have the same layout * in the record, from wal into proper tuplebufs. * * Updates can possibly contain a new tuple and the old primary key. */ static void DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { XLogReaderState *r = buf->record; xl_heap_update *xlrec; ReorderBufferChange *change; char *data; RelFileNode target_node; xlrec = (xl_heap_update *) XLogRecGetData(r); /* only interested in our database */ XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL); if (target_node.dbNode != ctx->slot->data.database) return; /* output plugin doesn't look for this origin, no need to queue */ if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) return; change = ReorderBufferGetChange(ctx->reorder); change->action = REORDER_BUFFER_CHANGE_UPDATE; change->origin_id = XLogRecGetOrigin(r); memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); if (xlrec->flags & XLH_UPDATE_CONTAINS_NEW_TUPLE) { Size datalen; Size tuplelen; data = XLogRecGetBlockData(r, 0, &datalen); tuplelen = datalen - SizeOfHeapHeader; change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); DecodeXLogTuple(data, datalen, change->data.tp.newtuple); } if (xlrec->flags & XLH_UPDATE_CONTAINS_OLD) { Size datalen; Size tuplelen; /* caution, remaining data in record is not aligned */ data = XLogRecGetData(r) + SizeOfHeapUpdate; datalen = XLogRecGetDataLen(r) - SizeOfHeapUpdate; tuplelen = datalen - SizeOfHeapHeader; change->data.tp.oldtuple = ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); DecodeXLogTuple(data, datalen, change->data.tp.oldtuple); } change->data.tp.clear_toast_afterwards = true; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); }
/* * Update a tuple on a single page. */ static void brin_xlog_samepage_update(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_brin_samepage_update *xlrec; Buffer buffer; XLogRedoAction action; xlrec = (xl_brin_samepage_update *) XLogRecGetData(record); action = XLogReadBufferForRedo(record, 0, &buffer); if (action == BLK_NEEDS_REDO) { Size tuplen; BrinTuple *brintuple; Page page; OffsetNumber offnum; brintuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen); page = (Page) BufferGetPage(buffer); offnum = xlrec->offnum; if (!PageIndexTupleOverwrite(page, offnum, (Item) brintuple, tuplen)) elog(PANIC, "brin_xlog_samepage_update: failed to replace tuple"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* XXX no FSM updates here ... */ }
/* * Redo function for generic xlog record. */ void generic_redo(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; Buffer buffers[MAX_GENERIC_XLOG_PAGES]; uint8 block_id; /* Protect limited size of buffers[] array */ Assert(record->max_block_id < MAX_GENERIC_XLOG_PAGES); /* Iterate over blocks */ for (block_id = 0; block_id <= record->max_block_id; block_id++) { XLogRedoAction action; if (!XLogRecHasBlockRef(record, block_id)) { buffers[block_id] = InvalidBuffer; continue; } action = XLogReadBufferForRedo(record, block_id, &buffers[block_id]); /* Apply redo to given block if needed */ if (action == BLK_NEEDS_REDO) { Page page; PageHeader pageHeader; char *blockDelta; Size blockDeltaSize; page = BufferGetPage(buffers[block_id]); blockDelta = XLogRecGetBlockData(record, block_id, &blockDeltaSize); applyPageRedo(page, blockDelta, blockDeltaSize); /* * Since the delta contains no information about what's in the * "hole" between pd_lower and pd_upper, set that to zero to * ensure we produce the same page state that application of the * logged action by GenericXLogFinish did. */ pageHeader = (PageHeader) page; memset(page + pageHeader->pd_lower, 0, pageHeader->pd_upper - pageHeader->pd_lower); PageSetLSN(page, lsn); MarkBufferDirty(buffers[block_id]); } } /* Changes are done: unlock and release all buffers */ for (block_id = 0; block_id <= record->max_block_id; block_id++) { if (BufferIsValid(buffers[block_id])) UnlockReleaseBuffer(buffers[block_id]); } }
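/*
 * For context, a hedged sketch of the write side that produces the records
 * generic_redo replays.  The function name and the caller-supplied item are
 * illustrative only; the GenericXLogStart/RegisterBuffer/Finish/Abort calls
 * are the generic WAL API.  GenericXLogFinish computes the page delta, emits
 * the record, marks the buffer dirty and sets its LSN, so none of that is
 * done by hand here.
 */
static void
example_generic_insert(Relation rel, Buffer buf, Item item, Size itemsz)
{
	GenericXLogState *state;
	Page		page;

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	state = GenericXLogStart(rel);

	/* edits go to a working copy of the page, not to the buffer itself */
	page = GenericXLogRegisterBuffer(state, buf, 0);

	if (PageAddItem(page, item, itemsz, InvalidOffsetNumber,
					false, false) == InvalidOffsetNumber)
	{
		GenericXLogAbort(state);
		elog(ERROR, "example_generic_insert: failed to add item");
	}

	/* WAL-log the change and copy the modified image back into the buffer */
	GenericXLogFinish(state);

	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}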
static void ginRedoInsertListPage(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; ginxlogInsertListPage *data = (ginxlogInsertListPage *) XLogRecGetData(record); Buffer buffer; Page page; OffsetNumber l, off = FirstOffsetNumber; int i, tupsize; char *payload; IndexTuple tuples; Size totaltupsize; /* We always re-initialize the page. */ buffer = XLogInitBufferForRedo(record, 0); page = BufferGetPage(buffer); GinInitBuffer(buffer, GIN_LIST); GinPageGetOpaque(page)->rightlink = data->rightlink; if (data->rightlink == InvalidBlockNumber) { /* tail of sublist */ GinPageSetFullRow(page); GinPageGetOpaque(page)->maxoff = 1; } else { GinPageGetOpaque(page)->maxoff = 0; } payload = XLogRecGetBlockData(record, 0, &totaltupsize); tuples = (IndexTuple) payload; for (i = 0; i < data->ntuples; i++) { tupsize = IndexTupleSize(tuples); l = PageAddItem(page, (Item) tuples, tupsize, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page"); tuples = (IndexTuple) (((char *) tuples) + tupsize); off++; } Assert((char *) tuples == payload + totaltupsize); PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); }
static void ginRedoInsert(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; ginxlogInsert *data = (ginxlogInsert *) XLogRecGetData(record); Buffer buffer; #ifdef NOT_USED BlockNumber leftChildBlkno = InvalidBlockNumber; #endif BlockNumber rightChildBlkno = InvalidBlockNumber; bool isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0; /* * First clear incomplete-split flag on child page if this finishes a * split. */ if (!isLeaf) { char *payload = XLogRecGetData(record) + sizeof(ginxlogInsert); #ifdef NOT_USED leftChildBlkno = BlockIdGetBlockNumber((BlockId) payload); #endif payload += sizeof(BlockIdData); rightChildBlkno = BlockIdGetBlockNumber((BlockId) payload); payload += sizeof(BlockIdData); ginRedoClearIncompleteSplit(record, 1); } if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); Size len; char *payload = XLogRecGetBlockData(record, 0, &len); /* How to insert the payload is tree-type specific */ if (data->flags & GIN_INSERT_ISDATA) { Assert(GinPageIsData(page)); ginRedoInsertData(buffer, isLeaf, rightChildBlkno, payload); } else { Assert(!GinPageIsData(page)); ginRedoInsertEntry(buffer, isLeaf, rightChildBlkno, payload); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * Parse XLOG_HEAP_INSERT (not MULTI_INSERT!) records into tuplebufs. * * Inserts can contain the new tuple. */ static void DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { Size datalen; char *tupledata; Size tuplelen; XLogReaderState *r = buf->record; xl_heap_insert *xlrec; ReorderBufferChange *change; RelFileNode target_node; xlrec = (xl_heap_insert *) XLogRecGetData(r); /* * Ignore insert records without new tuples (this does happen when * raw_heap_insert marks the TOAST record as HEAP_INSERT_NO_LOGICAL). */ if (!(xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE)) return; /* only interested in our database */ XLogRecGetBlockTag(r, 0, &target_node, NULL, NULL); if (target_node.dbNode != ctx->slot->data.database) return; /* output plugin doesn't look for this origin, no need to queue */ if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) return; change = ReorderBufferGetChange(ctx->reorder); if (!(xlrec->flags & XLH_INSERT_IS_SPECULATIVE)) change->action = REORDER_BUFFER_CHANGE_INSERT; else change->action = REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT; change->origin_id = XLogRecGetOrigin(r); memcpy(&change->data.tp.relnode, &target_node, sizeof(RelFileNode)); tupledata = XLogRecGetBlockData(r, 0, &datalen); tuplelen = datalen - SizeOfHeapHeader; change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder, tuplelen); DecodeXLogTuple(tupledata, datalen, change->data.tp.newtuple); change->data.tp.clear_toast_afterwards = true; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); }
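/*
 * Hedged sketch of roughly what DecodeXLogTuple does for the insert/update
 * records above, pieced together from the copy that DecodeMultiInsert below
 * performs inline: the block data starts with an xl_heap_header followed by
 * the tuple payload, and the header fields are copied separately because the
 * WAL representation of the header differs from the on-disk one.  This is an
 * illustration of the shape of that helper, not the actual implementation.
 */
static void
DecodeXLogTupleSketch(char *data, Size len, ReorderBufferTupleBuf *tuple)
{
	xl_heap_header xlhdr;
	int			datalen = len - SizeOfHeapHeader;

	Assert(datalen >= 0);

	tuple->tuple.t_len = datalen + SizeofHeapTupleHeader;

	/* not a disk based tuple */
	ItemPointerSetInvalid(&tuple->tuple.t_self);

	/* relation OID is only known after reassembling the transaction */
	tuple->tuple.t_tableOid = InvalidOid;
	tuple->tuple.t_data = &tuple->t_data.header;

	/* data is not guaranteed to be aligned, so copy the header out first */
	memcpy(&xlhdr, data, SizeOfHeapHeader);

	memset(&tuple->t_data.header, 0, SizeofHeapTupleHeader);
	memcpy(((char *) &tuple->t_data.header) + SizeofHeapTupleHeader,
		   data + SizeOfHeapHeader,
		   datalen);

	tuple->t_data.header.t_infomask = xlhdr.t_infomask;
	tuple->t_data.header.t_infomask2 = xlhdr.t_infomask2;
	tuple->t_data.header.t_hoff = xlhdr.t_hoff;
}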
static void btree_xlog_insert(bool isleaf, bool ismeta, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record); Buffer buffer; Page page; /* * Insertion to an internal page finishes an incomplete split at the child * level. Clear the incomplete-split flag in the child. Note: during * normal operation, the child and parent pages are locked at the same * time, so that clearing the flag and inserting the downlink appear * atomic to other backends. We don't bother with that during replay, * because readers don't care about the incomplete-split flag and there * cannot be updates happening. */ if (!isleaf) _bt_clear_incomplete_split(record, 1); if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { Size datalen; char *datapos = XLogRecGetBlockData(record, 0, &datalen); page = BufferGetPage(buffer); if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber) elog(PANIC, "btree_insert_redo: failed to add item"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* * Note: in normal operation, we'd update the metapage while still holding * lock on the page we inserted into. But during replay it's not * necessary to hold that lock, since no other index updates can be * happening concurrently, and readers will cope fine with following an * obsolete link from the metapage. */ if (ismeta) _bt_restore_meta(record, 2); }
static void _bt_restore_meta(XLogReaderState *record, uint8 block_id) { XLogRecPtr lsn = record->EndRecPtr; Buffer metabuf; Page metapg; BTMetaPageData *md; BTPageOpaque pageop; xl_btree_metadata *xlrec; char *ptr; Size len; metabuf = XLogInitBufferForRedo(record, block_id); ptr = XLogRecGetBlockData(record, block_id, &len); Assert(len == sizeof(xl_btree_metadata)); Assert(BufferGetBlockNumber(metabuf) == BTREE_METAPAGE); xlrec = (xl_btree_metadata *) ptr; metapg = BufferGetPage(metabuf); _bt_pageinit(metapg, BufferGetPageSize(metabuf)); md = BTPageGetMeta(metapg); md->btm_magic = BTREE_MAGIC; md->btm_version = BTREE_VERSION; md->btm_root = xlrec->root; md->btm_level = xlrec->level; md->btm_fastroot = xlrec->fastroot; md->btm_fastlevel = xlrec->fastlevel; pageop = (BTPageOpaque) PageGetSpecialPointer(metapg); pageop->btpo_flags = BTP_META; /* * Set pd_lower just past the end of the metadata. This is not essential * but it makes the page look compressible to xlog.c. */ ((PageHeader) metapg)->pd_lower = ((char *) md + sizeof(BTMetaPageData)) - (char *) metapg; PageSetLSN(metapg, lsn); MarkBufferDirty(metabuf); UnlockReleaseBuffer(metabuf); }
/* * replay a hash index insert without split */ static void hash_xlog_insert(XLogReaderState *record) { HashMetaPage metap; XLogRecPtr lsn = record->EndRecPtr; xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record); Buffer buffer; Page page; if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { Size datalen; char *datapos = XLogRecGetBlockData(record, 0, &datalen); page = BufferGetPage(buffer); if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, false, false) == InvalidOffsetNumber) elog(PANIC, "hash_xlog_insert: failed to add item"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { /* * Note: in normal operation, we'd update the metapage while still * holding lock on the page we inserted into. But during replay it's * not necessary to hold that lock, since no other index updates can * be happening concurrently. */ page = BufferGetPage(buffer); metap = HashPageGetMeta(page); metap->hashm_ntuples += 1; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
static void btree_xlog_newroot(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_newroot *xlrec = (xl_btree_newroot *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque pageop; char *ptr; Size len; buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_flags = BTP_ROOT; pageop->btpo_prev = pageop->btpo_next = P_NONE; pageop->btpo.level = xlrec->level; if (xlrec->level == 0) pageop->btpo_flags |= BTP_LEAF; pageop->btpo_cycleid = 0; if (xlrec->level > 0) { ptr = XLogRecGetBlockData(record, 0, &len); _bt_restore_page(page, ptr, len); /* Clear the incomplete-split flag in left child */ _bt_clear_incomplete_split(record, 1); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); _bt_restore_meta(record, 2); }
static void ginRedoVacuumDataLeafPage(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; Buffer buffer; if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); Size len; ginxlogVacuumDataLeafPage *xlrec; xlrec = (ginxlogVacuumDataLeafPage *) XLogRecGetBlockData(record, 0, &len); Assert(GinPageIsLeaf(page)); Assert(GinPageIsData(page)); ginRedoRecompress(page, &xlrec->data); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * replay delete operation in hash index to remove * tuples marked as DEAD during index tuple insertion. */ static void hash_xlog_vacuum_one_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_vacuum_one_page *xldata; Buffer buffer; Buffer metabuf; Page page; XLogRedoAction action; HashPageOpaque pageopaque; xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record); /* * If we have any conflict processing to do, it must happen before we * update the page. * * Hash index records that are marked as LP_DEAD and being removed during * hash index tuple insertion can conflict with standby queries. You might * think that vacuum records would conflict as well, but we've handled * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid * cleaned by the vacuum of the heap and so we can resolve any conflicts * just once when that arrives. After that we know that no conflicts * exist from individual hash index vacuum records on that index. */ if (InHotStandby) { TransactionId latestRemovedXid = hash_xlog_vacuum_get_latestRemovedXid(record); RelFileNode rnode; XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rnode); } action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer); if (action == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 0, &len); page = (Page) BufferGetPage(buffer); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items. See comments * in _hash_vacuum_one_page() for details. */ pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) { Page metapage; HashMetaPage metap; metapage = BufferGetPage(metabuf); metap = HashPageGetMeta(metapage); metap->hashm_ntuples -= xldata->ntuples; PageSetLSN(metapage, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
/* * Get the latestRemovedXid from the heap pages pointed at by the index * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid, * on which this function is based. */ static TransactionId hash_xlog_vacuum_get_latestRemovedXid(XLogReaderState *record) { xl_hash_vacuum_one_page *xlrec; OffsetNumber *unused; Buffer ibuffer, hbuffer; Page ipage, hpage; RelFileNode rnode; BlockNumber blkno; ItemId iitemid, hitemid; IndexTuple itup; HeapTupleHeader htuphdr; BlockNumber hblkno; OffsetNumber hoffnum; TransactionId latestRemovedXid = InvalidTransactionId; int i; char *ptr; Size len; xlrec = (xl_hash_vacuum_one_page *) XLogRecGetData(record); /* * If there's nothing running on the standby we don't need to derive a * full latestRemovedXid value, so use a fast path out of here. This * returns InvalidTransactionId, and so will conflict with all HS * transactions; but since we just worked out that that's zero people, * it's OK. * * XXX There is a race condition here, which is that a new backend might * start just after we look. If so, it cannot need to conflict, but this * coding will result in throwing a conflict anyway. */ if (CountDBBackends(InvalidOid) == 0) return latestRemovedXid; /* * Get index page. If the DB is consistent, this should not fail, nor * should any of the heap page fetches below. If one does, we return * InvalidTransactionId to cancel all HS transactions. That's probably * overkill, but it's safe, and certainly better than panicking here. */ XLogRecGetBlockTag(record, 1, &rnode, NULL, &blkno); ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL); if (!BufferIsValid(ibuffer)) return InvalidTransactionId; LockBuffer(ibuffer, HASH_READ); ipage = (Page) BufferGetPage(ibuffer); /* * Loop through the deleted index items to obtain the TransactionId from * the heap items they point to. */ ptr = XLogRecGetBlockData(record, 1, &len); unused = (OffsetNumber *) ptr; for (i = 0; i < xlrec->ntuples; i++) { /* * Identify the index tuple about to be deleted. */ iitemid = PageGetItemId(ipage, unused[i]); itup = (IndexTuple) PageGetItem(ipage, iitemid); /* * Locate the heap page that the index tuple points at */ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL); if (!BufferIsValid(hbuffer)) { UnlockReleaseBuffer(ibuffer); return InvalidTransactionId; } LockBuffer(hbuffer, HASH_READ); hpage = (Page) BufferGetPage(hbuffer); /* * Look up the heap tuple header that the index tuple points at by * using the heap node supplied with the xlrec. We can't use * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer. * Note that we are not looking at tuple data here, just headers. */ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid)); hitemid = PageGetItemId(hpage, hoffnum); /* * Follow any redirections until we find something useful. */ while (ItemIdIsRedirected(hitemid)) { hoffnum = ItemIdGetRedirect(hitemid); hitemid = PageGetItemId(hpage, hoffnum); CHECK_FOR_INTERRUPTS(); } /* * If the heap item has storage, then read the header and use that to * set latestRemovedXid. * * Some LP_DEAD items may not be accessible, so we ignore them. */ if (ItemIdHasStorage(hitemid)) { htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid); HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid); } else if (ItemIdIsDead(hitemid)) { /* * Conjecture: if hitemid is dead then it had xids before the xids * marked on LP_NORMAL items. 
So we just ignore this item and move * onto the next, for the purposes of calculating * latestRemovedXid. */ } else Assert(!ItemIdIsUsed(hitemid)); UnlockReleaseBuffer(hbuffer); } UnlockReleaseBuffer(ibuffer); /* * If all heap tuples were LP_DEAD then we will be returning * InvalidTransactionId here, which avoids conflicts. This matches * existing logic which assumes that LP_DEAD tuples must already be older * than the latestRemovedXid on the cleanup record that set them as * LP_DEAD, hence must already have generated a conflict. */ return latestRemovedXid; }
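/*
 * Hedged illustration of the accumulation the loop above relies on:
 * HeapTupleHeaderAdvanceLatestRemovedXid (in heapam) keeps a running maximum
 * of the xids that deleted the heap tuples referenced by the removed index
 * entries.  The real routine also checks that xmin committed and handles
 * moved-off tuples; this sketch shows only the "keep the newest xmax seen so
 * far" core and is not the actual implementation.
 */
static void
advance_latest_removed_xid_sketch(TransactionId tuple_xmax,
								  TransactionId *latestRemovedXid)
{
	if (TransactionIdIsNormal(tuple_xmax) &&
		TransactionIdFollows(tuple_xmax, *latestRemovedXid))
		*latestRemovedXid = tuple_xmax;
}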
/* * replay allocation of page for split operation */ static void hash_xlog_split_allocate_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_split_allocate_page *xlrec = (xl_hash_split_allocate_page *) XLogRecGetData(record); Buffer oldbuf; Buffer newbuf; Buffer metabuf; Size datalen PG_USED_FOR_ASSERTS_ONLY; char *data; XLogRedoAction action; /* * To be consistent with normal operation, here we take cleanup locks on * both the old and new buckets even though there can't be any concurrent * inserts. */ /* replay the record for old bucket */ action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &oldbuf); /* * Note that we still update the page even if it was restored from a full * page image, because the special space is not included in the image. */ if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) { Page oldpage; HashPageOpaque oldopaque; oldpage = BufferGetPage(oldbuf); oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); oldopaque->hasho_flag = xlrec->old_bucket_flag; oldopaque->hasho_prevblkno = xlrec->new_bucket; PageSetLSN(oldpage, lsn); MarkBufferDirty(oldbuf); } /* replay the record for new bucket */ newbuf = XLogInitBufferForRedo(record, 1); _hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket, xlrec->new_bucket_flag, true); if (!IsBufferCleanupOK(newbuf)) elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock"); MarkBufferDirty(newbuf); PageSetLSN(BufferGetPage(newbuf), lsn); /* * We could release the lock on the old bucket earlier as well, but we do * it here to be consistent with normal operation. */ if (BufferIsValid(oldbuf)) UnlockReleaseBuffer(oldbuf); if (BufferIsValid(newbuf)) UnlockReleaseBuffer(newbuf); /* * Note: in normal operation, we'd update the meta page while still * holding lock on the old and new bucket pages. But during replay it's * not necessary to hold those locks, since no other bucket splits can be * happening concurrently. */ /* replay the record for metapage changes */ if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) { Page page; HashMetaPage metap; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_maxbucket = xlrec->new_bucket; data = XLogRecGetBlockData(record, 2, &datalen); if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) { uint32 lowmask; uint32 *highmask; /* extract low and high masks. */ memcpy(&lowmask, data, sizeof(uint32)); highmask = (uint32 *) ((char *) data + sizeof(uint32)); /* update metapage */ metap->hashm_lowmask = lowmask; metap->hashm_highmask = *highmask; data += sizeof(uint32) * 2; } if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) { uint32 ovflpoint; uint32 *ovflpages; /* extract information of overflow pages. */ memcpy(&ovflpoint, data, sizeof(uint32)); ovflpages = (uint32 *) ((char *) data + sizeof(uint32)); /* update metapage */ metap->hashm_spares[ovflpoint] = *ovflpages; metap->hashm_ovflpoint = ovflpoint; } MarkBufferDirty(metabuf); PageSetLSN(BufferGetPage(metabuf), lsn); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
/* * replay addition of overflow page for hash index */ static void hash_xlog_add_ovfl_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_add_ovfl_page *xlrec = (xl_hash_add_ovfl_page *) XLogRecGetData(record); Buffer leftbuf; Buffer ovflbuf; Buffer metabuf; BlockNumber leftblk; BlockNumber rightblk; BlockNumber newmapblk = InvalidBlockNumber; Page ovflpage; HashPageOpaque ovflopaque; uint32 *num_bucket; char *data; Size datalen PG_USED_FOR_ASSERTS_ONLY; bool new_bmpage = false; XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); ovflbuf = XLogInitBufferForRedo(record, 0); Assert(BufferIsValid(ovflbuf)); data = XLogRecGetBlockData(record, 0, &datalen); num_bucket = (uint32 *) data; Assert(datalen == sizeof(uint32)); _hash_initbuf(ovflbuf, InvalidBlockNumber, *num_bucket, LH_OVERFLOW_PAGE, true); /* update backlink */ ovflpage = BufferGetPage(ovflbuf); ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); ovflopaque->hasho_prevblkno = leftblk; PageSetLSN(ovflpage, lsn); MarkBufferDirty(ovflbuf); if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) { Page leftpage; HashPageOpaque leftopaque; leftpage = BufferGetPage(leftbuf); leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); leftopaque->hasho_nextblkno = rightblk; PageSetLSN(leftpage, lsn); MarkBufferDirty(leftbuf); } if (BufferIsValid(leftbuf)) UnlockReleaseBuffer(leftbuf); UnlockReleaseBuffer(ovflbuf); /* * Note: in normal operation, we'd update the bitmap and meta page while * still holding lock on the overflow pages. But during replay it's not * necessary to hold those locks, since no other index updates can be * happening concurrently. */ if (XLogRecHasBlockRef(record, 2)) { Buffer mapbuffer; if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) { Page mappage = (Page) BufferGetPage(mapbuffer); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; freep = HashPageGetBitmap(mappage); data = XLogRecGetBlockData(record, 2, &datalen); bitmap_page_bit = (uint32 *) data; SETBIT(freep, *bitmap_page_bit); PageSetLSN(mappage, lsn); MarkBufferDirty(mapbuffer); } if (BufferIsValid(mapbuffer)) UnlockReleaseBuffer(mapbuffer); } if (XLogRecHasBlockRef(record, 3)) { Buffer newmapbuf; newmapbuf = XLogInitBufferForRedo(record, 3); _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true); new_bmpage = true; newmapblk = BufferGetBlockNumber(newmapbuf); MarkBufferDirty(newmapbuf); PageSetLSN(BufferGetPage(newmapbuf), lsn); UnlockReleaseBuffer(newmapbuf); } if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) { HashMetaPage metap; Page page; uint32 *firstfree_ovflpage; data = XLogRecGetBlockData(record, 4, &datalen); firstfree_ovflpage = (uint32 *) data; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_firstfree = *firstfree_ovflpage; if (!xlrec->bmpage_found) { metap->hashm_spares[metap->hashm_ovflpoint]++; if (new_bmpage) { Assert(BlockNumberIsValid(newmapblk)); metap->hashm_mapp[metap->hashm_nmaps] = newmapblk; metap->hashm_nmaps++; metap->hashm_spares[metap->hashm_ovflpoint]++; } } PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); }
static void btree_xlog_split(bool onleft, bool isroot, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); bool isleaf = (xlrec->level == 0); Buffer lbuf; Buffer rbuf; Page rpage; BTPageOpaque ropaque; char *datapos; Size datalen; Item left_hikey = NULL; Size left_hikeysz = 0; BlockNumber leftsib; BlockNumber rightsib; BlockNumber rnext; XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib); XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib); if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext)) rnext = P_NONE; /* * Clear the incomplete split flag on the left sibling of the child page * this is a downlink for. (Like in btree_xlog_insert, this can be done * before locking the other pages) */ if (!isleaf) _bt_clear_incomplete_split(record, 3); /* Reconstruct right (new) sibling page from scratch */ rbuf = XLogInitBufferForRedo(record, 1); datapos = XLogRecGetBlockData(record, 1, &datalen); rpage = (Page) BufferGetPage(rbuf); _bt_pageinit(rpage, BufferGetPageSize(rbuf)); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); ropaque->btpo_prev = leftsib; ropaque->btpo_next = rnext; ropaque->btpo.level = xlrec->level; ropaque->btpo_flags = isleaf ? BTP_LEAF : 0; ropaque->btpo_cycleid = 0; _bt_restore_page(rpage, datapos, datalen); /* * On leaf level, the high key of the left page is equal to the first key * on the right page. */ if (isleaf) { ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); left_hikey = PageGetItem(rpage, hiItemId); left_hikeysz = ItemIdGetLength(hiItemId); } PageSetLSN(rpage, lsn); MarkBufferDirty(rbuf); /* don't release the buffer yet; we touch right page's first item below */ /* Now reconstruct left (original) sibling page */ if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO) { /* * To retain the same physical order of the tuples that they had, we * initialize a temporary empty page for the left page and add all the * items to that in item number order. This mirrors how _bt_split() * works. It's not strictly required to retain the same physical * order, as long as the items are in the correct item number order, * but it helps debugging. See also _bt_restore_page(), which does * the same for the right page. 
*/ Page lpage = (Page) BufferGetPage(lbuf); BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); OffsetNumber off; Item newitem = NULL; Size newitemsz = 0; Page newlpage; OffsetNumber leftoff; datapos = XLogRecGetBlockData(record, 0, &datalen); if (onleft) { newitem = (Item) datapos; newitemsz = MAXALIGN(IndexTupleSize(newitem)); datapos += newitemsz; datalen -= newitemsz; } /* Extract left hikey and its size (assuming 16-bit alignment) */ if (!isleaf) { left_hikey = (Item) datapos; left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); datapos += left_hikeysz; datalen -= left_hikeysz; } Assert(datalen == 0); newlpage = PageGetTempPageCopySpecial(lpage); /* Set high key */ leftoff = P_HIKEY; if (PageAddItem(newlpage, left_hikey, left_hikeysz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add high key to left page after split"); leftoff = OffsetNumberNext(leftoff); for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstright; off++) { ItemId itemid; Size itemsz; Item item; /* add the new item if it was inserted on left page */ if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } itemid = PageGetItemId(lpage, off); itemsz = ItemIdGetLength(itemid); item = PageGetItem(lpage, itemid); if (PageAddItem(newlpage, item, itemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add old item to left page after split"); leftoff = OffsetNumberNext(leftoff); } /* cope with possibility that newitem goes at the end */ if (onleft && off == xlrec->newitemoff) { if (PageAddItem(newlpage, newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } PageRestoreTempPage(newlpage, lpage); /* Fix opaque fields */ lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; if (isleaf) lopaque->btpo_flags |= BTP_LEAF; lopaque->btpo_next = rightsib; lopaque->btpo_cycleid = 0; PageSetLSN(lpage, lsn); MarkBufferDirty(lbuf); } /* We no longer need the buffers */ if (BufferIsValid(lbuf)) UnlockReleaseBuffer(lbuf); UnlockReleaseBuffer(rbuf); /* * Fix left-link of the page to the right of the new right sibling. * * Note: in normal operation, we do this while still holding lock on the * two split pages. However, that's not necessary for correctness in WAL * replay, because no other index update can be in progress, and readers * will cope properly when following an obsolete left-link. */ if (rnext != P_NONE) { Buffer buffer; if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { Page page = (Page) BufferGetPage(buffer); BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = rightsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } }
static void btree_xlog_vacuum(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque opaque; /* * If queries might be active then we need to ensure every leaf page is * unpinned between the lastBlockVacuumed and the current block, if there * are any. This prevents replay of the VACUUM from reaching the stage of * removing heap tuples while there could still be indexscans "in flight" * to those particular tuples (see nbtree/README). * * It might be worth checking if there are actually any backends running; * if not, we could just skip this. * * Since VACUUM can visit leaf pages out-of-order, it might issue records * with lastBlockVacuumed >= block; that's not an error, it just means * nothing to do now. * * Note: since we touch all pages in the range, we will lock non-leaf * pages, and also any empty (all-zero) pages that may be in the index. It * doesn't seem worth the complexity to avoid that. But it's important * that HotStandbyActiveInReplay() will not return true if the database * isn't yet consistent; so we need not fear reading still-corrupt blocks * here during crash recovery. */ if (HotStandbyActiveInReplay()) { RelFileNode thisrnode; BlockNumber thisblkno; BlockNumber blkno; XLogRecGetBlockTag(record, 0, &thisrnode, NULL, &thisblkno); for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) { /* * We use RBM_NORMAL_NO_LOG mode because it's not an error * condition to see all-zero pages. The original btvacuumpage * scan would have skipped over all-zero pages, noting them in FSM * but not bothering to initialize them just yet; so we mustn't * throw an error here. (We could skip acquiring the cleanup lock * if PageIsNew, but it's probably not worth the cycles to test.) * * XXX we don't actually need to read the block, we just need to * confirm it is unpinned. If we had a special call into the * buffer manager we could optimise this so that if the block is * not in shared_buffers we confirm it as unpinned. */ buffer = XLogReadBufferExtended(thisrnode, MAIN_FORKNUM, blkno, RBM_NORMAL_NO_LOG); if (BufferIsValid(buffer)) { LockBufferForCleanup(buffer); UnlockReleaseBuffer(buffer); } } } /* * Like in btvacuumpage(), we need to take a cleanup lock on every leaf * page. See nbtree/README for details. */ if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 0, &len); page = (Page) BufferGetPage(buffer); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items --- see comments * in _bt_delitems_vacuum(). */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HAS_GARBAGE; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * replay move of page contents for squeeze operation of hash index */ static void hash_xlog_move_page_contents(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record); Buffer bucketbuf = InvalidBuffer; Buffer writebuf = InvalidBuffer; Buffer deletebuf = InvalidBuffer; XLogRedoAction action; /* * Ensure we have a cleanup lock on primary bucket page before we start * with the actual replay operation. This is to ensure that neither a * scan can start nor a scan can be already-in-progress during the replay * of this operation. If we allow scans during this operation, then they * can miss some records or show the same record multiple times. */ if (xldata->is_prim_bucket_same_wrt) action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); else { /* * we don't care for return value as the purpose of reading bucketbuf * is to ensure a cleanup lock on primary bucket page. */ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); action = XLogReadBufferForRedo(record, 1, &writebuf); } /* replay the record for adding entries in overflow buffer */ if (action == BLK_NEEDS_REDO) { Page writepage; char *begin; char *data; Size datalen; uint16 ninserted = 0; data = begin = XLogRecGetBlockData(record, 1, &datalen); writepage = (Page) BufferGetPage(writebuf); if (xldata->ntups > 0) { OffsetNumber *towrite = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntups; while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size itemsz; OffsetNumber l; itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); data += itemsz; l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); if (l == InvalidOffsetNumber) elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes", (int) itemsz); ninserted++; } } /* * number of tuples inserted must be same as requested in REDO record. */ Assert(ninserted == xldata->ntups); PageSetLSN(writepage, lsn); MarkBufferDirty(writebuf); } /* replay the record for deleting entries from overflow buffer */ if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) { Page page; char *ptr; Size len; ptr = XLogRecGetBlockData(record, 2, &len); page = (Page) BufferGetPage(deletebuf); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } PageSetLSN(page, lsn); MarkBufferDirty(deletebuf); } /* * Replay is complete, now we can release the buffers. We release locks at * end of replay operation to ensure that we hold lock on primary bucket * page till end of operation. We can optimize by releasing the lock on * write buffer as soon as the operation for same is complete, if it is * not same as primary bucket page, but that doesn't seem to be worth * complicating the code. */ if (BufferIsValid(deletebuf)) UnlockReleaseBuffer(deletebuf); if (BufferIsValid(writebuf)) UnlockReleaseBuffer(writebuf); if (BufferIsValid(bucketbuf)) UnlockReleaseBuffer(bucketbuf); }
/* * redo any page update (except page split) */ static void gistRedoPageUpdateRecord(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); Buffer buffer; Page page; if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { char *begin; char *data; Size datalen; int ninserted = 0; data = begin = XLogRecGetBlockData(record, 0, &datalen); page = (Page) BufferGetPage(buffer); /* Delete old tuples */ if (xldata->ntodelete > 0) { int i; OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; for (i = 0; i < xldata->ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* add tuples */ if (data - begin < datalen) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); OffsetNumber l; data += sz; l = PageAddItem(page, (Item) itup, sz, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; ninserted++; } } Assert(ninserted == xldata->ntoinsert); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } /* * Fix follow-right data on left child page * * This must be done while still holding the lock on the target page. Note * that even if the target page no longer exists, we still attempt to * replay the change on the child page. */ if (XLogRecHasBlockRef(record, 1)) gistRedoClearFollowRight(record, 1); if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
static void ginRedoUpdateMetapage(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; ginxlogUpdateMeta *data = (ginxlogUpdateMeta *) XLogRecGetData(record); Buffer metabuffer; Page metapage; Buffer buffer; /* * Restore the metapage. This is essentially the same as a full-page * image, so restore the metapage unconditionally without looking at the * LSN, to avoid torn page hazards. */ metabuffer = XLogInitBufferForRedo(record, 0); Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); memcpy(GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); PageSetLSN(metapage, lsn); MarkBufferDirty(metabuffer); if (data->ntuples > 0) { /* * insert into tail page */ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); OffsetNumber off; int i; Size tupsize; char *payload; IndexTuple tuples; Size totaltupsize; payload = XLogRecGetBlockData(record, 1, &totaltupsize); tuples = (IndexTuple) payload; if (PageIsEmpty(page)) off = FirstOffsetNumber; else off = OffsetNumberNext(PageGetMaxOffsetNumber(page)); for (i = 0; i < data->ntuples; i++) { tupsize = IndexTupleSize(tuples); if (PageAddItem(page, (Item) tuples, tupsize, off, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page"); tuples = (IndexTuple) (((char *) tuples) + tupsize); off++; } Assert(payload + totaltupsize == (char *) tuples); /* * Increase counter of heap tuples */ GinPageGetOpaque(page)->maxoff++; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } else if (data->prevTail != InvalidBlockNumber) { /* * New tail */ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); GinPageGetOpaque(page)->rightlink = data->newRightlink; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } UnlockReleaseBuffer(metabuffer); }
/* * Common part of an insert or update. Inserts the new tuple and updates the * revmap. */ static void brin_xlog_insert_update(XLogReaderState *record, xl_brin_insert *xlrec) { XLogRecPtr lsn = record->EndRecPtr; Buffer buffer; BlockNumber regpgno; Page page; XLogRedoAction action; /* * If we inserted the first and only tuple on the page, re-initialize the * page from scratch. */ if (XLogRecGetInfo(record) & XLOG_BRIN_INIT_PAGE) { buffer = XLogInitBufferForRedo(record, 0); page = BufferGetPage(buffer); brin_page_init(page, BRIN_PAGETYPE_REGULAR); action = BLK_NEEDS_REDO; } else { action = XLogReadBufferForRedo(record, 0, &buffer); } /* need this page's blkno to store in revmap */ regpgno = BufferGetBlockNumber(buffer); /* insert the index item into the page */ if (action == BLK_NEEDS_REDO) { OffsetNumber offnum; BrinTuple *tuple; Size tuplen; tuple = (BrinTuple *) XLogRecGetBlockData(record, 0, &tuplen); Assert(tuple->bt_blkno == xlrec->heapBlk); page = (Page) BufferGetPage(buffer); offnum = xlrec->offnum; if (PageGetMaxOffsetNumber(page) + 1 < offnum) elog(PANIC, "brin_xlog_insert_update: invalid max offset number"); offnum = PageAddItem(page, (Item) tuple, tuplen, offnum, true, false); if (offnum == InvalidOffsetNumber) elog(PANIC, "brin_xlog_insert_update: failed to add tuple"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* update the revmap */ action = XLogReadBufferForRedo(record, 1, &buffer); if (action == BLK_NEEDS_REDO) { ItemPointerData tid; ItemPointerSet(&tid, regpgno, xlrec->offnum); page = (Page) BufferGetPage(buffer); brinSetHeapBlockItemptr(buffer, xlrec->pagesPerRange, xlrec->heapBlk, tid); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* XXX no FSM updates here ... */ }
static void gistRedoPageSplitRecord(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); Buffer firstbuffer = InvalidBuffer; Buffer buffer; Page page; int i; bool isrootsplit = false; /* * We must hold lock on the first-listed page throughout the action, * including while updating the left child page (if any). We can unlock * remaining pages in the list as soon as they've been written, because * there is no path for concurrent queries to reach those pages without * first visiting the first-listed page. */ /* loop around all pages */ for (i = 0; i < xldata->npage; i++) { int flags; char *data; Size datalen; int num; BlockNumber blkno; IndexTuple *tuples; XLogRecGetBlockTag(record, i + 1, NULL, NULL, &blkno); if (blkno == GIST_ROOT_BLKNO) { Assert(i == 0); isrootsplit = true; } buffer = XLogInitBufferForRedo(record, i + 1); page = (Page) BufferGetPage(buffer); data = XLogRecGetBlockData(record, i + 1, &datalen); tuples = decodePageSplitRecord(data, datalen, &num); /* ok, clear buffer */ if (xldata->origleaf && blkno != GIST_ROOT_BLKNO) flags = F_LEAF; else flags = 0; GISTInitBuffer(buffer, flags); /* and fill it */ gistfillbuffer(page, tuples, num, FirstOffsetNumber); if (blkno == GIST_ROOT_BLKNO) { GistPageGetOpaque(page)->rightlink = InvalidBlockNumber; GistPageSetNSN(page, xldata->orignsn); GistClearFollowRight(page); } else { if (i < xldata->npage - 1) { BlockNumber nextblkno; XLogRecGetBlockTag(record, i + 2, NULL, NULL, &nextblkno); GistPageGetOpaque(page)->rightlink = nextblkno; } else GistPageGetOpaque(page)->rightlink = xldata->origrlink; GistPageSetNSN(page, xldata->orignsn); if (i < xldata->npage - 1 && !isrootsplit && xldata->markfollowright) GistMarkFollowRight(page); else GistClearFollowRight(page); } PageSetLSN(page, lsn); MarkBufferDirty(buffer); if (i == 0) firstbuffer = buffer; else UnlockReleaseBuffer(buffer); } /* Fix follow-right data on left child page, if any */ if (XLogRecHasBlockRef(record, 0)) gistRedoClearFollowRight(record, 0); /* Finally, release lock on the first page */ UnlockReleaseBuffer(firstbuffer); }
/* * replay squeeze page operation of hash index */ static void hash_xlog_squeeze_page(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); Buffer bucketbuf = InvalidBuffer; Buffer writebuf; Buffer ovflbuf; Buffer prevbuf = InvalidBuffer; Buffer mapbuf; XLogRedoAction action; /* * Ensure we have a cleanup lock on primary bucket page before we start * with the actual replay operation. This is to ensure that neither a * scan can start nor a scan can be already-in-progress during the replay * of this operation. If we allow scans during this operation, then they * can miss some records or show the same record multiple times. */ if (xldata->is_prim_bucket_same_wrt) action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); else { /* * we don't care for return value as the purpose of reading bucketbuf * is to ensure a cleanup lock on primary bucket page. */ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); action = XLogReadBufferForRedo(record, 1, &writebuf); } /* replay the record for adding entries in overflow buffer */ if (action == BLK_NEEDS_REDO) { Page writepage; char *begin; char *data; Size datalen; uint16 ninserted = 0; data = begin = XLogRecGetBlockData(record, 1, &datalen); writepage = (Page) BufferGetPage(writebuf); if (xldata->ntups > 0) { OffsetNumber *towrite = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntups; while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size itemsz; OffsetNumber l; itemsz = IndexTupleDSize(*itup); itemsz = MAXALIGN(itemsz); data += itemsz; l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); if (l == InvalidOffsetNumber) elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", (int) itemsz); ninserted++; } } /* * number of tuples inserted must be same as requested in REDO record. */ Assert(ninserted == xldata->ntups); /* * if the page to which we are adding tuples is the page previous to the * freed overflow page, then update its nextblkno. 
*/ if (xldata->is_prev_bucket_same_wrt) { HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage); writeopaque->hasho_nextblkno = xldata->nextblkno; } PageSetLSN(writepage, lsn); MarkBufferDirty(writebuf); } /* replay the record for initializing overflow buffer */ if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) { Page ovflpage; ovflpage = BufferGetPage(ovflbuf); _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); PageSetLSN(ovflpage, lsn); MarkBufferDirty(ovflbuf); } if (BufferIsValid(ovflbuf)) UnlockReleaseBuffer(ovflbuf); /* replay the record for page previous to the freed overflow page */ if (!xldata->is_prev_bucket_same_wrt && XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) { Page prevpage = BufferGetPage(prevbuf); HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); prevopaque->hasho_nextblkno = xldata->nextblkno; PageSetLSN(prevpage, lsn); MarkBufferDirty(prevbuf); } if (BufferIsValid(prevbuf)) UnlockReleaseBuffer(prevbuf); /* replay the record for page next to the freed overflow page */ if (XLogRecHasBlockRef(record, 4)) { Buffer nextbuf; if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) { Page nextpage = BufferGetPage(nextbuf); HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); nextopaque->hasho_prevblkno = xldata->prevblkno; PageSetLSN(nextpage, lsn); MarkBufferDirty(nextbuf); } if (BufferIsValid(nextbuf)) UnlockReleaseBuffer(nextbuf); } if (BufferIsValid(writebuf)) UnlockReleaseBuffer(writebuf); if (BufferIsValid(bucketbuf)) UnlockReleaseBuffer(bucketbuf); /* * Note: in normal operation, we'd update the bitmap and meta page while * still holding lock on the primary bucket page and overflow pages. But * during replay it's not necessary to hold those locks, since no other * index updates can be happening concurrently. */ /* replay the record for bitmap page */ if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) { Page mappage = (Page) BufferGetPage(mapbuf); uint32 *freep = NULL; char *data; uint32 *bitmap_page_bit; Size datalen; freep = HashPageGetBitmap(mappage); data = XLogRecGetBlockData(record, 5, &datalen); bitmap_page_bit = (uint32 *) data; CLRBIT(freep, *bitmap_page_bit); PageSetLSN(mappage, lsn); MarkBufferDirty(mapbuf); } if (BufferIsValid(mapbuf)) UnlockReleaseBuffer(mapbuf); /* replay the record for meta page */ if (XLogRecHasBlockRef(record, 6)) { Buffer metabuf; if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) { HashMetaPage metap; Page page; char *data; uint32 *firstfree_ovflpage; Size datalen; data = XLogRecGetBlockData(record, 6, &datalen); firstfree_ovflpage = (uint32 *) data; page = BufferGetPage(metabuf); metap = HashPageGetMeta(page); metap->hashm_firstfree = *firstfree_ovflpage; PageSetLSN(page, lsn); MarkBufferDirty(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); } }
/* * Decode XLOG_HEAP2_MULTI_INSERT records into multiple tuplebufs. * * Currently MULTI_INSERT will always contain the full tuples. */ static void DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { XLogReaderState *r = buf->record; xl_heap_multi_insert *xlrec; int i; char *data; char *tupledata; Size tuplelen; RelFileNode rnode; xlrec = (xl_heap_multi_insert *) XLogRecGetData(r); /* only interested in our database */ XLogRecGetBlockTag(r, 0, &rnode, NULL, NULL); if (rnode.dbNode != ctx->slot->data.database) return; /* output plugin doesn't look for this origin, no need to queue */ if (FilterByOrigin(ctx, XLogRecGetOrigin(r))) return; tupledata = XLogRecGetBlockData(r, 0, &tuplelen); data = tupledata; for (i = 0; i < xlrec->ntuples; i++) { ReorderBufferChange *change; xl_multi_insert_tuple *xlhdr; int datalen; ReorderBufferTupleBuf *tuple; change = ReorderBufferGetChange(ctx->reorder); change->action = REORDER_BUFFER_CHANGE_INSERT; change->origin_id = XLogRecGetOrigin(r); memcpy(&change->data.tp.relnode, &rnode, sizeof(RelFileNode)); /* * CONTAINS_NEW_TUPLE will always be set currently as multi_insert * isn't used for catalogs, but better to be future proof. * * We decode the tuple in pretty much the same way as DecodeXLogTuple, * but since the layout is slightly different, we can't use it here. */ if (xlrec->flags & XLH_INSERT_CONTAINS_NEW_TUPLE) { change->data.tp.newtuple = ReorderBufferGetTupleBuf(ctx->reorder); tuple = change->data.tp.newtuple; /* not a disk based tuple */ ItemPointerSetInvalid(&tuple->tuple.t_self); xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(data); data = ((char *) xlhdr) + SizeOfMultiInsertTuple; datalen = xlhdr->datalen; /* * We can only figure this out after reassembling the * transactions. */ tuple->tuple.t_tableOid = InvalidOid; tuple->tuple.t_data = &tuple->t_data.header; tuple->tuple.t_len = datalen + SizeofHeapTupleHeader; memset(&tuple->t_data.header, 0, SizeofHeapTupleHeader); memcpy((char *) &tuple->t_data.header + SizeofHeapTupleHeader, (char *) data, datalen); data += datalen; tuple->t_data.header.t_infomask = xlhdr->t_infomask; tuple->t_data.header.t_infomask2 = xlhdr->t_infomask2; tuple->t_data.header.t_hoff = xlhdr->t_hoff; } /* * Reset toast reassembly state only after the last row in the last * xl_multi_insert_tuple record emitted by one heap_multi_insert() * call. */ if (xlrec->flags & XLH_INSERT_LAST_IN_MULTI && (i + 1) == xlrec->ntuples) change->data.tp.clear_toast_afterwards = true; else change->data.tp.clear_toast_afterwards = false; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); } Assert(data == tupledata + tuplelen); }
/* * redo any page update (except page split) */ static void gistRedoPageUpdateRecord(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); Buffer buffer; Page page; if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) { char *begin; char *data; Size datalen; int ninserted = 0; data = begin = XLogRecGetBlockData(record, 0, &datalen); page = (Page) BufferGetPage(buffer); if (xldata->ntodelete == 1 && xldata->ntoinsert == 1) { /* * When replacing one tuple with one other tuple, we must use * PageIndexTupleOverwrite for consistency with gistplacetopage. */ OffsetNumber offnum = *((OffsetNumber *) data); IndexTuple itup; Size itupsize; data += sizeof(OffsetNumber); itup = (IndexTuple) data; itupsize = IndexTupleSize(itup); if (!PageIndexTupleOverwrite(page, offnum, (Item) itup, itupsize)) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) itupsize); data += itupsize; /* should be nothing left after consuming 1 tuple */ Assert(data - begin == datalen); /* update insertion count for assert check below */ ninserted++; } else if (xldata->ntodelete > 0) { /* Otherwise, delete old tuples if any */ OffsetNumber *todelete = (OffsetNumber *) data; data += sizeof(OffsetNumber) * xldata->ntodelete; PageIndexMultiDelete(page, todelete, xldata->ntodelete); if (GistPageIsLeaf(page)) GistMarkTuplesDeleted(page); } /* Add new tuples if any */ if (data - begin < datalen) { OffsetNumber off = (PageIsEmpty(page)) ? FirstOffsetNumber : OffsetNumberNext(PageGetMaxOffsetNumber(page)); while (data - begin < datalen) { IndexTuple itup = (IndexTuple) data; Size sz = IndexTupleSize(itup); OffsetNumber l; data += sz; l = PageAddItem(page, (Item) itup, sz, off, false, false); if (l == InvalidOffsetNumber) elog(ERROR, "failed to add item to GiST index page, size %d bytes", (int) sz); off++; ninserted++; } } /* Check that XLOG record contained expected number of tuples */ Assert(ninserted == xldata->ntoinsert); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } /* * Fix follow-right data on left child page * * This must be done while still holding the lock on the target page. Note * that even if the target page no longer exists, we still attempt to * replay the change on the child page. */ if (XLogRecHasBlockRef(record, 1)) gistRedoClearFollowRight(record, 1); if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); }
/* * replay delete operation of hash index */ static void hash_xlog_delete(XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record); Buffer bucketbuf = InvalidBuffer; Buffer deletebuf; Page page; XLogRedoAction action; /* * Ensure we have a cleanup lock on primary bucket page before we start * with the actual replay operation. This is to ensure that neither a * scan can start nor a scan can be already-in-progress during the replay * of this operation. If we allow scans during this operation, then they * can miss some records or show the same record multiple times. */ if (xldata->is_primary_bucket_page) action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf); else { /* * we don't care for return value as the purpose of reading bucketbuf * is to ensure a cleanup lock on primary bucket page. */ (void) XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &bucketbuf); action = XLogReadBufferForRedo(record, 1, &deletebuf); } /* replay the record for deleting entries in bucket page */ if (action == BLK_NEEDS_REDO) { char *ptr; Size len; ptr = XLogRecGetBlockData(record, 1, &len); page = (Page) BufferGetPage(deletebuf); if (len > 0) { OffsetNumber *unused; OffsetNumber *unend; unused = (OffsetNumber *) ptr; unend = (OffsetNumber *) ((char *) ptr + len); if ((unend - unused) > 0) PageIndexMultiDelete(page, unused, unend - unused); } /* * Mark the page as not containing any LP_DEAD items only if * clear_dead_marking flag is set to true. See comments in * hashbucketcleanup() for details. */ if (xldata->clear_dead_marking) { HashPageOpaque pageopaque; pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; } PageSetLSN(page, lsn); MarkBufferDirty(deletebuf); } if (BufferIsValid(deletebuf)) UnlockReleaseBuffer(deletebuf); if (BufferIsValid(bucketbuf)) UnlockReleaseBuffer(bucketbuf); }