/*
 * Write a WAL record containing a full image of a page.
 *
 * Caller should initialize the buffer and mark it dirty before calling this
 * function.  This function will set the page LSN.
 *
 * If the page follows the standard page layout, with a PageHeader and unused
 * space between pd_lower and pd_upper, set 'page_std' to true.  That allows
 * the unused space to be left out of the WAL record, making it smaller.
 */
XLogRecPtr
log_newpage_buffer(Buffer buffer, bool page_std)
{
    Page        page = BufferGetPage(buffer);
    RelFileNode rnode;
    ForkNumber  forkNum;
    BlockNumber blkno;

    /* Shared buffers should be modified in a critical section. */
    Assert(CritSectionCount > 0);

    BufferGetTag(buffer, &rnode, &forkNum, &blkno);

    return log_newpage(&rnode, forkNum, blkno, page, page_std);
}
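/*
 * Illustrative usage sketch (not part of the original source): a caller
 * that fills in a freshly allocated page and WAL-logs the full image via
 * log_newpage_buffer().  The function name is hypothetical; the buffer is
 * assumed to be pinned and exclusively locked, per the contract above.
 */
static void
example_log_new_page(Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /* shared buffers must be modified inside a critical section */
    START_CRIT_SECTION();

    PageInit(page, BufferGetPageSize(buf), 0);  /* standard page layout */
    MarkBufferDirty(buf);

    /* page_std = true lets the pd_lower..pd_upper hole be omitted */
    (void) log_newpage_buffer(buf, true);

    END_CRIT_SECTION();
}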
/*
 * Register a reference to a buffer with the WAL record being constructed.
 * This must be called for every page that the WAL-logged operation modifies.
 */
void
XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
{
    registered_buffer *regbuf;

    /* NO_IMAGE doesn't make sense with FORCE_IMAGE */
    Assert(!((flags & REGBUF_FORCE_IMAGE) && (flags & REGBUF_NO_IMAGE)));
    Assert(begininsert_called);

    if (block_id >= max_registered_block_id)
    {
        if (block_id >= max_registered_buffers)
            elog(ERROR, "too many registered buffers");
        max_registered_block_id = block_id + 1;
    }

    regbuf = &registered_buffers[block_id];

    BufferGetTag(buffer, &regbuf->rnode, &regbuf->forkno, &regbuf->block);
    regbuf->page = BufferGetPage(buffer);
    regbuf->flags = flags;
    regbuf->rdata_tail = (XLogRecData *) &regbuf->rdata_head;
    regbuf->rdata_len = 0;

    /*
     * Check that this page hasn't already been registered with some other
     * block_id.
     */
#ifdef USE_ASSERT_CHECKING
    {
        int         i;

        for (i = 0; i < max_registered_block_id; i++)
        {
            registered_buffer *regbuf_old = &registered_buffers[i];

            if (i == block_id || !regbuf_old->in_use)
                continue;

            Assert(!RelFileNodeEquals(regbuf_old->rnode, regbuf->rnode) ||
                   regbuf_old->forkno != regbuf->forkno ||
                   regbuf_old->block != regbuf->block);
        }
    }
#endif

    regbuf->in_use = true;
}
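/*
 * Illustrative calling-convention sketch (not part of the original source):
 * the typical shape of a WAL-logged page modification.  The record payload,
 * rmgr id, and info byte here are placeholders; a real caller passes its own
 * resource manager's record struct and info flags.
 */
static XLogRecPtr
example_insert_record(Buffer buf, char *payload, int payload_len)
{
    XLogRecPtr  recptr;

    START_CRIT_SECTION();

    /* ... apply the change to the page held in 'buf' ... */
    MarkBufferDirty(buf);

    XLogBeginInsert();
    XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    XLogRegisterData(payload, payload_len);

    recptr = XLogInsert(RM_XLOG_ID, XLOG_NOOP); /* placeholder rmgr/info */
    PageSetLSN(BufferGetPage(buf), recptr);

    END_CRIT_SECTION();

    return recptr;
}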
/*
 * Redo insertion of one entry-tree tuple.  'rdata' points to the
 * ginxlogInsertEntry payload of the WAL record.  If the original insertion
 * accompanied a page split, 'rightblkno' is the new right sibling, and the
 * downlink in the existing tuple at 'offset' is updated to point to it.
 */
static void
ginRedoInsertEntry(Buffer buffer, bool isLeaf, BlockNumber rightblkno,
                   void *rdata)
{
    Page        page = BufferGetPage(buffer);
    ginxlogInsertEntry *data = (ginxlogInsertEntry *) rdata;
    OffsetNumber offset = data->offset;
    IndexTuple  itup;

    if (rightblkno != InvalidBlockNumber)
    {
        /* update link to right page after split */
        Assert(!GinPageIsLeaf(page));
        Assert(offset >= FirstOffsetNumber &&
               offset <= PageGetMaxOffsetNumber(page));
        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offset));
        GinSetDownlink(itup, rightblkno);
    }

    if (data->isDelete)
    {
        Assert(GinPageIsLeaf(page));
        Assert(offset >= FirstOffsetNumber &&
               offset <= PageGetMaxOffsetNumber(page));
        PageIndexTupleDelete(page, offset);
    }

    itup = &data->tuple;

    if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), offset,
                    false, false) == InvalidOffsetNumber)
    {
        RelFileNode node;
        ForkNumber  forknum;
        BlockNumber blknum;

        BufferGetTag(buffer, &node, &forknum, &blknum);
        elog(ERROR, "failed to add item to index page in %u/%u/%u",
             node.spcNode, node.dbNode, node.relNode);
    }
}
/*
 * Write a backup block if needed when we are setting a hint.  Note that
 * this may be called for a variety of page types, not just heaps.
 *
 * Callable while holding just share lock on the buffer content.
 *
 * We can't use the plain backup block mechanism since that relies on the
 * buffer being exclusively locked.  Since some modifications (setting the
 * LSN, hint bits) are allowed in a share-locked buffer, an image taken that
 * way could fail WAL checksum verification.  So instead we copy the page
 * and insert the copied data as normal record data.
 *
 * We only need to do something if the page has not yet been full-page
 * written in this checkpoint round.  The LSN of the inserted WAL record is
 * returned if we had to write, InvalidXLogRecPtr otherwise.
 *
 * It is possible that multiple concurrent backends could attempt to write
 * WAL records.  In that case, multiple copies of the same block would be
 * recorded in separate WAL records by different backends, though that is
 * still OK from a correctness perspective.
 */
XLogRecPtr
XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
{
    XLogRecPtr  recptr = InvalidXLogRecPtr;
    XLogRecPtr  lsn;
    XLogRecPtr  RedoRecPtr;

    /*
     * Ensure no checkpoint can change our view of RedoRecPtr.
     */
    Assert(MyPgXact->delayChkpt);

    /*
     * Update RedoRecPtr so that we can make the right decision
     */
    RedoRecPtr = GetRedoRecPtr();

    /*
     * We assume page LSN is first data on *every* page that can be passed to
     * XLogInsert, whether it has the standard page layout or not.  Since
     * we're only holding a share-lock on the page, we must take the buffer
     * header lock when we look at the LSN.
     */
    lsn = BufferGetLSNAtomic(buffer);

    if (lsn <= RedoRecPtr)
    {
        int         flags;
        char        copied_buffer[BLCKSZ];
        char       *origdata = (char *) BufferGetBlock(buffer);
        RelFileNode rnode;
        ForkNumber  forkno;
        BlockNumber blkno;

        /*
         * Copy buffer so we don't have to worry about concurrent hint bit or
         * lsn updates.  We assume pd_lower/upper cannot be changed without
         * an exclusive lock, so the contents of the copied image are not
         * racy.
         */
        if (buffer_std)
        {
            /* Assume we can omit data between pd_lower and pd_upper */
            Page        page = BufferGetPage(buffer);
            uint16      lower = ((PageHeader) page)->pd_lower;
            uint16      upper = ((PageHeader) page)->pd_upper;

            memcpy(copied_buffer, origdata, lower);
            memcpy(copied_buffer + upper, origdata + upper, BLCKSZ - upper);
        }
        else
            memcpy(copied_buffer, origdata, BLCKSZ);

        XLogBeginInsert();

        flags = REGBUF_FORCE_IMAGE;
        if (buffer_std)
            flags |= REGBUF_STANDARD;

        BufferGetTag(buffer, &rnode, &forkno, &blkno);
        XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer, flags);

        recptr = XLogInsert(RM_XLOG_ID, XLOG_FPI_FOR_HINT);
    }

    return recptr;
}
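/*
 * Illustrative call-pattern sketch (not part of the original source),
 * modeled on a hint-bit setter holding only a share lock: the caller must
 * set delayChkpt around the call (per the Assert above) and stamp the page
 * with the returned LSN so the hinted page cannot reach disk ahead of its
 * full-page image.  Simplified: a production caller sets the LSN while
 * holding the buffer header lock, since only a share lock is held on the
 * content.
 */
static void
example_wal_log_hint(Buffer buf, bool buffer_std)
{
    XLogRecPtr  lsn;

    /* keep our view of RedoRecPtr stable across the insert */
    MyPgXact->delayChkpt = true;

    lsn = XLogSaveBufferForHint(buf, buffer_std);
    if (!XLogRecPtrIsInvalid(lsn))
        PageSetLSN(BufferGetPage(buf), lsn);

    MyPgXact->delayChkpt = false;
}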
/*
 * Searches for a slot with category at least minvalue.
 * Returns slot number, or -1 if none found.
 *
 * The caller must hold at least a shared lock on the page, and this
 * function can unlock and lock the page again in exclusive mode if it
 * needs to be updated.  exclusive_lock_held should be set to true if the
 * caller is already holding an exclusive lock, to avoid extra work.
 *
 * If advancenext is false, fp_next_slot is set to point to the returned
 * slot, and if it's true, to the slot after the returned slot.
 */
int
fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext,
                 bool exclusive_lock_held)
{
    Page        page = BufferGetPage(buf);
    FSMPage     fsmpage = (FSMPage) PageGetContents(page);
    int         nodeno;
    int         target;
    uint16      slot;

restart:

    /*
     * Check the root first, and exit quickly if there's no leaf with enough
     * free space
     */
    if (fsmpage->fp_nodes[0] < minvalue)
        return -1;

    /*
     * Start search using fp_next_slot.  It's just a hint, so check that it's
     * sane.  (This also handles wrapping around when the prior call returned
     * the last slot on the page.)
     */
    target = fsmpage->fp_next_slot;
    if (target < 0 || target >= LeafNodesPerPage)
        target = 0;
    target += NonLeafNodesPerPage;

    /*----------
     * Start the search from the target slot.  At every step, move one
     * node to the right, then climb up to the parent.  Stop when we reach
     * a node with enough free space (as we must, since the root has enough
     * space).
     *
     * The idea is to gradually expand our "search triangle", that is, all
     * nodes covered by the current node, and to be sure we search to the
     * right from the start point.  At the first step, only the target slot
     * is examined.  When we move up from a left child to its parent, we are
     * adding the right-hand subtree of that parent to the search triangle.
     * When we move right then up from a right child, we are dropping the
     * current search triangle (which we know doesn't contain any suitable
     * page) and instead looking at the next-larger-size triangle to its
     * right.  So we never look left from our original start point, and at
     * each step the size of the search triangle doubles, ensuring it takes
     * only log2(N) work to search N pages.
     *
     * The "move right" operation will wrap around if it hits the right edge
     * of the tree, so the behavior is still good if we start near the right.
     * Note also that the move-and-climb behavior ensures that we can't end
     * up on one of the missing nodes at the right of the leaf level.
     *
     * For example, consider this tree:
     *
     *           7
     *       7       6
     *     5   7   6   5
     *    4 5 5 7 2 6 5 2
     *                T
     *
     * Assume that the target node is the node indicated by the letter T,
     * and we're searching for a node with value of 6 or higher.  The search
     * begins at T.  At the first iteration, we move to the right, then to
     * the parent, arriving at the rightmost 5.  At the second iteration, we
     * move to the right, wrapping around, then climb up, arriving at the 7
     * on the third level.  7 satisfies our search, so we descend down to
     * the bottom, following the path of sevens.  This is in fact the first
     * suitable page to the right of (allowing for wraparound) our start
     * point.
     *----------
     */
    nodeno = target;
    while (nodeno > 0)
    {
        if (fsmpage->fp_nodes[nodeno] >= minvalue)
            break;

        /*
         * Move to the right, wrapping around on same level if necessary,
         * then climb up.
         */
        nodeno = parentof(rightneighbor(nodeno));
    }

    /*
     * We're now at a node with enough free space, somewhere in the middle
     * of the tree.  Descend to the bottom, following a path with enough
     * free space, preferring to move left if there's a choice.
     */
    while (nodeno < NonLeafNodesPerPage)
    {
        int         childnodeno = leftchild(nodeno);

        if (childnodeno < NodesPerPage &&
            fsmpage->fp_nodes[childnodeno] >= minvalue)
        {
            nodeno = childnodeno;
            continue;
        }
        childnodeno++;          /* point to right child */
        if (childnodeno < NodesPerPage &&
            fsmpage->fp_nodes[childnodeno] >= minvalue)
        {
            nodeno = childnodeno;
        }
        else
        {
            /*
             * Oops. The parent node promised that either left or right
             * child has enough space, but neither actually did. This can
             * happen in case of a "torn page", IOW if we crashed earlier
             * while writing the page to disk, and only part of the page
             * made it to disk.
             *
             * Fix the corruption and restart.
             */
            RelFileNode rnode;
            ForkNumber  forknum;
            BlockNumber blknum;

            BufferGetTag(buf, &rnode, &forknum, &blknum);
            elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
                 blknum, rnode.spcNode, rnode.dbNode, rnode.relNode);

            /* make sure we hold an exclusive lock */
            if (!exclusive_lock_held)
            {
                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
                exclusive_lock_held = true;
            }
            fsm_rebuild_page(page);
            MarkBufferDirty(buf);
            goto restart;
        }
    }

    /* We're now at the bottom level, at a node with enough space. */
    slot = nodeno - NonLeafNodesPerPage;

    /*
     * Update the next-target pointer.  Note that we do this even if we're
     * only holding a shared lock, on the grounds that it's better to use a
     * shared lock and get a garbled next pointer every now and then, than
     * take the concurrency hit of an exclusive lock.
     *
     * Wrap-around is handled at the beginning of this function.
     */
    fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0);

    return slot;
}
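/*
 * Illustrative usage sketch (not part of the original source): searching a
 * single FSM page for a slot with at least 'min_cat' free space, taking
 * only a share lock up front and letting fsm_search_avail() escalate to an
 * exclusive lock itself if it has to repair a torn page.
 */
static int
example_search_fsm_page(Buffer buf, uint8 min_cat)
{
    int         slot;

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    slot = fsm_search_avail(buf, min_cat,
                            true,   /* advance fp_next_slot past the hit */
                            false); /* caller holds only a share lock */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    return slot;                /* -1 if no slot on this page qualifies */
}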