/* ----------------------------------------------------------------
 *		tidin
 * ----------------------------------------------------------------
 */
Datum
tidin(PG_FUNCTION_ARGS)
{
	char	   *str = PG_GETARG_CSTRING(0);
	char	   *p,
			   *coord[NTIDARGS];
	int			i;
	ItemPointer result;
	BlockNumber blockNumber;
	OffsetNumber offsetNumber;
	char	   *badp;
	int			hold_offset;

	for (i = 0, p = str; *p && i < NTIDARGS && *p != RDELIM; p++)
		if (*p == DELIM || (*p == LDELIM && !i))
			coord[i++] = p + 1;

	if (i < NTIDARGS)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type tid: \"%s\"",
						str)));

	errno = 0;
	blockNumber = strtoul(coord[0], &badp, 10);
	if (errno || *badp != DELIM)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type tid: \"%s\"",
						str)));

	hold_offset = strtol(coord[1], &badp, 10);
	if (errno || *badp != RDELIM ||
		hold_offset > USHRT_MAX || hold_offset < 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type tid: \"%s\"",
						str)));

	offsetNumber = hold_offset;

	result = (ItemPointer) palloc(sizeof(ItemPointerData));

	ItemPointerSet(result, blockNumber, offsetNumber);

	PG_RETURN_ITEMPOINTER(result);
}
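
/*
 * The parsing pattern above (locate the delimiters, then validate each
 * coordinate with strtoul/strtol plus a range check) can be exercised
 * outside the backend.  The following is a minimal standalone sketch, not
 * PostgreSQL code; the stand-in type names and the 16-bit offset limit
 * mirror the assumptions tidin makes.
 */
#include <errno.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint32_t BlockNumberStub;		/* simplified stand-in */
typedef uint16_t OffsetNumberStub;		/* simplified stand-in */

/* Parse "(block,offset)"; return false on any syntax or range error. */
static bool
parse_tid(const char *str, BlockNumberStub *blk, OffsetNumberStub *off)
{
	char	   *end;
	unsigned long block;
	long		offset;

	if (*str != '(')
		return false;
	errno = 0;
	block = strtoul(str + 1, &end, 10);
	if (errno || *end != ',')
		return false;
	offset = strtol(end + 1, &end, 10);
	if (errno || *end != ')' || offset < 0 || offset > USHRT_MAX)
		return false;
	*blk = (BlockNumberStub) block;
	*off = (OffsetNumberStub) offset;
	return true;
}

int
main(void)
{
	BlockNumberStub blk;
	OffsetNumberStub off;

	if (parse_tid("(4294967295,17)", &blk, &off))
		printf("block %u, offset %u\n", blk, off);
	return 0;
}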
/*
 *		tidrecv			- converts external binary format to tid
 */
Datum
tidrecv(PG_FUNCTION_ARGS)
{
	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
	ItemPointer result;
	BlockNumber blockNumber;
	OffsetNumber offsetNumber;

	blockNumber = pq_getmsgint(buf, sizeof(blockNumber));
	offsetNumber = pq_getmsgint(buf, sizeof(offsetNumber));

	result = (ItemPointer) palloc(sizeof(ItemPointerData));

	ItemPointerSet(result, blockNumber, offsetNumber);

	PG_RETURN_ITEMPOINTER(result);
}
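
/*
 * On the wire the value is simply a 4-byte block number followed by a
 * 2-byte offset number in network byte order, which is what the two
 * pq_getmsgint calls read.  A minimal standalone decoder of that layout,
 * with made-up names and no backend dependencies, might look like this:
 */
#include <stdint.h>
#include <stdio.h>

/* Decode the 6-byte external format: 4-byte block, 2-byte offset, big-endian. */
static void
decode_tid(const uint8_t *msg, uint32_t *block, uint16_t *offset)
{
	*block = ((uint32_t) msg[0] << 24) | ((uint32_t) msg[1] << 16) |
		((uint32_t) msg[2] << 8) | (uint32_t) msg[3];
	*offset = (uint16_t) (((uint16_t) msg[4] << 8) | msg[5]);
}

int
main(void)
{
	const uint8_t msg[6] = {0, 0, 0, 42, 0, 7};	/* block 42, offset 7 */
	uint32_t	block;
	uint16_t	offset;

	decode_tid(msg, &block, &offset);
	printf("(%u,%u)\n", block, offset);		/* prints (42,7) */
	return 0;
}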
/*
 * In the given revmap buffer (locked appropriately by caller), which is used
 * in a BRIN index of pagesPerRange pages per range, set the element
 * corresponding to heap block number heapBlk to the given TID.
 *
 * Once the operation is complete, the caller must update the LSN on the
 * returned buffer.
 *
 * This is used both in regular operation and during WAL replay.
 */
void
brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
						BlockNumber heapBlk, ItemPointerData tid)
{
	RevmapContents *contents;
	ItemPointerData *iptr;
	Page		page;

	/* The correct page should already be pinned and locked */
	page = BufferGetPage(buf);
	contents = (RevmapContents *) PageGetContents(page);
	iptr = (ItemPointerData *) contents->rm_tids;
	iptr += HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk);

	ItemPointerSet(iptr,
				   ItemPointerGetBlockNumber(&tid),
				   ItemPointerGetOffsetNumber(&tid));
}
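
/*
 * The interesting arithmetic hides in HEAPBLK_TO_REVMAP_INDEX: each revmap
 * page holds a fixed number of TIDs, so a heap block is first reduced to its
 * range number and then to a slot on some revmap page.  A standalone sketch
 * of that arithmetic follows; the items-per-page constant is hypothetical,
 * chosen only for the example.
 */
#include <stdint.h>
#include <stdio.h>

#define REVMAP_ITEMS_PER_PAGE 1360		/* illustrative capacity only */

/* Which revmap page and slot cover the given heap block? */
static void
revmap_locate(uint32_t heapBlk, uint32_t pagesPerRange,
			  uint32_t *revmapPage, uint32_t *slot)
{
	uint32_t	rangeNo = heapBlk / pagesPerRange;

	*revmapPage = rangeNo / REVMAP_ITEMS_PER_PAGE;
	*slot = rangeNo % REVMAP_ITEMS_PER_PAGE;
}

int
main(void)
{
	uint32_t	page,
				slot;

	revmap_locate(100000, 128, &page, &slot);
	printf("revmap page %u, slot %u\n", page, slot);
	return 0;
}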
/*
 * RelationPutHeapTuple - place tuple at specified page
 *
 * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!  Must PANIC on failure!!!
 *
 * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer.
 */
void
RelationPutHeapTuple(Relation relation,
					 Buffer buffer,
					 HeapTuple tuple,
					 bool token)
{
	Page		pageHeader;
	OffsetNumber offnum;

	/*
	 * A tuple that's being inserted speculatively should already have its
	 * token set.
	 */
	Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data));

	/* Add the tuple to the page */
	pageHeader = BufferGetPage(buffer);

	offnum = PageAddItem(pageHeader, (Item) tuple->t_data,
						 tuple->t_len, InvalidOffsetNumber, false, true);

	if (offnum == InvalidOffsetNumber)
		elog(PANIC, "failed to add tuple to page");

	/* Update tuple->t_self to the actual position where it was stored */
	ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);

	/*
	 * Insert the correct position into CTID of the stored tuple, too (unless
	 * this is a speculative insertion, in which case the token is held in
	 * CTID field instead)
	 */
	if (!token)
	{
		ItemId		itemId = PageGetItemId(pageHeader, offnum);
		HeapTupleHeader item = (HeapTupleHeader) PageGetItem(pageHeader, itemId);

		item->t_ctid = tuple->t_self;
	}
}
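
/*
 * The invariant maintained above: after a non-speculative insert, the
 * caller's in-memory copy (t_self) and the on-page tuple header (t_ctid)
 * both name the same (block, offset).  The toy model below demonstrates
 * only that bookkeeping; the types are simplified stand-ins, not the real
 * page machinery.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct
{
	uint32_t	block;
	uint16_t	offset;
} TidStub;

typedef struct
{
	TidStub		t_ctid;			/* lives in the "on-page" tuple header */
} TupleHeaderStub;

typedef struct
{
	TidStub		t_self;			/* caller-visible copy */
	TupleHeaderStub *t_data;	/* points at the "on-page" image */
} TupleStub;

/* Mimic the tail of RelationPutHeapTuple: record where the tuple landed. */
static void
record_placement(TupleStub *tuple, uint32_t block, uint16_t offnum)
{
	tuple->t_self.block = block;
	tuple->t_self.offset = offnum;
	tuple->t_data->t_ctid = tuple->t_self;	/* non-speculative case */
}

int
main(void)
{
	TupleHeaderStub onpage = {{0, 0}};
	TupleStub	tup = {{0, 0}, &onpage};

	record_placement(&tup, 33, 5);
	printf("t_self=(%u,%u) t_ctid=(%u,%u)\n",
		   tup.t_self.block, tup.t_self.offset,
		   onpage.t_ctid.block, onpage.t_ctid.offset);
	return 0;
}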
/*
 * Construct a "dead" tuple to replace a tuple being deleted.
 *
 * The state can be SPGIST_REDIRECT, SPGIST_DEAD, or SPGIST_PLACEHOLDER.
 * For a REDIRECT tuple, a pointer (blkno+offset) must be supplied, and
 * the xid field is filled in automatically.
 *
 * This is called in critical sections, so we don't use palloc; the tuple
 * is built in preallocated storage.  It should be copied before another
 * call with different parameters can occur.
 */
SpGistDeadTuple
spgFormDeadTuple(SpGistState *state, int tupstate,
				 BlockNumber blkno, OffsetNumber offnum)
{
	SpGistDeadTuple tuple = (SpGistDeadTuple) state->deadTupleStorage;

	tuple->tupstate = tupstate;
	tuple->size = SGDTSIZE;
	tuple->nextOffset = InvalidOffsetNumber;

	if (tupstate == SPGIST_REDIRECT)
	{
		ItemPointerSet(&tuple->pointer, blkno, offnum);
		tuple->xid = state->myXid;
	}
	else
	{
		ItemPointerSetInvalid(&tuple->pointer);
		tuple->xid = InvalidTransactionId;
	}

	return tuple;
}
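
/*
 * The same pattern -- a constructor that writes into caller-owned,
 * preallocated storage so it is safe to call inside a critical section --
 * can be shown in isolation.  Everything below is a simplified sketch with
 * made-up names, not the SP-GiST structures.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum deadstate
{
	STATE_DEAD,
	STATE_PLACEHOLDER,
	STATE_REDIRECT
};

typedef struct
{
	int			tupstate;
	uint32_t	blkno;			/* valid only for STATE_REDIRECT */
	uint16_t	offnum;
	uint32_t	xid;
} DeadStub;

/* Fill caller-supplied storage; no allocation, so usable anywhere. */
static DeadStub *
form_dead(DeadStub *storage, int tupstate,
		  uint32_t blkno, uint16_t offnum, uint32_t myxid)
{
	memset(storage, 0, sizeof(*storage));
	storage->tupstate = tupstate;
	if (tupstate == STATE_REDIRECT)
	{
		storage->blkno = blkno;
		storage->offnum = offnum;
		storage->xid = myxid;	/* remember who created the redirect */
	}
	return storage;
}

int
main(void)
{
	DeadStub	buf;
	DeadStub   *t = form_dead(&buf, STATE_REDIRECT, 10, 3, 1234);

	printf("state=%d target=(%u,%u) xid=%u\n",
		   t->tupstate, t->blkno, t->offnum, t->xid);
	return 0;
}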
static ArrayTuple vacuumSplitPage(GistVacuum *gv, Page tempPage, Buffer buffer, IndexTuple *addon, int curlenaddon) { ArrayTuple res = {NULL, 0, false}; IndexTuple *vec; SplitedPageLayout *dist = NULL, *ptr; int i, veclen = 0; BlockNumber blkno = BufferGetBlockNumber(buffer); MemoryContext oldCtx = MemoryContextSwitchTo(gv->opCtx); vec = gistextractpage(tempPage, &veclen); vec = gistjoinvector(vec, &veclen, addon, curlenaddon); dist = gistSplit(gv->index, tempPage, vec, veclen, &(gv->giststate)); MemoryContextSwitchTo(oldCtx); if (blkno != GIST_ROOT_BLKNO) { /* if non-root split then we should not allocate new buffer */ dist->buffer = buffer; dist->page = tempPage; /* during vacuum we never split leaf page */ GistPageGetOpaque(dist->page)->flags = 0; } else pfree(tempPage); res.itup = (IndexTuple *) palloc(sizeof(IndexTuple) * veclen); res.ituplen = 0; /* make new pages and fills them */ for (ptr = dist; ptr; ptr = ptr->next) { char *data; if (ptr->buffer == InvalidBuffer) { ptr->buffer = gistNewBuffer(gv->index); GISTInitBuffer(ptr->buffer, 0); ptr->page = BufferGetPage(ptr->buffer); } ptr->block.blkno = BufferGetBlockNumber(ptr->buffer); data = (char *) (ptr->list); for (i = 0; i < ptr->block.num; i++) { if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(gv->index)); data += IndexTupleSize((IndexTuple) data); } ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno); res.itup[res.ituplen] = (IndexTuple) palloc(IndexTupleSize(ptr->itup)); memcpy(res.itup[res.ituplen], ptr->itup, IndexTupleSize(ptr->itup)); res.ituplen++; } START_CRIT_SECTION(); for (ptr = dist; ptr; ptr = ptr->next) { MarkBufferDirty(ptr->buffer); GistPageGetOpaque(ptr->page)->rightlink = InvalidBlockNumber; } /* restore splitted non-root page */ if (blkno != GIST_ROOT_BLKNO) { PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer)); dist->page = BufferGetPage(dist->buffer); } if (!gv->index->rd_istemp) { XLogRecPtr recptr; XLogRecData *rdata; ItemPointerData key; /* set key for incomplete insert */ char *xlinfo; ItemPointerSet(&key, blkno, TUPLE_IS_VALID); rdata = formSplitRdata(gv->index->rd_node, blkno, false, &key, dist); xlinfo = rdata->data; recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_SPLIT, rdata); for (ptr = dist; ptr; ptr = ptr->next) { PageSetLSN(BufferGetPage(ptr->buffer), recptr); PageSetTLI(BufferGetPage(ptr->buffer), ThisTimeLineID); } pfree(xlinfo); pfree(rdata); } else { for (ptr = dist; ptr; ptr = ptr->next) PageSetLSN(BufferGetPage(ptr->buffer), XLogRecPtrForTemp); } for (ptr = dist; ptr; ptr = ptr->next) { /* we must keep the buffer pin on the head page */ if (BufferGetBlockNumber(ptr->buffer) != blkno) UnlockReleaseBuffer(ptr->buffer); } if (blkno == GIST_ROOT_BLKNO) { ItemPointerData key; /* set key for incomplete insert */ ItemPointerSet(&key, blkno, TUPLE_IS_VALID); gistnewroot(gv->index, buffer, res.itup, res.ituplen, &key); } END_CRIT_SECTION(); MemoryContextReset(gv->opCtx); return res; }
/* * Prune specified item pointer or a HOT chain originating at that item. * * If the item is an index-referenced tuple (i.e. not a heap-only tuple), * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really * DEAD, the OldestXmin test is just too coarse to detect it. * * The root line pointer is redirected to the tuple immediately after the * latest DEAD tuple. If all tuples in the chain are DEAD, the root line * pointer is marked LP_DEAD. (This includes the case of a DEAD simple * tuple, which we treat as a chain of length 1.) * * OldestXmin is the cutoff XID used to identify dead tuples. * * We don't actually change the page here, except perhaps for hint-bit updates * caused by HeapTupleSatisfiesVacuum. We just add entries to the arrays in * prstate showing the changes to be made. Items to be redirected are added * to the redirected[] array (two entries per redirection); items to be set to * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED * state are added to nowunused[]. * * If redirect_move is true, we intend to get rid of redirecting line pointers, * not just make redirection entries. * * Returns the number of tuples (to be) deleted from the page. */ static int heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, TransactionId OldestXmin, PruneState *prstate, bool redirect_move) { int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); TransactionId priorXmax = InvalidTransactionId; ItemId rootlp; HeapTupleHeader htup; OffsetNumber latestdead = InvalidOffsetNumber, redirect_target = InvalidOffsetNumber, maxoff = PageGetMaxOffsetNumber(dp), offnum; OffsetNumber chainitems[MaxHeapTuplesPerPage]; int nchain = 0, i; rootlp = PageGetItemId(dp, rootoffnum); /* * If it's a heap-only tuple, then it is not the start of a HOT chain. */ if (ItemIdIsNormal(rootlp)) { htup = (HeapTupleHeader) PageGetItem(dp, rootlp); if (HeapTupleHeaderIsHeapOnly(htup)) { /* * If the tuple is DEAD and doesn't chain to anything else, mark * it unused immediately. (If it does chain, we can only remove * it as part of pruning its chain.) * * We need this primarily to handle aborted HOT updates, that is, * XMIN_INVALID heap-only tuples. Those might not be linked to by * any chain, since the parent tuple might be re-updated before * any pruning occurs. So we have to be able to reap them * separately from chain-pruning. (Note that * HeapTupleHeaderIsHotUpdated will never return true for an * XMIN_INVALID tuple, so this code will work even when there were * sequential updates within the aborted transaction.) * * Note that we might first arrive at a dead heap-only tuple * either here or while following a chain below. Whichever path * gets there first will mark the tuple unused. 
*/ if (HeapTupleSatisfiesVacuum(relation, htup, OldestXmin, buffer) == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); ndeleted++; } /* Nothing more to do */ return ndeleted; } } /* Start from the root tuple */ offnum = rootoffnum; /* while not end of the chain */ for (;;) { ItemId lp; bool tupdead, recent_dead; /* Some sanity checks */ if (offnum < FirstOffsetNumber || offnum > maxoff) break; /* If item is already processed, stop --- it must not be same chain */ if (prstate->marked[offnum]) break; lp = PageGetItemId(dp, offnum); /* Unused item obviously isn't part of the chain */ if (!ItemIdIsUsed(lp)) break; /* * If we are looking at the redirected root line pointer, jump to the * first normal tuple in the chain. If we find a redirect somewhere * else, stop --- it must not be same chain. */ if (ItemIdIsRedirected(lp)) { if (nchain > 0) break; /* not at start of chain */ chainitems[nchain++] = offnum; offnum = ItemIdGetRedirect(rootlp); continue; } /* * Likewise, a dead item pointer can't be part of the chain. (We * already eliminated the case of dead root tuple outside this * function.) */ if (ItemIdIsDead(lp)) break; Assert(ItemIdIsNormal(lp)); htup = (HeapTupleHeader) PageGetItem(dp, lp); /* * Check the tuple XMIN against prior XMAX, if any */ if (TransactionIdIsValid(priorXmax) && !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) break; /* * OK, this tuple is indeed a member of the chain. */ chainitems[nchain++] = offnum; /* * Check tuple's visibility status. */ tupdead = recent_dead = false; switch (HeapTupleSatisfiesVacuum(relation, htup, OldestXmin, buffer)) { case HEAPTUPLE_DEAD: tupdead = true; break; case HEAPTUPLE_RECENTLY_DEAD: recent_dead = true; /* * This tuple may soon become DEAD. Update the hint field so * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, HeapTupleHeaderGetXmax(htup)); break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* * This tuple may soon become DEAD. Update the hint field so * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, HeapTupleHeaderGetXmax(htup)); break; case HEAPTUPLE_LIVE: case HEAPTUPLE_INSERT_IN_PROGRESS: /* * If we wanted to optimize for aborts, we might consider * marking the page prunable when we see INSERT_IN_PROGRESS. * But we don't. See related decisions about when to mark the * page prunable in heapam.c. */ break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } /* * Remember the last DEAD tuple seen. We will advance past * RECENTLY_DEAD tuples just in case there's a DEAD one after them; * but we can't advance past anything else. (XXX is it really worth * continuing to scan beyond RECENTLY_DEAD? The case where we will * find another DEAD tuple is a fairly unusual corner case.) */ if (tupdead) latestdead = offnum; else if (!recent_dead) break; /* * If the tuple is not HOT-updated, then we are at the end of this * HOT-update chain. */ if (!HeapTupleHeaderIsHotUpdated(htup)) break; /* * Advance to next chain member. */ Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetXmax(htup); } /* * If we found a DEAD tuple in the chain, adjust the HOT chain so that all * the DEAD tuples at the start of the chain are removed and the root line * pointer is appropriately redirected. 
*/ if (OffsetNumberIsValid(latestdead)) { /* * Mark as unused each intermediate item that we are able to remove * from the chain. * * When the previous item is the last dead tuple seen, we are at the * right candidate for redirection. */ for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) { heap_prune_record_unused(prstate, chainitems[i]); ndeleted++; } /* * If the root entry had been a normal tuple, we are deleting it, so * count it in the result. But changing a redirect (even to DEAD * state) doesn't count. */ if (ItemIdIsNormal(rootlp)) ndeleted++; /* * If the DEAD tuple is at the end of the chain, the entire chain is * dead and the root line pointer can be marked dead. Otherwise just * redirect the root to the correct chain member. */ if (i >= nchain) heap_prune_record_dead(prstate, rootoffnum); else { heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); /* If the redirection will be a move, need more processing */ if (redirect_move) redirect_target = chainitems[i]; } } else if (nchain < 2 && ItemIdIsRedirected(rootlp)) { /* * We found a redirect item that doesn't point to a valid follow-on * item. This can happen if the loop in heap_page_prune caused us to * visit the dead successor of a redirect item before visiting the * redirect item. We can clean up by setting the redirect item to * DEAD state. */ heap_prune_record_dead(prstate, rootoffnum); } else if (redirect_move && ItemIdIsRedirected(rootlp)) { /* * If we desire to eliminate LP_REDIRECT items by moving tuples, make * a redirection entry for each redirected root item; this will cause * heap_page_prune_execute to actually do the move. (We get here only * when there are no DEAD tuples in the chain; otherwise the * redirection entry was made above.) */ heap_prune_record_redirect(prstate, rootoffnum, chainitems[1]); redirect_target = chainitems[1]; } /* * If we are going to implement a redirect by moving tuples, we have to * issue a cache invalidation against the redirection target tuple, * because its CTID will be effectively changed by the move. Note that * CacheInvalidateHeapTuple only queues the request, it doesn't send it; * if we fail before reaching EndNonTransactionalInvalidation, nothing * happens and no harm is done. */ if (OffsetNumberIsValid(redirect_target)) { ItemId firstlp = PageGetItemId(dp, redirect_target); HeapTupleData firsttup; Assert(ItemIdIsNormal(firstlp)); /* Set up firsttup to reference the tuple at its existing CTID */ firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp); firsttup.t_len = ItemIdGetLength(firstlp); ItemPointerSet(&firsttup.t_self, BufferGetBlockNumber(buffer), redirect_target); CacheInvalidateHeapTuple(relation, &firsttup); } return ndeleted; }
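
/*
 * The decision logic at the end of the chain walk is compact: remember the
 * offset of the last DEAD member seen, mark everything up to it unused, and
 * then either kill the root (whole chain dead) or redirect it to the first
 * surviving member.  Below is a simplified standalone simulation of just
 * that bookkeeping, using a plain array as the "chain"; it ignores the
 * RECENTLY_DEAD subtlety, and all names are illustrative, not the backend's.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAXCHAIN 8

/*
 * Given the visibility of each chain member (true = dead, member 0 is the
 * root), report which members become unused and what happens to the root.
 */
static void
prune_chain(const bool dead[], int nchain)
{
	int			latestdead = -1;
	int			i;

	/* Find the last dead member reachable from the start of the chain. */
	for (i = 0; i < nchain; i++)
	{
		if (dead[i])
			latestdead = i;
		else
			break;				/* the real code also skips RECENTLY_DEAD here */
	}

	if (latestdead < 0)
	{
		printf("nothing to prune\n");
		return;
	}

	/* Members after the root, up to the last dead one, become unused. */
	for (i = 1; i <= latestdead; i++)
		printf("member %d -> LP_UNUSED\n", i);

	if (latestdead == nchain - 1)
		printf("root -> LP_DEAD (whole chain dead)\n");
	else
		printf("root -> redirect to member %d\n", latestdead + 1);
}

int
main(void)
{
	bool		chain[MAXCHAIN] = {true, true, false, false};

	prune_chain(chain, 4);
	return 0;
}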
/* * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so. * * This action unlinks the page from the b-tree structure, removing all * pointers leading to it --- but not touching its own left and right links. * The page cannot be physically reclaimed right away, since other processes * may currently be trying to follow links leading to the page; they have to * be allowed to use its right-link to recover. See nbtree/README. * * On entry, the target buffer must be pinned and locked (either read or write * lock is OK). This lock and pin will be dropped before exiting. * * The "stack" argument can be a search stack leading (approximately) to the * target page, or NULL --- outside callers typically pass NULL since they * have not done such a search, but internal recursion cases pass the stack * to avoid duplicated search effort. * * Returns the number of pages successfully deleted (zero if page cannot * be deleted now; could be more than one if parent pages were deleted too). * * NOTE: this leaks memory. Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. */ int _bt_pagedel(Relation rel, Buffer buf, BTStack stack, bool vacuum_full) { int result; BlockNumber target, leftsib, rightsib, parent; OffsetNumber poffset, maxoff; uint32 targetlevel, ilevel; ItemId itemid; IndexTuple targetkey, itup; ScanKey itup_scankey; Buffer lbuf, rbuf, pbuf; bool parent_half_dead; bool parent_one_child; bool rightsib_empty; Buffer metabuf = InvalidBuffer; Page metapg = NULL; BTMetaPageData *metad = NULL; Page page; BTPageOpaque opaque; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; // Fetch gp_persistent_relation_node information that will be added to XLOG record. RelationFetchGpRelationNodeForXLog(rel); /* * We can never delete rightmost pages nor root pages. While at it, check * that page is not already deleted and is empty. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) { /* Should never fail to delete a half-dead page */ Assert(!P_ISHALFDEAD(opaque)); _bt_relbuf(rel, buf); return 0; } /* * Save info about page, including a copy of its high key (it must have * one, being non-rightmost). */ target = BufferGetBlockNumber(buf); targetlevel = opaque->btpo.level; leftsib = opaque->btpo_prev; itemid = PageGetItemId(page, P_HIKEY); targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); /* * To avoid deadlocks, we'd better drop the target page lock before going * further. */ _bt_relbuf(rel, buf); /* * We need an approximate pointer to the page's parent page. We use the * standard search mechanism to search for the page's high key; this will * give us a link to either the current parent or someplace to its left * (if there are multiple equal high keys). In recursion cases, the * caller already generated a search stack and we can just re-use that * work. */ if (stack == NULL) { if (!InRecovery) { /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, targetkey); /* find the leftmost leaf page containing this key */ stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false, &lbuf, BT_READ); /* don't need a pin on that either */ _bt_relbuf(rel, lbuf); /* * If we are trying to delete an interior page, _bt_search did * more than we needed. Locate the stack item pointing to our * parent level. 
*/ ilevel = 0; for (;;) { if (stack == NULL) elog(ERROR, "not enough stack items"); if (ilevel == targetlevel) break; stack = stack->bts_parent; ilevel++; } } else { /* * During WAL recovery, we can't use _bt_search (for one reason, * it might invoke user-defined comparison functions that expect * facilities not available in recovery mode). Instead, just set * up a dummy stack pointing to the left end of the parent tree * level, from which _bt_getstackbuf will walk right to the parent * page. Painful, but we don't care too much about performance in * this scenario. */ pbuf = _bt_get_endpoint(rel, targetlevel + 1, false); stack = (BTStack) palloc(sizeof(BTStackData)); stack->bts_blkno = BufferGetBlockNumber(pbuf); stack->bts_offset = InvalidOffsetNumber; /* bts_btentry will be initialized below */ stack->bts_parent = NULL; _bt_relbuf(rel, pbuf); } } /* * We cannot delete a page that is the rightmost child of its immediate * parent, unless it is the only child --- in which case the parent has to * be deleted too, and the same condition applies recursively to it. We * have to check this condition all the way up before trying to delete. We * don't need to re-test when deleting a non-leaf page, though. */ if (targetlevel == 0 && !_bt_parent_deletion_safe(rel, target, stack)) return 0; /* * We have to lock the pages we need to modify in the standard order: * moving right, then up. Else we will deadlock against other writers. * * So, we need to find and write-lock the current left sibling of the * target page. The sibling that was current a moment ago could have * split, so we may have to move right. This search could fail if either * the sibling or the target page was deleted by someone else meanwhile; * if so, give up. (Right now, that should never happen, since page * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs * concurrently on the same table.) */ if (leftsib != P_NONE) { lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); while (P_ISDELETED(opaque) || opaque->btpo_next != target) { /* step right one page */ leftsib = opaque->btpo_next; _bt_relbuf(rel, lbuf); if (leftsib == P_NONE) { elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"", RelationGetRelationName(rel)); return 0; } lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } } else lbuf = InvalidBuffer; /* * Next write-lock the target page itself. It should be okay to take just * a write lock not a superexclusive lock, since no scans would stop on an * empty page. */ buf = _bt_getbuf(rel, target, BT_WRITE); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* * Check page is still empty etc, else abandon deletion. The empty check * is necessary since someone else might have inserted into it while we * didn't have it locked; the others are just for paranoia's sake. */ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) { _bt_relbuf(rel, buf); if (BufferIsValid(lbuf)) _bt_relbuf(rel, lbuf); return 0; } if (opaque->btpo_prev != leftsib) elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); /* * And next write-lock the (current) right sibling. 
*/ rightsib = opaque->btpo_next; rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (opaque->btpo_prev != target) elog(ERROR, "right sibling's left-link doesn't match: " "block %u links to %u instead of expected %u in index \"%s\"", rightsib, opaque->btpo_prev, target, RelationGetRelationName(rel)); /* * Next find and write-lock the current parent of the target page. This is * essentially the same as the corresponding step of splitting. */ ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY); pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); if (pbuf == InvalidBuffer) elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u", RelationGetRelationName(rel), target); parent = stack->bts_blkno; poffset = stack->bts_offset; /* * If the target is the rightmost child of its parent, then we can't * delete, unless it's also the only child --- in which case the parent * changes to half-dead status. The "can't delete" case should have been * detected by _bt_parent_deletion_safe, so complain if we see it now. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); maxoff = PageGetMaxOffsetNumber(page); parent_half_dead = false; parent_one_child = false; if (poffset >= maxoff) { if (poffset == P_FIRSTDATAKEY(opaque)) parent_half_dead = true; else elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"", target, parent, RelationGetRelationName(rel)); } else { /* Will there be exactly one child left in this parent? */ if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff) parent_one_child = true; } /* * If we are deleting the next-to-last page on the target's level, then * the rightsib is a candidate to become the new fast root. (In theory, it * might be possible to push the fast root even further down, but the odds * of doing so are slim, and the locking considerations daunting.) * * We don't support handling this in the case where the parent is becoming * half-dead, even though it theoretically could occur. * * We can safely acquire a lock on the metapage here --- see comments for * _bt_newroot(). */ if (leftsib == P_NONE && !parent_half_dead) { page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo.level == targetlevel); if (P_RIGHTMOST(opaque)) { /* rightsib will be the only one left on the level */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); /* * The expected case here is btm_fastlevel == targetlevel+1; if * the fastlevel is <= targetlevel, something is wrong, and we * choose to overwrite it to fix it. */ if (metad->btm_fastlevel > targetlevel + 1) { /* no update wanted */ _bt_relbuf(rel, metabuf); metabuf = InvalidBuffer; } } } /* * Check that the parent-page index items we're about to delete/overwrite * contain what we expect. This can fail if the index has become * corrupt for some reason. We want to throw any error before entering * the critical section --- otherwise it'd be a PANIC. * * The test on the target item is just an Assert because _bt_getstackbuf * should have guaranteed it has the expected contents. The test on the * next-child downlink is known to sometimes fail in the field, though. 
*/ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); #ifdef USE_ASSERT_CHECKING itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target); #endif if (!parent_half_dead) { OffsetNumber nextoffset; nextoffset = OffsetNumberNext(poffset); itemid = PageGetItemId(page, nextoffset); itup = (IndexTuple) PageGetItem(page, itemid); if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib) elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"", rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)), parent, RelationGetRelationName(rel)); } /* * Here we begin doing the deletion. */ /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* * Update parent. The normal case is a tad tricky because we want to * delete the target's downlink and the *following* key. Easiest way is * to copy the right sibling's downlink over the target downlink, and then * delete the following item. */ if (parent_half_dead) { PageIndexTupleDelete(page, poffset); opaque->btpo_flags |= BTP_HALF_DEAD; } else { OffsetNumber nextoffset; itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY); nextoffset = OffsetNumberNext(poffset); PageIndexTupleDelete(page, nextoffset); } /* * Update siblings' side-links. Note the target page's side-links will * continue to point to the siblings. Asserts here are just rechecking * things we already verified above. */ if (BufferIsValid(lbuf)) { page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo_next == target); opaque->btpo_next = rightsib; } page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo_prev == target); opaque->btpo_prev = leftsib; rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); /* * Mark the page itself deleted. It can be recycled when all current * transactions are gone; or immediately if we're doing VACUUM FULL. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HALF_DEAD; opaque->btpo_flags |= BTP_DELETED; opaque->btpo.xact = vacuum_full ? 
FrozenTransactionId : ReadNewTransactionId(); /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) { metad->btm_fastroot = rightsib; metad->btm_fastlevel = targetlevel; MarkBufferDirty(metabuf); } /* Must mark buffers dirty before XLogInsert */ MarkBufferDirty(pbuf); MarkBufferDirty(rbuf); MarkBufferDirty(buf); if (BufferIsValid(lbuf)) MarkBufferDirty(lbuf); /* XLOG stuff */ if (!rel->rd_istemp) { xl_btree_delete_page xlrec; xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; XLogRecData rdata[5]; XLogRecData *nextrdata; xl_btreetid_set(&(xlrec.target), rel, parent, poffset); xlrec.deadblk = target; xlrec.leftblk = leftsib; xlrec.rightblk = rightsib; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeDeletePage; rdata[0].buffer = InvalidBuffer; rdata[0].next = nextrdata = &(rdata[1]); if (BufferIsValid(metabuf)) { xlmeta.root = metad->btm_root; xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; nextrdata->data = (char *) &xlmeta; nextrdata->len = sizeof(xl_btree_metadata); nextrdata->buffer = InvalidBuffer; nextrdata->next = nextrdata + 1; nextrdata++; xlinfo = XLOG_BTREE_DELETE_PAGE_META; } else if (parent_half_dead) xlinfo = XLOG_BTREE_DELETE_PAGE_HALF; else xlinfo = XLOG_BTREE_DELETE_PAGE; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->next = nextrdata + 1; nextrdata->buffer = pbuf; nextrdata->buffer_std = true; nextrdata++; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->buffer = rbuf; nextrdata->buffer_std = true; nextrdata->next = NULL; if (BufferIsValid(lbuf)) { nextrdata->next = nextrdata + 1; nextrdata++; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->buffer = lbuf; nextrdata->buffer_std = true; nextrdata->next = NULL; } recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); if (BufferIsValid(metabuf)) { PageSetLSN(metapg, recptr); PageSetTLI(metapg, ThisTimeLineID); } page = BufferGetPage(pbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(rbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(buf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); if (BufferIsValid(lbuf)) { page = BufferGetPage(lbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } END_CRIT_SECTION(); /* release metapage; send out relcache inval if metapage changed */ if (BufferIsValid(metabuf)) { CacheInvalidateRelcache(rel); _bt_relbuf(rel, metabuf); } /* can always release leftsib immediately */ if (BufferIsValid(lbuf)) _bt_relbuf(rel, lbuf); /* * If parent became half dead, recurse to delete it. Otherwise, if right * sibling is empty and is now the last child of the parent, recurse to * try to delete it. (These cases cannot apply at the same time, though * the second case might itself recurse to the first.) * * When recursing to parent, we hold the lock on the target page until * done. This delays any insertions into the keyspace that was just * effectively reassigned to the parent's right sibling. If we allowed * that, and there were enough such insertions before we finish deleting * the parent, page splits within that keyspace could lead to inserting * out-of-order keys into the grandparent level. It is thought that that * wouldn't have any serious consequences, but it still seems like a * pretty bad idea. 
*/ if (parent_half_dead) { /* recursive call will release pbuf */ _bt_relbuf(rel, rbuf); result = _bt_pagedel(rel, pbuf, stack->bts_parent, vacuum_full) + 1; _bt_relbuf(rel, buf); } else if (parent_one_child && rightsib_empty) { _bt_relbuf(rel, pbuf); _bt_relbuf(rel, buf); /* recursive call will release rbuf */ result = _bt_pagedel(rel, rbuf, stack, vacuum_full) + 1; } else { _bt_relbuf(rel, pbuf); _bt_relbuf(rel, buf); _bt_relbuf(rel, rbuf); result = 1; } return result; }
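
/*
 * Stripped of locking and WAL, the core page-level change made by
 * _bt_pagedel is a doubly-linked-list unlink in which the deleted node
 * keeps its own side links, so a concurrent scan that already landed on it
 * can still follow the right link to recover.  A minimal sketch of that
 * idea with an ordinary in-memory list (illustrative names only):
 */
#include <stdio.h>

typedef struct PageNode
{
	int			blkno;
	struct PageNode *left;
	struct PageNode *right;
} PageNode;

/*
 * Unlink target from its level: the siblings stop pointing at it, but the
 * target's own left/right pointers are deliberately left untouched.
 */
static void
unlink_page(PageNode *target)
{
	if (target->left)
		target->left->right = target->right;
	if (target->right)
		target->right->left = target->left;
	/* target->left and target->right stay valid for in-flight readers */
}

int
main(void)
{
	PageNode	a = {1, NULL, NULL},
				b = {2, NULL, NULL},
				c = {3, NULL, NULL};

	a.right = &b;
	b.left = &a;
	b.right = &c;
	c.left = &b;

	unlink_page(&b);
	printf("a.right=%d, c.left=%d, b.right still=%d\n",
		   a.right->blkno, c.left->blkno, b.right->blkno);
	return 0;
}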
/* * Update tuple origtup (size origsz), located in offset oldoff of buffer * oldbuf, to newtup (size newsz) as summary tuple for the page range starting * at heapBlk. oldbuf must not be locked on entry, and is not locked at exit. * * If samepage is true, attempt to put the new tuple in the same page, but if * there's no room, use some other one. * * If the update is successful, return true; the revmap is updated to point to * the new tuple. If the update is not done for whatever reason, return false. * Caller may retry the update if this happens. */ bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage) { Page oldpage; ItemId oldlp; BrinTuple *oldtup; Size oldsz; Buffer newbuf; bool extended; Assert(newsz == MAXALIGN(newsz)); /* If the item is oversized, don't bother. */ if (newsz > BrinMaxItemSize) { ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %lu exceeds maximum %lu for index \"%s\"", (unsigned long) newsz, (unsigned long) BrinMaxItemSize, RelationGetRelationName(idxrel)))); return false; /* keep compiler quiet */ } /* make sure the revmap is long enough to contain the entry we need */ brinRevmapExtend(revmap, heapBlk); if (!samepage) { /* need a page on which to put the item */ newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended); if (!BufferIsValid(newbuf)) { Assert(!extended); return false; } /* * Note: it's possible (though unlikely) that the returned newbuf is * the same as oldbuf, if brin_getinsertbuffer determined that the old * buffer does in fact have enough space. */ if (newbuf == oldbuf) { Assert(!extended); newbuf = InvalidBuffer; } } else { LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE); newbuf = InvalidBuffer; extended = false; } oldpage = BufferGetPage(oldbuf); oldlp = PageGetItemId(oldpage, oldoff); /* * Check that the old tuple wasn't updated concurrently: it might have * moved someplace else entirely ... */ if (!ItemIdIsNormal(oldlp)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); /* * If this happens, and the new buffer was obtained by extending the * relation, then we need to ensure we don't leave it uninitialized or * forget about it. */ if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } oldsz = ItemIdGetLength(oldlp); oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp); /* * ... or it might have been updated in place to different contents. */ if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz)) { LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (BufferIsValid(newbuf)) { if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); if (extended) FreeSpaceMapVacuum(idxrel); } return false; } /* * Great, the old tuple is intact. We can proceed with the update. * * If there's enough room in the old page for the new tuple, replace it. * * Note that there might now be enough space on the page even though the * caller told us there isn't, if a concurrent update moved another tuple * elsewhere or replaced a tuple with a smaller one. 
*/ if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) && brin_can_do_samepage_update(oldbuf, origsz, newsz)) { if (BufferIsValid(newbuf)) { /* as above */ if (extended) brin_initialize_empty_new_buffer(idxrel, newbuf); UnlockReleaseBuffer(newbuf); } START_CRIT_SECTION(); if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) newtup, newsz)) elog(ERROR, "failed to replace BRIN tuple"); MarkBufferDirty(oldbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_samepage_update xlrec; XLogRecPtr recptr; uint8 info = XLOG_BRIN_SAMEPAGE_UPDATE; xlrec.offnum = oldoff; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, SizeOfBrinSamepageUpdate); XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD); XLogRegisterBufData(0, (char *) newtup, newsz); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); } END_CRIT_SECTION(); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); if (extended) FreeSpaceMapVacuum(idxrel); return true; } else if (newbuf == InvalidBuffer) { /* * Not enough space, but caller said that there was. Tell them to * start over. */ LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); return false; } else { /* * Not enough free space on the oldpage. Put the new tuple on the new * page, and update the revmap. */ Page newpage = BufferGetPage(newbuf); Buffer revmapbuf; ItemPointerData newtid; OffsetNumber newoff; BlockNumber newblk = InvalidBlockNumber; Size freespace = 0; revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk); START_CRIT_SECTION(); /* * We need to initialize the page if it's newly obtained. Note we * will WAL-log the initialization as part of the update, so we don't * need to do that here. */ if (extended) brin_page_init(BufferGetPage(newbuf), BRIN_PAGETYPE_REGULAR); PageIndexTupleDeleteNoCompact(oldpage, oldoff); newoff = PageAddItem(newpage, (Item) newtup, newsz, InvalidOffsetNumber, false, false); if (newoff == InvalidOffsetNumber) elog(ERROR, "failed to add BRIN tuple to new page"); MarkBufferDirty(oldbuf); MarkBufferDirty(newbuf); /* needed to update FSM below */ if (extended) { newblk = BufferGetBlockNumber(newbuf); freespace = br_page_get_freespace(newpage); } ItemPointerSet(&newtid, BufferGetBlockNumber(newbuf), newoff); brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid); MarkBufferDirty(revmapbuf); /* XLOG stuff */ if (RelationNeedsWAL(idxrel)) { xl_brin_update xlrec; XLogRecPtr recptr; uint8 info; info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0); xlrec.insert.offnum = newoff; xlrec.insert.heapBlk = heapBlk; xlrec.insert.pagesPerRange = pagesPerRange; xlrec.oldOffnum = oldoff; XLogBeginInsert(); /* new page */ XLogRegisterData((char *) &xlrec, SizeOfBrinUpdate); XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0)); XLogRegisterBufData(0, (char *) newtup, newsz); /* revmap page */ XLogRegisterBuffer(1, revmapbuf, 0); /* old page */ XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD); recptr = XLogInsert(RM_BRIN_ID, info); PageSetLSN(oldpage, recptr); PageSetLSN(newpage, recptr); PageSetLSN(BufferGetPage(revmapbuf), recptr); } END_CRIT_SECTION(); LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK); LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK); UnlockReleaseBuffer(newbuf); if (extended) { Assert(BlockNumberIsValid(newblk)); RecordPageWithFreeSpace(idxrel, newblk, freespace); FreeSpaceMapVacuum(idxrel); } return true; } }
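
/*
 * When the tuple has to move, the step that makes the move visible is the
 * last one: the revmap slot for the block range is overwritten with the
 * tuple's new TID (via brinSetHeapBlockItemptr above).  A toy model of that
 * repointing, with a plain array standing in for the revmap; sizes and
 * names here are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct
{
	uint32_t	block;
	uint16_t	offset;
} TidSlot;

#define NRANGES 4

static TidSlot revmap[NRANGES];

/* After moving the summary tuple for a range, repoint its revmap slot. */
static void
repoint_range(uint32_t heapBlk, uint32_t pagesPerRange,
			  uint32_t newblock, uint16_t newoff)
{
	uint32_t	rangeNo = heapBlk / pagesPerRange;

	revmap[rangeNo].block = newblock;
	revmap[rangeNo].offset = newoff;
}

int
main(void)
{
	repoint_range(300, 128, 7, 2);	/* range 2 now summarized at (7,2) */
	printf("range 2 -> (%u,%u)\n", revmap[2].block, revmap[2].offset);
	return 0;
}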
/*
 *	_bt_step() -- Step one item in the requested direction in a scan on
 *				  the tree.
 *
 *		*bufP is the current buffer (read-locked and pinned).  If we change
 *		pages, it's updated appropriately.
 *
 *		If successful, update scan's currentItemData and return true.
 *		If no adjacent record exists in the requested direction,
 *		release buffer pin/locks and return false.
 */
bool
_bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	ItemPointer current = &(scan->currentItemData);
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	Page		page;
	BTPageOpaque opaque;
	OffsetNumber offnum,
				maxoff;
	BlockNumber blkno;

	/*
	 * Don't use ItemPointerGetOffsetNumber here: it would trigger an
	 * assertion failure, since ip_posid can legitimately be zero at this
	 * point.
	 */
	offnum = current->ip_posid;

	page = BufferGetPage(*bufP);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	maxoff = PageGetMaxOffsetNumber(page);

	if (ScanDirectionIsForward(dir))
	{
		if (!PageIsEmpty(page) && offnum < maxoff)
			offnum = OffsetNumberNext(offnum);
		else
		{
			/* Walk right to the next page with data */
			for (;;)
			{
				/* if we're at end of scan, release the buffer and return */
				if (P_RIGHTMOST(opaque))
				{
					_bt_relbuf(rel, *bufP);
					ItemPointerSetInvalid(current);
					*bufP = so->btso_curbuf = InvalidBuffer;
					return false;
				}
				/* step right one page */
				blkno = opaque->btpo_next;
				_bt_relbuf(rel, *bufP);
				*bufP = _bt_getbuf(rel, blkno, BT_READ);
				page = BufferGetPage(*bufP);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
				if (!P_IGNORE(opaque))
				{
					maxoff = PageGetMaxOffsetNumber(page);
					/* done if it's not empty */
					offnum = P_FIRSTDATAKEY(opaque);
					if (!PageIsEmpty(page) && offnum <= maxoff)
						break;
				}
			}
		}
	}
	else						/* backwards scan */
	{
		if (offnum > P_FIRSTDATAKEY(opaque))
			offnum = OffsetNumberPrev(offnum);
		else
		{
			/*
			 * Walk left to the next page with data.  This is much more
			 * complex than the walk-right case because of the possibility
			 * that the page to our left splits while we are in flight to
			 * it, plus the possibility that the page we were on gets
			 * deleted after we leave it.  See nbtree/README for details.
			 */
			for (;;)
			{
				*bufP = _bt_walk_left(rel, *bufP);

				/* if we're at end of scan, return failure */
				if (*bufP == InvalidBuffer)
				{
					ItemPointerSetInvalid(current);
					so->btso_curbuf = InvalidBuffer;
					return false;
				}
				page = BufferGetPage(*bufP);
				opaque = (BTPageOpaque) PageGetSpecialPointer(page);

				/*
				 * Okay, we managed to move left to a non-deleted page.
				 * Done if it's not half-dead and not empty.  Else loop
				 * back and do it all again.
				 */
				if (!P_IGNORE(opaque))
				{
					maxoff = PageGetMaxOffsetNumber(page);
					offnum = maxoff;
					if (!PageIsEmpty(page) &&
						maxoff >= P_FIRSTDATAKEY(opaque))
						break;
				}
			}
		}
	}

	/* Update scan state */
	so->btso_curbuf = *bufP;
	blkno = BufferGetBlockNumber(*bufP);
	ItemPointerSet(current, blkno, offnum);

	return true;
}
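
/*
 * The forward case above is essentially "advance within the page, else walk
 * right until a non-ignorable, non-empty page is found".  The loop below
 * simulates that on an array of toy pages; it is not btree code and all
 * names are made up.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct
{
	int			nitems;			/* 0 means the page is empty */
	bool		ignore;			/* stands in for half-dead/deleted pages */
} ToyPage;

/*
 * Step forward from (pageno, slot); return true and update the position,
 * or return false at the end of the "index".
 */
static bool
step_forward(const ToyPage *pages, int npages, int *pageno, int *slot)
{
	if (*slot < pages[*pageno].nitems - 1)
	{
		(*slot)++;
		return true;
	}
	while (++(*pageno) < npages)
	{
		if (!pages[*pageno].ignore && pages[*pageno].nitems > 0)
		{
			*slot = 0;
			return true;
		}
	}
	return false;				/* ran off the right end */
}

int
main(void)
{
	ToyPage		pages[] = {{2, false}, {0, false}, {1, true}, {3, false}};
	int			pageno = 0,
				slot = 1;

	while (step_forward(pages, 4, &pageno, &slot))
		printf("now at page %d slot %d\n", pageno, slot);
	return 0;
}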
/*
 * Finish writing out the completed btree.
 */
static void
_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
{
	BTPageState *s;
	BlockNumber rootblkno = P_NONE;
	uint32		rootlevel = 0;
	Page		metapage;

	/*
	 * Each iteration of this loop completes one more level of the tree.
	 */
	for (s = state; s != NULL; s = s->btps_next)
	{
		BlockNumber blkno;
		BTPageOpaque opaque;

		blkno = s->btps_blkno;
		opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);

		/*
		 * We have to link the last page on this level to somewhere.
		 *
		 * If we're at the top, it's the root, so attach it to the metapage.
		 * Otherwise, add an entry for it to its parent using its minimum key.
		 * This may cause the last page of the parent level to split, but
		 * that's not a problem -- we haven't gotten to it yet.
		 */
		if (s->btps_next == NULL)
		{
			opaque->btpo_flags |= BTP_ROOT;
			rootblkno = blkno;
			rootlevel = s->btps_level;
		}
		else
		{
			Assert(s->btps_minkey != NULL);
			ItemPointerSet(&(s->btps_minkey->t_tid), blkno, P_HIKEY);
			_bt_buildadd(wstate, s->btps_next, s->btps_minkey);
			pfree(s->btps_minkey);
			s->btps_minkey = NULL;
		}

		/*
		 * This is the rightmost page, so the ItemId array needs to be slid
		 * back one slot.  Then we can dump out the page.
		 */
		_bt_slideleft(s->btps_page);
		_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
		s->btps_page = NULL;	/* writepage freed the workspace */
	}

	/*
	 * As the last step in the process, construct the metapage and make it
	 * point to the new root (unless we had no data at all, in which case it's
	 * set to point to "P_NONE").  This changes the index to the "valid" state
	 * by filling in a valid magic number in the metapage.
	 */
	metapage = (Page) palloc(BLCKSZ);
	_bt_initmetapage(metapage, rootblkno, rootlevel);
	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}
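
/*
 * The shutdown loop walks up the rightmost edge of the tree being built:
 * each level's final page gets a downlink in its parent, and the topmost
 * page becomes the root recorded in the metapage.  A condensed model of
 * that walk, using toy types rather than the nbtsort structures:
 */
#include <stdio.h>

typedef struct LevelState
{
	int			blkno;			/* last page written on this level */
	struct LevelState *parent;	/* next level up, NULL at the top */
} LevelState;

static void
finish_levels(LevelState *bottom)
{
	LevelState *s;

	for (s = bottom; s != NULL; s = s->parent)
	{
		if (s->parent == NULL)
			printf("block %d becomes the root\n", s->blkno);
		else
			printf("add downlink for block %d into its parent level\n",
				   s->blkno);
	}
}

int
main(void)
{
	LevelState	leaf = {10, NULL},
				mid = {4, NULL},
				top = {1, NULL};

	leaf.parent = &mid;
	mid.parent = &top;
	finish_levels(&leaf);
	return 0;
}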
/*
 * Insert a tuple to the new relation.  This has to track heap_insert
 * and its subsidiary functions!
 *
 * t_self of the tuple is set to the new TID of the tuple.  If t_ctid of the
 * tuple is invalid on entry, it's replaced with the new TID as well (in
 * the inserted data only, not in the caller's copy).
 */
static void
raw_heap_insert(RewriteState state, HeapTuple tup)
{
	Page		page = state->rs_buffer;
	Size		pageFreeSpace,
				saveFreeSpace;
	Size		len;
	OffsetNumber newoff;
	HeapTuple	heaptup;

	/*
	 * If the new tuple is too big for storage or contains already toasted
	 * out-of-line attributes from some other relation, invoke the toaster.
	 *
	 * Note: below this point, heaptup is the data we actually intend to store
	 * into the relation; tup is the caller's original untoasted data.
	 */
	if (state->rs_new_rel->rd_rel->relkind == RELKIND_TOASTVALUE)
	{
		/* toast table entries should never be recursively toasted */
		Assert(!HeapTupleHasExternal(tup));
		heaptup = tup;
	}
	else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
		heaptup = toast_insert_or_update(state->rs_new_rel, tup, NULL,
										 HEAP_INSERT_SKIP_FSM |
										 (state->rs_use_wal ?
										  0 : HEAP_INSERT_SKIP_WAL));
	else
		heaptup = tup;

	len = MAXALIGN(heaptup->t_len);		/* be conservative */

	/*
	 * If we're gonna fail for oversize tuple, do it right away
	 */
	if (len > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %zu, maximum size %zu",
						len, MaxHeapTupleSize)));

	/* Compute desired extra freespace due to fillfactor option */
	saveFreeSpace = RelationGetTargetPageFreeSpace(state->rs_new_rel,
												   HEAP_DEFAULT_FILLFACTOR);

	/* Now we can check to see if there's enough free space already. */
	if (state->rs_buffer_valid)
	{
		pageFreeSpace = PageGetHeapFreeSpace(page);

		if (len + saveFreeSpace > pageFreeSpace)
		{
			/* Doesn't fit, so write out the existing page */

			/* XLOG stuff */
			if (state->rs_use_wal)
				log_newpage(&state->rs_new_rel->rd_node,
							MAIN_FORKNUM,
							state->rs_blockno,
							page,
							true);

			/*
			 * Now write the page. We say isTemp = true even if it's not a
			 * temp table, because there's no need for smgr to schedule an
			 * fsync for this write; we'll do it ourselves in
			 * end_heap_rewrite.
			 */
			RelationOpenSmgr(state->rs_new_rel);

			PageSetChecksumInplace(page, state->rs_blockno);

			smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
					   state->rs_blockno, (char *) page, true);

			state->rs_blockno++;
			state->rs_buffer_valid = false;
		}
	}

	if (!state->rs_buffer_valid)
	{
		/* Initialize a new empty page */
		PageInit(page, BLCKSZ, 0);
		state->rs_buffer_valid = true;
	}

	/* And now we can insert the tuple into the page */
	newoff = PageAddItem(page, (Item) heaptup->t_data, heaptup->t_len,
						 InvalidOffsetNumber, false, true);
	if (newoff == InvalidOffsetNumber)
		elog(ERROR, "failed to add tuple");

	/* Update caller's t_self to the actual position where it was stored */
	ItemPointerSet(&(tup->t_self), state->rs_blockno, newoff);

	/*
	 * Insert the correct position into CTID of the stored tuple, too, if the
	 * caller didn't supply a valid CTID.
	 */
	if (!ItemPointerIsValid(&tup->t_data->t_ctid))
	{
		ItemId		newitemid;
		HeapTupleHeader onpage_tup;

		newitemid = PageGetItemId(page, newoff);
		onpage_tup = (HeapTupleHeader) PageGetItem(page, newitemid);

		onpage_tup->t_ctid = tup->t_self;
	}

	/* If heaptup is a private copy, release it. */
	if (heaptup != tup)
		heap_freetuple(heaptup);
}
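
/*
 * The page-switch decision is the heart of this routine: a tuple goes on
 * the current page only if it fits while still leaving the fillfactor
 * reserve; otherwise the page is written out and a fresh one is started.
 * A stripped-down version of that accounting follows; the constants and
 * names are illustrative, not the backend's.
 */
#include <stdio.h>

#define PAGE_USABLE			8000	/* pretend usable bytes per page */
#define FILLFACTOR_RESERVE	800		/* pretend space kept free for updates */

typedef struct
{
	int			used;			/* bytes consumed on the current page */
	int			pages_written;
} WriterState;

/* Place one tuple of the given (aligned) size, starting a new page if needed. */
static void
place_tuple(WriterState *st, int len)
{
	int			freespace = PAGE_USABLE - st->used;

	if (len + FILLFACTOR_RESERVE > freespace)
	{
		st->pages_written++;	/* "write out" the full page */
		st->used = 0;
	}
	st->used += len;
}

int
main(void)
{
	WriterState st = {0, 0};
	int			i;

	for (i = 0; i < 100; i++)
		place_tuple(&st, 500);
	printf("pages written: %d, bytes on current page: %d\n",
		   st.pages_written, st.used);
	return 0;
}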
/*
 *	_bt_endpoint() -- Find the first or last key in the index.
 *
 * This is used by _bt_first() to set up a scan when we've determined
 * that the scan must start at the beginning or end of the index (for
 * a forward or backward scan respectively).
 */
static bool
_bt_endpoint(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;
	ItemPointer current;
	OffsetNumber maxoff;
	OffsetNumber start;
	BlockNumber blkno;
	BTItem		btitem;
	IndexTuple	itup;
	BTScanOpaque so;
	bool		res;
	bool		continuescan;

	rel = scan->indexRelation;
	current = &(scan->currentItemData);
	so = (BTScanOpaque) scan->opaque;

	/*
	 * Scan down to the leftmost or rightmost leaf page.  This is a
	 * simplified version of _bt_search().  We don't maintain a stack
	 * since we know we won't need it.
	 */
	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

	if (!BufferIsValid(buf))
	{
		/* empty index... */
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		return false;
	}

	blkno = BufferGetBlockNumber(buf);
	page = BufferGetPage(buf);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Assert(P_ISLEAF(opaque));
	maxoff = PageGetMaxOffsetNumber(page);

	if (ScanDirectionIsForward(dir))
	{
		/* There could be dead pages to the left, so not this: */
		/* Assert(P_LEFTMOST(opaque)); */

		start = P_FIRSTDATAKEY(opaque);
	}
	else if (ScanDirectionIsBackward(dir))
	{
		Assert(P_RIGHTMOST(opaque));

		start = PageGetMaxOffsetNumber(page);
		if (start < P_FIRSTDATAKEY(opaque))		/* watch out for empty page */
			start = P_FIRSTDATAKEY(opaque);
	}
	else
	{
		elog(ERROR, "invalid scan direction: %d", (int) dir);
		start = 0;				/* keep compiler quiet */
	}

	ItemPointerSet(current, blkno, start);

	/* remember which buffer we have pinned */
	so->btso_curbuf = buf;

	/*
	 * Left/rightmost page could be empty due to deletions, if so step
	 * till we find a nonempty page.
	 */
	if (start > maxoff)
	{
		if (!_bt_step(scan, &buf, dir))
			return false;
		start = ItemPointerGetOffsetNumber(current);
		page = BufferGetPage(buf);
	}

	btitem = (BTItem) PageGetItem(page, PageGetItemId(page, start));
	itup = &(btitem->bti_itup);

	/* see if we picked a winner */
	if (_bt_checkkeys(scan, itup, dir, &continuescan))
	{
		/* yes, return it */
		scan->xs_ctup.t_self = itup->t_tid;
		res = true;
	}
	else if (continuescan)
	{
		/* no, but there might be another one that is */
		res = _bt_next(scan, dir);
	}
	else
	{
		/* no tuples in the index match this scan key */
		ItemPointerSetInvalid(current);
		so->btso_curbuf = InvalidBuffer;
		_bt_relbuf(rel, buf);
		res = false;
	}

	return res;
}
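
/*
 * Setting up the endpoint scan is mostly about picking the starting slot:
 * the first data key for a forward scan, the last item for a backward scan,
 * with a guard for a completely empty page.  In isolation, with toy values
 * rather than the real page macros:
 */
#include <stdbool.h>
#include <stdio.h>

#define FIRST_DATA_KEY 2		/* pretend slot 1 holds the high key */

/* Return the starting slot, or 0 if the page has no data items. */
static int
endpoint_start(bool forward, int maxoff)
{
	if (maxoff < FIRST_DATA_KEY)
		return 0;				/* empty page: caller must step to a neighbor */
	return forward ? FIRST_DATA_KEY : maxoff;
}

int
main(void)
{
	printf("forward: %d, backward: %d, empty: %d\n",
		   endpoint_start(true, 9), endpoint_start(false, 9),
		   endpoint_start(true, 1));
	return 0;
}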
Datum readindex(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; readindexinfo *info; Relation irel = NULL; Relation hrel = NULL; MIRROREDLOCK_BUFMGR_DECLARE; if (SRF_IS_FIRSTCALL()) { Oid irelid = PG_GETARG_OID(0); TupleDesc tupdesc; MemoryContext oldcontext; AttrNumber outattnum; TupleDesc itupdesc; int i; AttrNumber attno; irel = index_open(irelid, AccessShareLock); itupdesc = RelationGetDescr(irel); outattnum = FIXED_COLUMN + itupdesc->natts; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); tupdesc = CreateTemplateTupleDesc(outattnum, false); attno = 1; TupleDescInitEntry(tupdesc, attno++, "ictid", TIDOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "hctid", TIDOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "aotid", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "istatus", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, attno++, "hstatus", TEXTOID, -1, 0); for (i = 0; i < itupdesc->natts; i++) { Form_pg_attribute attr = itupdesc->attrs[i]; TupleDescInitEntry(tupdesc, attno++, NameStr(attr->attname), attr->atttypid, attr->atttypmod, 0); } funcctx->tuple_desc = BlessTupleDesc(tupdesc); info = (readindexinfo *) palloc(sizeof(readindexinfo)); funcctx->user_fctx = (void *) info; info->outattnum = outattnum; info->ireloid = irelid; hrel = relation_open(irel->rd_index->indrelid, AccessShareLock); if (hrel->rd_rel != NULL && (hrel->rd_rel->relstorage == 'a' || hrel->rd_rel->relstorage == 'c')) { relation_close(hrel, AccessShareLock); hrel = NULL; info->hreloid = InvalidOid; } else info->hreloid = irel->rd_index->indrelid; info->num_pages = RelationGetNumberOfBlocks(irel); info->blkno = BTREE_METAPAGE + 1; info->page = NULL; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); info = (readindexinfo *) funcctx->user_fctx; /* * Open the relations (on first call, we did that above already). * We unfortunately have to look up the relcache entry on every call, * because if we store it in the cross-call context, we won't get a * chance to release it if the function isn't run to completion, * e.g. because of a LIMIT clause. We only lock the relation on the * first call, and keep the lock until completion, however. */ if (!irel) irel = index_open(info->ireloid, NoLock); if (!hrel && info->hreloid != InvalidOid) hrel = heap_open(info->hreloid, NoLock); while (info->blkno < info->num_pages) { Datum values[255]; bool nulls[255]; ItemPointerData itid; HeapTuple tuple; Datum result; if (info->page == NULL) { Buffer buf; /* * Make copy of the page, because we cannot hold a buffer pin * across calls (we wouldn't have a chance to release it, if the * function isn't run to completion.) 
*/ info->page = palloc(BLCKSZ); MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBuffer(irel, info->blkno); memcpy(info->page, BufferGetPage(buf), BLCKSZ); ReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; info->opaque = (BTPageOpaque) PageGetSpecialPointer(info->page); info->minoff = P_FIRSTDATAKEY(info->opaque); info->maxoff = PageGetMaxOffsetNumber(info->page); info->offnum = info->minoff; } if (!P_ISLEAF(info->opaque) || info->offnum > info->maxoff) { pfree(info->page); info->page = NULL; info->blkno++; continue; } MemSet(nulls, false, info->outattnum * sizeof(bool)); ItemPointerSet(&itid, info->blkno, info->offnum); values[0] = ItemPointerGetDatum(&itid); readindextuple(info, irel, hrel, values, nulls); info->offnum = OffsetNumberNext(info->offnum); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); if (hrel != NULL) heap_close(hrel, NoLock); index_close(irel, NoLock); SRF_RETURN_NEXT(funcctx, result); } if (hrel != NULL) heap_close(hrel, AccessShareLock); index_close(irel, AccessShareLock); SRF_RETURN_DONE(funcctx); }
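
/*
 * Because a set-returning function can be abandoned before it finishes
 * (e.g. under a LIMIT), the state readindex keeps across calls is limited
 * to plain memory: a copied page image plus the current (block, offset)
 * position, with no buffer pins held.  The resumable-iterator shape of
 * that, outside the backend, with toy stand-in types:
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NBLOCKS 3
#define ITEMS_PER_BLOCK 4

/* Per-call state: a private copy of the current "page" and a position. */
typedef struct
{
	int			blkno;
	int			offnum;
	bool		have_page;
	int			page_copy[ITEMS_PER_BLOCK];
} ScanState;

static const int storage[NBLOCKS][ITEMS_PER_BLOCK] = {
	{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}
};

/* Return the next value, copying each "page" before reading from it. */
static bool
next_item(ScanState *st, int *value)
{
	while (st->blkno < NBLOCKS)
	{
		if (!st->have_page)
		{
			memcpy(st->page_copy, storage[st->blkno], sizeof(st->page_copy));
			st->have_page = true;
			st->offnum = 0;
		}
		if (st->offnum < ITEMS_PER_BLOCK)
		{
			*value = st->page_copy[st->offnum++];
			return true;
		}
		st->have_page = false;	/* page exhausted; move to the next block */
		st->blkno++;
	}
	return false;
}

int
main(void)
{
	ScanState	st = {0, 0, false, {0}};
	int			v;

	while (next_item(&st, &v))
		printf("%d ", v);
	printf("\n");
	return 0;
}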
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, List *updated_stats) { MIRROREDLOCK_BUFMGR_DECLARE; BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; int reindex_count = 1; PGRUsage ru0; /* Fetch gp_persistent_relation_node information that will be added to XLOG record. */ RelationFetchGpRelationNodeForXLog(onerel); pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->nonempty_pages = 0; lazy_space_alloc(vacrelstats, nblocks); for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; vacuum_delay_point(); /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; buf = ReadBufferWithStrategy(onerel, blkno, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); /* -------- MirroredLock ---------- */ MIRROREDLOCK_BUFMGR_LOCK; LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); /* must record in xlog so that changetracking will know about this change */ log_heap_newpage(onerel, page, blkno); empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); } MarkBufferDirty(buf); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } if (PageIsEmpty(page)) { empty_pages++; lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, false); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(onerel, tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. */ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. 
*/ nkeep += 1; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); tups_vacuumed += 1; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, &FreezeLimit, InvalidBuffer, false)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); /* no XLOG for temp tables, though */ if (!onerel->rd_istemp) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* Forget the now-vacuumed tuples, and press on */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) { lazy_record_free_space(vacrelstats, blkno, PageGetHeapFreeSpace(page)); } /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; UnlockReleaseBuffer(buf); MIRROREDLOCK_BUFMGR_UNLOCK; /* -------- MirroredLock ---------- */ } /* save stats for use later */ vacrelstats->rel_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); reindex_count++; /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats, updated_stats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages contain useful free space.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, vacrelstats->tot_free_pages, empty_pages, pg_rusage_show(&ru0)))); }
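/*
 * Added sketch: what lazy_record_dead_tuple() presumably does, inferred from
 * the num_dead_tuples / max_dead_tuples bookkeeping in lazy_scan_heap()
 * above -- append a copy of the TID to the array sized by
 * lazy_space_alloc().  The dead_tuples field name is an assumption, not
 * taken from the excerpt; the real definition lives in vacuumlazy.c.
 */
static void
lazy_record_dead_tuple_sketch(LVRelStats *vacrelstats, ItemPointer itemptr)
{
	/* Don't overrun the space reserved by lazy_space_alloc(). */
	if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
	{
		vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
		vacrelstats->num_dead_tuples++;
	}
}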
/* ---------------------------------------------------------------- * ExecInitFunctionScan * ---------------------------------------------------------------- */ FunctionScanState * ExecInitFunctionScan(FunctionScan *node, EState *estate, int eflags) { FunctionScanState *scanstate; RangeTblEntry *rte; Oid funcrettype; TypeFuncClass functypclass; TupleDesc tupdesc = NULL; /* * FunctionScan should not have any children. */ Assert(outerPlan(node) == NULL); Assert(innerPlan(node) == NULL); /* * create new ScanState for node */ scanstate = makeNode(FunctionScanState); scanstate->ss.ps.plan = (Plan *) node; scanstate->ss.ps.state = estate; /* * Miscellaneous initialization * * create expression context for node */ ExecAssignExprContext(estate, &scanstate->ss.ps); #define FUNCTIONSCAN_NSLOTS 2 /* * tuple table initialization */ ExecInitResultTupleSlot(estate, &scanstate->ss.ps); ExecInitScanTupleSlot(estate, &scanstate->ss); /* * initialize child expressions */ scanstate->ss.ps.targetlist = (List *) ExecInitExpr((Expr *) node->scan.plan.targetlist, (PlanState *) scanstate); scanstate->ss.ps.qual = (List *) ExecInitExpr((Expr *) node->scan.plan.qual, (PlanState *) scanstate); /* Check if targetlist or qual contains a var node referencing the ctid column */ scanstate->cdb_want_ctid = contain_ctid_var_reference(&node->scan); ItemPointerSet(&scanstate->cdb_fake_ctid, 0, 0); ItemPointerSet(&scanstate->cdb_mark_ctid, 0, 0); /* * get info about function */ rte = rt_fetch(node->scan.scanrelid, estate->es_range_table); Assert(rte->rtekind == RTE_FUNCTION); /* * Now determine if the function returns a simple or composite type, and * build an appropriate tupdesc. */ functypclass = get_expr_result_type(rte->funcexpr, &funcrettype, &tupdesc); if (functypclass == TYPEFUNC_COMPOSITE) { /* Composite data type, e.g. a table's row type */ Assert(tupdesc); /* Must copy it out of typcache for safety */ tupdesc = CreateTupleDescCopy(tupdesc); } else if (functypclass == TYPEFUNC_SCALAR) { /* Base data type, i.e. scalar */ char *attname = strVal(linitial(rte->eref->colnames)); tupdesc = CreateTemplateTupleDesc(1, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, attname, funcrettype, -1, 0); } else if (functypclass == TYPEFUNC_RECORD) { tupdesc = BuildDescFromLists(rte->eref->colnames, rte->funccoltypes, rte->funccoltypmods); } else { /* crummy error message, but parser should have caught this */ elog(ERROR, "function in FROM has unsupported return type"); } /* * For RECORD results, make sure a typmod has been assigned. (The * function should do this for itself, but let's cover things in case it * doesn't.) */ BlessTupleDesc(tupdesc); scanstate->tupdesc = tupdesc; ExecAssignScanType(&scanstate->ss, tupdesc); /* * Other node-specific setup */ scanstate->tuplestorestate = NULL; scanstate->funcexpr = ExecInitExpr((Expr *) rte->funcexpr, (PlanState *) scanstate); /* * Initialize result tuple type and projection info. */ ExecAssignResultTypeFromTL(&scanstate->ss.ps); ExecAssignScanProjectionInfo(&scanstate->ss); initGpmonPktForFunctionScan((Plan *)node, &scanstate->ss.ps.gpmon_pkt, estate); if (gp_resqueue_memory_policy != RESQUEUE_MEMORY_POLICY_NONE) { SPI_ReserveMemory(((Plan *)node)->operatorMemKB * 1024L); } return scanstate; }
/* * rtdosplit -- split a page in the tree. * * rtpicksplit does the interesting work of choosing the split. * This routine just does the bit-pushing. */ static void rtdosplit(Relation r, Buffer buffer, RTSTACK *stack, IndexTuple itup, RTSTATE *rtstate) { Page p; Buffer leftbuf, rightbuf; Page left, right; ItemId itemid; IndexTuple item; IndexTuple ltup, rtup; OffsetNumber maxoff; OffsetNumber i; OffsetNumber leftoff, rightoff; BlockNumber lbknum, rbknum; BlockNumber bufblock; RTreePageOpaque opaque; bool *isnull; SPLITVEC v; OffsetNumber *spl_left, *spl_right; TupleDesc tupDesc; int n; OffsetNumber newitemoff; p = (Page) BufferGetPage(buffer); opaque = (RTreePageOpaque) PageGetSpecialPointer(p); rtpicksplit(r, p, &v, itup, rtstate); /* * The root of the tree is the first block in the relation. If we're * about to split the root, we need to do some hocus-pocus to enforce this * guarantee. */ if (BufferGetBlockNumber(buffer) == P_ROOT) { leftbuf = ReadBuffer(r, P_NEW); RTInitBuffer(leftbuf, opaque->flags); lbknum = BufferGetBlockNumber(leftbuf); left = (Page) BufferGetPage(leftbuf); } else { leftbuf = buffer; IncrBufferRefCount(buffer); lbknum = BufferGetBlockNumber(buffer); left = (Page) PageGetTempPage(p, sizeof(RTreePageOpaqueData)); } rightbuf = ReadBuffer(r, P_NEW); RTInitBuffer(rightbuf, opaque->flags); rbknum = BufferGetBlockNumber(rightbuf); right = (Page) BufferGetPage(rightbuf); spl_left = v.spl_left; spl_right = v.spl_right; leftoff = rightoff = FirstOffsetNumber; maxoff = PageGetMaxOffsetNumber(p); newitemoff = OffsetNumberNext(maxoff); /* * spl_left contains a list of the offset numbers of the tuples that will * go to the left page. For each offset number, get the tuple item, then * add the item to the left page. Similarly for the right side. */ /* fill left node */ for (n = 0; n < v.spl_nleft; n++) { i = *spl_left; if (i == newitemoff) item = itup; else { itemid = PageGetItemId(p, i); item = (IndexTuple) PageGetItem(p, itemid); } if (PageAddItem(left, (Item) item, IndexTupleSize(item), leftoff, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(r)); leftoff = OffsetNumberNext(leftoff); spl_left++; /* advance in left split vector */ } /* fill right node */ for (n = 0; n < v.spl_nright; n++) { i = *spl_right; if (i == newitemoff) item = itup; else { itemid = PageGetItemId(p, i); item = (IndexTuple) PageGetItem(p, itemid); } if (PageAddItem(right, (Item) item, IndexTupleSize(item), rightoff, LP_USED) == InvalidOffsetNumber) elog(ERROR, "failed to add index item to \"%s\"", RelationGetRelationName(r)); rightoff = OffsetNumberNext(rightoff); spl_right++; /* advance in right split vector */ } /* Make sure we consumed all of the split vectors, and release 'em */ Assert(*spl_left == InvalidOffsetNumber); Assert(*spl_right == InvalidOffsetNumber); pfree(v.spl_left); pfree(v.spl_right); if ((bufblock = BufferGetBlockNumber(buffer)) != P_ROOT) PageRestoreTempPage(left, p); WriteBuffer(leftbuf); WriteBuffer(rightbuf); /* * Okay, the page is split. We have three things left to do: * * 1) Adjust any active scans on this index to cope with changes we * introduced in its structure by splitting this page. * * 2) "Tighten" the bounding box of the pointer to the left page in the * parent node in the tree, if any. Since we moved a bunch of stuff off * the left page, we expect it to get smaller. This happens in the * internal insertion routine. * * 3) Insert a pointer to the right page in the parent. This may cause * the parent to split. 
If it does, we need to repeat steps one and two * for each split node in the tree. */ /* adjust active scans */ rtadjscans(r, RTOP_SPLIT, bufblock, FirstOffsetNumber); tupDesc = r->rd_att; isnull = (bool *) palloc(r->rd_rel->relnatts * sizeof(bool)); memset(isnull, false, r->rd_rel->relnatts * sizeof(bool)); ltup = index_form_tuple(tupDesc, &(v.spl_ldatum), isnull); rtup = index_form_tuple(tupDesc, &(v.spl_rdatum), isnull); pfree(isnull); pfree(DatumGetPointer(v.spl_ldatum)); pfree(DatumGetPointer(v.spl_rdatum)); /* set pointers to new child pages in the internal index tuples */ ItemPointerSet(&(ltup->t_tid), lbknum, 1); ItemPointerSet(&(rtup->t_tid), rbknum, 1); rtintinsert(r, stack, ltup, rtup, rtstate); pfree(ltup); pfree(rtup); }
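/*
 * Added sketch: the internal entries built above carry no heap TID at all;
 * ItemPointerSet() is reused to stash the child page's block number with a
 * dummy offset of 1.  A reader recovers the downlink like this (helper name
 * invented for illustration):
 */
static BlockNumber
rt_downlink_block(IndexTuple itup)
{
	return ItemPointerGetBlockNumber(&(itup->t_tid));
}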
/* ---------------------------------------------------------------- * BitmapHeapNext * * Retrieve next tuple from the BitmapHeapScan node's currentRelation * ---------------------------------------------------------------- */ static TupleTableSlot * BitmapHeapNext(BitmapHeapScanState *node) { EState *estate; ExprContext *econtext; HeapScanDesc scan; Index scanrelid; TIDBitmap *tbm; TBMIterateResult *tbmres; OffsetNumber targoffset; TupleTableSlot *slot; /* * extract necessary information from index scan node */ estate = node->ss.ps.state; econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; scan = node->ss.ss_currentScanDesc; scanrelid = ((BitmapHeapScan *) node->ss.ps.plan)->scan.scanrelid; tbm = node->tbm; tbmres = node->tbmres; /* * Check if we are evaluating PlanQual for tuple of this relation. * Additional checking is not good, but no other way for now. We could * introduce new nodes for this case and handle IndexScan --> NewNode * switching in Init/ReScan plan... */ if (estate->es_evTuple != NULL && estate->es_evTuple[scanrelid - 1] != NULL) { if (estate->es_evTupleNull[scanrelid - 1]) return ExecClearTuple(slot); ExecStoreTuple(estate->es_evTuple[scanrelid - 1], slot, InvalidBuffer, false); /* Does the tuple meet the original qual conditions? */ econtext->ecxt_scantuple = slot; ResetExprContext(econtext); if (!ExecQual(node->bitmapqualorig, econtext, false)) ExecClearTuple(slot); /* would not be returned by scan */ /* Flag for the next call that no more tuples */ estate->es_evTupleNull[scanrelid - 1] = true; return slot; } /* * If we haven't yet performed the underlying index scan, do it, and * prepare the bitmap to be iterated over. */ if (tbm == NULL) { tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node)); if (!tbm || !IsA(tbm, TIDBitmap)) elog(ERROR, "unrecognized result from subplan"); node->tbm = tbm; node->tbmres = tbmres = NULL; tbm_begin_iterate(tbm); } for (;;) { Page dp; ItemId lp; /* * Get next page of results if needed */ if (tbmres == NULL) { node->tbmres = tbmres = tbm_iterate(tbm); if (tbmres == NULL) { /* no more entries in the bitmap */ break; } /* * Ignore any claimed entries past what we think is the end of the * relation. (This is probably not necessary given that we got at * least AccessShareLock on the table before performing any of the * indexscans, but let's be safe.) */ if (tbmres->blockno >= scan->rs_nblocks) { node->tbmres = tbmres = NULL; continue; } /* * Fetch the current heap page and identify candidate tuples. */ bitgetpage(scan, tbmres); /* * Set rs_cindex to first slot to examine */ scan->rs_cindex = 0; } else { /* * Continuing in previously obtained page; advance rs_cindex */ scan->rs_cindex++; } /* * Out of range? If so, nothing more to look at on this page */ if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples) { node->tbmres = tbmres = NULL; continue; } /* * Okay to fetch the tuple */ targoffset = scan->rs_vistuples[scan->rs_cindex]; dp = (Page) BufferGetPage(scan->rs_cbuf); lp = PageGetItemId(dp, targoffset); Assert(ItemIdIsNormal(lp)); scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); scan->rs_ctup.t_len = ItemIdGetLength(lp); ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset); pgstat_count_heap_fetch(scan->rs_rd); /* * Set up the result slot to point to this tuple. Note that the slot * acquires a pin on the buffer. */ ExecStoreTuple(&scan->rs_ctup, slot, scan->rs_cbuf, false); /* * If we are using lossy info, we have to recheck the qual conditions * at every tuple. 
*/ if (tbmres->ntuples < 0) { econtext->ecxt_scantuple = slot; ResetExprContext(econtext); if (!ExecQual(node->bitmapqualorig, econtext, false)) { /* Fails recheck, so drop it and loop back for another */ ExecClearTuple(slot); continue; } } /* OK to return this tuple */ return slot; } /* * if we get here it means we are at the end of the scan.. */ return ExecClearTuple(slot); }
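/*
 * Added sketch: in this older TIDBitmap API a negative ntuples count marks a
 * lossy page, which is why the loop above re-evaluates bitmapqualorig for
 * every tuple fetched from such a page.  (The later version of this routine,
 * further below, uses an explicit tbmres->recheck flag instead.)  Helper
 * name invented for illustration.
 */
static bool
tbm_page_is_lossy(const TBMIterateResult *tbmres)
{
	return tbmres->ntuples < 0;
}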
/* * This function takes an already open relation and scans its pages, * skipping those that have the corresponding visibility map bit set. * For pages we skip, we find the free space from the free space map * and approximate tuple_len on that basis. For the others, we count * the exact number of dead tuples etc. * * This scan is loosely based on vacuumlazy.c:lazy_scan_heap(), but * we do not try to avoid skipping single pages. */ static void statapprox_heap(Relation rel, output_type *stat) { BlockNumber scanned, nblocks, blkno; Buffer vmbuffer = InvalidBuffer; BufferAccessStrategy bstrategy; TransactionId OldestXmin; uint64 misc_count = 0; OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); bstrategy = GetAccessStrategy(BAS_BULKREAD); nblocks = RelationGetNumberOfBlocks(rel); scanned = 0; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; Size freespace; CHECK_FOR_INTERRUPTS(); /* * If the page has only visible tuples, then we can find out the free * space from the FSM and move on. */ if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) { freespace = GetRecordedFreeSpace(rel, blkno); stat->tuple_len += BLCKSZ - freespace; stat->free_space += freespace; continue; } buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); /* * It's not safe to call PageGetHeapFreeSpace() on new pages, so we * treat them as being free space for our purposes. */ if (!PageIsNew(page)) stat->free_space += PageGetHeapFreeSpace(page); else stat->free_space += BLCKSZ - SizeOfPageHeaderData; if (PageIsNew(page) || PageIsEmpty(page)) { UnlockReleaseBuffer(buf); continue; } scanned++; /* * Look at each tuple on the page and decide whether it's live or * dead, then count it and its size. Unlike lazy_scan_heap, we can * afford to ignore problems and special cases. */ maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; HeapTupleData tuple; itemid = PageGetItemId(page, offnum); if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) || ItemIdIsDead(itemid)) { continue; } Assert(ItemIdIsNormal(itemid)); ItemPointerSet(&(tuple.t_self), blkno, offnum); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = RelationGetRelid(rel); /* * We count live and dead tuples, but we also need to add up * others in order to feed vac_estimate_reltuples. */ switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf)) { case HEAPTUPLE_RECENTLY_DEAD: misc_count++; /* Fall through */ case HEAPTUPLE_DEAD: stat->dead_tuple_len += tuple.t_len; stat->dead_tuple_count++; break; case HEAPTUPLE_LIVE: stat->tuple_len += tuple.t_len; stat->tuple_count++; break; case HEAPTUPLE_INSERT_IN_PROGRESS: case HEAPTUPLE_DELETE_IN_PROGRESS: misc_count++; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } } UnlockReleaseBuffer(buf); } stat->table_len = (uint64) nblocks *BLCKSZ; stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned, stat->tuple_count + misc_count); /* * Calculate percentages if the relation has one or more pages. 
*/ if (nblocks != 0) { stat->scanned_percent = 100 * scanned / nblocks; stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len; stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; stat->free_percent = 100.0 * stat->free_space / stat->table_len; } if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } }
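/*
 * Added note with a worked example of the percentage math above (sample
 * values invented): scanned_percent is computed in integer arithmetic and
 * truncates, e.g. scanned = 127 of nblocks = 1000 gives 100 * 127 / 1000
 * = 12, not 12.7.  The length-based percentages use double arithmetic,
 * e.g. tuple_len = 3,000,000 bytes against table_len = 1000 * 8192 =
 * 8,192,000 bytes gives 100.0 * 3000000 / 8192000 = 36.62.
 */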
/* * bitgetpage - subroutine for BitmapHeapNext() * * This routine reads and pins the specified page of the relation, then * builds an array indicating which tuples on the page are both potentially * interesting according to the bitmap, and visible according to the snapshot. */ static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres) { BlockNumber page = tbmres->blockno; Buffer buffer; Snapshot snapshot; int ntup; /* * Acquire pin on the target heap page, trading in any pin we held before. */ Assert(page < scan->rs_nblocks); scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf, scan->rs_rd, page); buffer = scan->rs_cbuf; snapshot = scan->rs_snapshot; ntup = 0; /* * Prune and repair fragmentation for the whole page, if possible. */ Assert(TransactionIdIsValid(RecentGlobalXmin)); heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin); /* * We must hold share lock on the buffer content while examining tuple * visibility. Afterwards, however, the tuples we have found to be * visible are guaranteed good as long as we hold the buffer pin. */ LockBuffer(buffer, BUFFER_LOCK_SHARE); /* * We need two separate strategies for lossy and non-lossy cases. */ if (tbmres->ntuples >= 0) { /* * Bitmap is non-lossy, so we just look through the offsets listed in * tbmres; but we have to follow any HOT chain starting at each such * offset. */ int curslot; for (curslot = 0; curslot < tbmres->ntuples; curslot++) { OffsetNumber offnum = tbmres->offsets[curslot]; ItemPointerData tid; ItemPointerSet(&tid, page, offnum); if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot, NULL)) scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid); } } else { /* * Bitmap is lossy, so we must examine each item pointer on the page. * But we can ignore HOT chains, since we'll check each tuple anyway. */ Page dp = (Page) BufferGetPage(buffer); OffsetNumber maxoff = PageGetMaxOffsetNumber(dp); OffsetNumber offnum; for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId lp; HeapTupleData loctup; bool valid; lp = PageGetItemId(dp, offnum); if (!ItemIdIsNormal(lp)) continue; loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); loctup.t_len = ItemIdGetLength(lp); loctup.t_tableOid = scan->rs_rd->rd_id; ItemPointerSet(&loctup.t_self, page, offnum); valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer); if (valid) { scan->rs_vistuples[ntup++] = offnum; PredicateLockTuple(scan->rs_rd, &loctup, snapshot); } CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer, snapshot); } } LockBuffer(buffer, BUFFER_LOCK_UNLOCK); Assert(ntup <= MaxHeapTuplesPerPage); scan->rs_ntuples = ntup; }
/* * _hash_step() -- step to the next valid item in a scan in the bucket. * * If no valid record exists in the requested direction, return * false. Else, return true and set the CurrentItemData for the * scan to the right thing. * * 'bufP' points to the current buffer, which is pinned and read-locked. * On success exit, we have pin and read-lock on whichever page * contains the right item; on failure, we have released all buffers. */ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; ItemPointer current; Buffer buf; Page page; HashPageOpaque opaque; OffsetNumber maxoff; OffsetNumber offnum; BlockNumber blkno; IndexTuple itup; current = &(scan->currentItemData); buf = *bufP; _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* * If _hash_step is called from _hash_first, current will not be valid, so * we can't dereference it. However, in that case, we presumably want to * start at the beginning/end of the page... */ maxoff = PageGetMaxOffsetNumber(page); if (ItemPointerIsValid(current)) offnum = ItemPointerGetOffsetNumber(current); else offnum = InvalidOffsetNumber; /* * 'offnum' now points to the last tuple we examined (if any). * * continue to step through tuples until: 1) we get to the end of the * bucket chain or 2) we find a valid tuple. */ do { switch (dir) { case ForwardScanDirection: if (offnum != InvalidOffsetNumber) offnum = OffsetNumberNext(offnum); /* move forward */ else offnum = FirstOffsetNumber; /* new page */ while (offnum > maxoff) { /* * either this page is empty (maxoff == * InvalidOffsetNumber) or we ran off the end. */ _hash_readnext(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) { maxoff = PageGetMaxOffsetNumber(page); offnum = FirstOffsetNumber; } else { /* end of bucket */ maxoff = offnum = InvalidOffsetNumber; break; /* exit while */ } } break; case BackwardScanDirection: if (offnum != InvalidOffsetNumber) offnum = OffsetNumberPrev(offnum); /* move back */ else offnum = maxoff; /* new page */ while (offnum < FirstOffsetNumber) { /* * either this page is empty (offnum == * InvalidOffsetNumber) or we ran off the end. */ _hash_readprev(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) maxoff = offnum = PageGetMaxOffsetNumber(page); else { /* end of bucket */ maxoff = offnum = InvalidOffsetNumber; break; /* exit while */ } } break; default: /* NoMovementScanDirection */ /* this should not be reached */ break; } /* we ran off the end of the world without finding a match */ if (offnum == InvalidOffsetNumber) { /* we ran off the end of the bucket without finding a match */ *bufP = so->hashso_curbuf = InvalidBuffer; ItemPointerSetInvalid(current); return false; } /* get ready to check this tuple */ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); } while (!_hash_checkqual(scan, itup)); /* if we made it to here, we've found a valid tuple */ blkno = BufferGetBlockNumber(buf); *bufP = so->hashso_curbuf = buf; ItemPointerSet(current, blkno, offnum); return true; }
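/*
 * Added sketch (not the real _hash_next()): how a caller would typically
 * consume the position _hash_step() leaves in scan->currentItemData, turning
 * it back into the index tuple and the heap TID it carries.  Assumes the
 * buffer returned through *bufP is still pinned and read-locked; helper name
 * invented for illustration.
 */
static bool
hash_fetch_current_sketch(IndexScanDesc scan, Buffer buf)
{
	ItemPointer current = &(scan->currentItemData);
	Page		page;
	IndexTuple	itup;

	if (!ItemPointerIsValid(current))
		return false;

	page = BufferGetPage(buf);
	itup = (IndexTuple) PageGetItem(page,
						PageGetItemId(page,
									  ItemPointerGetOffsetNumber(current)));
	scan->xs_ctup.t_self = itup->t_tid; /* heap TID handed to the executor */
	return true;
}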
/* ---------------------------------------------------------------- * BitmapHeapNext * * Retrieve next tuple from the BitmapHeapScan node's currentRelation * ---------------------------------------------------------------- */ static TupleTableSlot * BitmapHeapNext(BitmapHeapScanState *node) { ExprContext *econtext; HeapScanDesc scan; TIDBitmap *tbm; TBMIterator *tbmiterator; TBMIterateResult *tbmres; TBMIterator *prefetch_iterator; OffsetNumber targoffset; TupleTableSlot *slot; /* * extract necessary information from index scan node */ econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; scan = node->ss.ss_currentScanDesc; tbm = node->tbm; tbmiterator = node->tbmiterator; tbmres = node->tbmres; prefetch_iterator = node->prefetch_iterator; /* * If we haven't yet performed the underlying index scan, do it, and begin * the iteration over the bitmap. * * For prefetching, we use *two* iterators, one for the pages we are * actually scanning and another that runs ahead of the first for * prefetching. node->prefetch_pages tracks exactly how many pages ahead * the prefetch iterator is. Also, node->prefetch_target tracks the * desired prefetch distance, which starts small and increases up to the * GUC-controlled maximum, target_prefetch_pages. This is to avoid doing * a lot of prefetching in a scan that stops after a few tuples because of * a LIMIT. */ if (tbm == NULL) { tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node)); if (!tbm || !IsA(tbm, TIDBitmap)) elog(ERROR, "unrecognized result from subplan"); node->tbm = tbm; node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm); node->tbmres = tbmres = NULL; #ifdef USE_PREFETCH if (target_prefetch_pages > 0) { node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm); node->prefetch_pages = 0; node->prefetch_target = -1; } #endif /* USE_PREFETCH */ } for (;;) { Page dp; ItemId lp; /* * Get next page of results if needed */ if (tbmres == NULL) { node->tbmres = tbmres = tbm_iterate(tbmiterator); if (tbmres == NULL) { /* no more entries in the bitmap */ break; } #ifdef USE_PREFETCH if (node->prefetch_pages > 0) { /* The main iterator has closed the distance by one page */ node->prefetch_pages--; } else if (prefetch_iterator) { /* Do not let the prefetch iterator get behind the main one */ TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) elog(ERROR, "prefetch and main iterators are out of sync"); } #endif /* USE_PREFETCH */ /* * Ignore any claimed entries past what we think is the end of the * relation. (This is probably not necessary given that we got at * least AccessShareLock on the table before performing any of the * indexscans, but let's be safe.) */ if (tbmres->blockno >= scan->rs_nblocks) { node->tbmres = tbmres = NULL; continue; } /* * Fetch the current heap page and identify candidate tuples. */ bitgetpage(scan, tbmres); /* * Set rs_cindex to first slot to examine */ scan->rs_cindex = 0; #ifdef USE_PREFETCH /* * Increase prefetch target if it's not yet at the max. Note that * we will increase it to zero after fetching the very first * page/tuple, then to one after the second tuple is fetched, then * it doubles as later pages are fetched. 
*/ if (node->prefetch_target >= target_prefetch_pages) /* don't increase any further */ ; else if (node->prefetch_target >= target_prefetch_pages / 2) node->prefetch_target = target_prefetch_pages; else if (node->prefetch_target > 0) node->prefetch_target *= 2; else node->prefetch_target++; #endif /* USE_PREFETCH */ } else { /* * Continuing in previously obtained page; advance rs_cindex */ scan->rs_cindex++; #ifdef USE_PREFETCH /* * Try to prefetch at least a few pages even before we get to the * second page if we don't stop reading after the first tuple. */ if (node->prefetch_target < target_prefetch_pages) node->prefetch_target++; #endif /* USE_PREFETCH */ } /* * Out of range? If so, nothing more to look at on this page */ if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples) { node->tbmres = tbmres = NULL; continue; } #ifdef USE_PREFETCH /* * We issue prefetch requests *after* fetching the current page to try * to avoid having prefetching interfere with the main I/O. Also, this * should happen only when we have determined there is still something * to do on the current page, else we may uselessly prefetch the same * page we are just about to request for real. */ if (prefetch_iterator) { while (node->prefetch_pages < node->prefetch_target) { TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); if (tbmpre == NULL) { /* No more pages to prefetch */ tbm_end_iterate(prefetch_iterator); node->prefetch_iterator = prefetch_iterator = NULL; break; } node->prefetch_pages++; PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } #endif /* USE_PREFETCH */ /* * Okay to fetch the tuple */ targoffset = scan->rs_vistuples[scan->rs_cindex]; dp = (Page) BufferGetPage(scan->rs_cbuf); lp = PageGetItemId(dp, targoffset); Assert(ItemIdIsNormal(lp)); scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); scan->rs_ctup.t_len = ItemIdGetLength(lp); ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset); pgstat_count_heap_fetch(scan->rs_rd); /* * Set up the result slot to point to this tuple. Note that the slot * acquires a pin on the buffer. */ ExecStoreTuple(&scan->rs_ctup, slot, scan->rs_cbuf, false); /* * If we are using lossy info, we have to recheck the qual conditions * at every tuple. */ if (tbmres->recheck) { econtext->ecxt_scantuple = slot; ResetExprContext(econtext); if (!ExecQual(node->bitmapqualorig, econtext, false)) { /* Fails recheck, so drop it and loop back for another */ ExecClearTuple(slot); continue; } } /* OK to return this tuple */ return slot; } /* * if we get here it means we are at the end of the scan.. */ return ExecClearTuple(slot); }
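/*
 * Added sketch: the prefetch-target ramp coded above, pulled out as a pure
 * function to make the growth sequence visible -- it starts at -1, moves to
 * 0, then 1, then doubles until it is pinned at target_prefetch_pages.
 * Helper name invented for illustration.
 */
static int
advance_prefetch_target(int prefetch_target, int target_prefetch_pages)
{
	if (prefetch_target >= target_prefetch_pages)
		return prefetch_target; /* don't increase any further */
	else if (prefetch_target >= target_prefetch_pages / 2)
		return target_prefetch_pages;
	else if (prefetch_target > 0)
		return prefetch_target * 2;
	else
		return prefetch_target + 1;
}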
/* * _bt_first() -- Find the first item in a scan. * * We need to be clever about the type of scan, the operation it's * performing, and the tree ordering. We find the * first item in the tree that satisfies the qualification * associated with the scan descriptor. On exit, the page containing * the current index tuple is read locked and pinned, and the scan's * opaque data entry is updated to include the buffer. */ bool _bt_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Buffer buf; Page page; BTStack stack; OffsetNumber offnum; BTItem btitem; IndexTuple itup; ItemPointer current; BlockNumber blkno; StrategyNumber strat; bool res; int32 result; bool scanFromEnd; bool continuescan; ScanKey scankeys = NULL; int keysCount = 0; int *nKeyIs = NULL; int i, j; StrategyNumber strat_total; /* * Order the scan keys in our canonical fashion and eliminate any * redundant keys. */ _bt_orderkeys(scan); /* * Quit now if _bt_orderkeys() discovered that the scan keys can never * be satisfied (eg, x == 1 AND x > 2). */ if (!so->qual_ok) return false; /* * Examine the scan keys to discover where we need to start the scan. */ scanFromEnd = false; strat_total = BTEqualStrategyNumber; if (so->numberOfKeys > 0) { nKeyIs = (int *) palloc(so->numberOfKeys * sizeof(int)); for (i = 0; i < so->numberOfKeys; i++) { AttrNumber attno = so->keyData[i].sk_attno; /* ignore keys for already-determined attrs */ if (attno <= keysCount) continue; /* if we didn't find a boundary for the preceding attr, quit */ if (attno > keysCount + 1) break; strat = _bt_getstrat(rel, attno, so->keyData[i].sk_procedure); /* * Can we use this key as a starting boundary for this attr? * * We can use multiple keys if they look like, say, = >= = but we * have to stop after accepting a > or < boundary. */ if (strat == strat_total || strat == BTEqualStrategyNumber) nKeyIs[keysCount++] = i; else if (ScanDirectionIsBackward(dir) && (strat == BTLessStrategyNumber || strat == BTLessEqualStrategyNumber)) { nKeyIs[keysCount++] = i; strat_total = strat; if (strat == BTLessStrategyNumber) break; } else if (ScanDirectionIsForward(dir) && (strat == BTGreaterStrategyNumber || strat == BTGreaterEqualStrategyNumber)) { nKeyIs[keysCount++] = i; strat_total = strat; if (strat == BTGreaterStrategyNumber) break; } } if (keysCount == 0) scanFromEnd = true; } else scanFromEnd = true; /* if we just need to walk down one edge of the tree, do that */ if (scanFromEnd) { if (nKeyIs) pfree(nKeyIs); return _bt_endpoint(scan, dir); } /* * We want to start the scan somewhere within the index. Set up a * scankey we can use to search for the correct starting point. */ scankeys = (ScanKey) palloc(keysCount * sizeof(ScanKeyData)); for (i = 0; i < keysCount; i++) { FmgrInfo *procinfo; j = nKeyIs[i]; /* * _bt_orderkeys disallows it, but it's place to add some code * later */ if (so->keyData[j].sk_flags & SK_ISNULL) { pfree(nKeyIs); pfree(scankeys); elog(ERROR, "btree doesn't support is(not)null, yet"); return false; } procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); ScanKeyEntryInitializeWithInfo(scankeys + i, so->keyData[j].sk_flags, i + 1, procinfo, CurrentMemoryContext, so->keyData[j].sk_argument); } if (nKeyIs) pfree(nKeyIs); current = &(scan->currentItemData); /* * Use the manufactured scan key to descend the tree and position * ourselves on the target leaf page. */ stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ); /* don't need to keep the stack around... 
*/ _bt_freestack(stack); if (!BufferIsValid(buf)) { /* Only get here if index is completely empty */ ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; pfree(scankeys); return false; } /* remember which buffer we have pinned */ so->btso_curbuf = buf; blkno = BufferGetBlockNumber(buf); page = BufferGetPage(buf); /* position to the precise item on the page */ offnum = _bt_binsrch(rel, buf, keysCount, scankeys); ItemPointerSet(current, blkno, offnum); /* * At this point we are positioned at the first item >= scan key, or * possibly at the end of a page on which all the existing items are * less than the scan key and we know that everything on later pages * is greater than or equal to scan key. * * We could step forward in the latter case, but that'd be a waste of * time if we want to scan backwards. So, it's now time to examine * the scan strategy to find the exact place to start the scan. * * Note: if _bt_step fails (meaning we fell off the end of the index in * one direction or the other), we either return false (no matches) or * call _bt_endpoint() to set up a scan starting at that index * endpoint, as appropriate for the desired scan type. * * it's yet other place to add some code later for is(not)null ... */ switch (strat_total) { case BTLessStrategyNumber: /* * Back up one to arrive at last item < scankey */ if (!_bt_step(scan, &buf, BackwardScanDirection)) { pfree(scankeys); return false; } break; case BTLessEqualStrategyNumber: /* * We need to find the last item <= scankey, so step forward * till we find one > scankey, then step back one. */ if (offnum > PageGetMaxOffsetNumber(page)) { if (!_bt_step(scan, &buf, ForwardScanDirection)) { pfree(scankeys); return _bt_endpoint(scan, dir); } } for (;;) { offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); result = _bt_compare(rel, keysCount, scankeys, page, offnum); if (result < 0) break; if (!_bt_step(scan, &buf, ForwardScanDirection)) { pfree(scankeys); return _bt_endpoint(scan, dir); } } if (!_bt_step(scan, &buf, BackwardScanDirection)) { pfree(scankeys); return false; } break; case BTEqualStrategyNumber: /* * Make sure we are on the first equal item; might have to * step forward if currently at end of page. */ if (offnum > PageGetMaxOffsetNumber(page)) { if (!_bt_step(scan, &buf, ForwardScanDirection)) { pfree(scankeys); return false; } offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); } result = _bt_compare(rel, keysCount, scankeys, page, offnum); if (result != 0) goto nomatches; /* no equal items! */ /* * If a backward scan was specified, need to start with last * equal item not first one. */ if (ScanDirectionIsBackward(dir)) { do { if (!_bt_step(scan, &buf, ForwardScanDirection)) { pfree(scankeys); return _bt_endpoint(scan, dir); } offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); result = _bt_compare(rel, keysCount, scankeys, page, offnum); } while (result == 0); if (!_bt_step(scan, &buf, BackwardScanDirection)) elog(ERROR, "equal items disappeared?"); } break; case BTGreaterEqualStrategyNumber: /* * We want the first item >= scankey, which is where we are... * unless we're not anywhere at all... */ if (offnum > PageGetMaxOffsetNumber(page)) { if (!_bt_step(scan, &buf, ForwardScanDirection)) { pfree(scankeys); return false; } } break; case BTGreaterStrategyNumber: /* * We want the first item > scankey, so make sure we are on an * item and then step over any equal items. 
*/ if (offnum > PageGetMaxOffsetNumber(page)) { if (!_bt_step(scan, &buf, ForwardScanDirection)) { pfree(scankeys); return false; } offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); } result = _bt_compare(rel, keysCount, scankeys, page, offnum); while (result == 0) { if (!_bt_step(scan, &buf, ForwardScanDirection)) { pfree(scankeys); return false; } offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); result = _bt_compare(rel, keysCount, scankeys, page, offnum); } break; } /* okay, current item pointer for the scan is right */ offnum = ItemPointerGetOffsetNumber(current); page = BufferGetPage(buf); btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); itup = &btitem->bti_itup; /* is the first item actually acceptable? */ if (_bt_checkkeys(scan, itup, dir, &continuescan)) { /* yes, return it */ scan->xs_ctup.t_self = itup->t_tid; res = true; } else if (continuescan) { /* no, but there might be another one that is */ res = _bt_next(scan, dir); } else { /* no tuples in the index match this scan key */ nomatches: ItemPointerSetInvalid(current); so->btso_curbuf = InvalidBuffer; _bt_relbuf(rel, buf); res = false; } pfree(scankeys); return res; }
/* * _hash_step() -- step to the next valid item in a scan in the bucket. * * If no valid record exists in the requested direction, return * false. Else, return true and set the hashso_curpos for the * scan to the right thing. * * 'bufP' points to the current buffer, which is pinned and read-locked. * On success exit, we have pin and read-lock on whichever page * contains the right item; on failure, we have released all buffers. */ bool _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) { Relation rel = scan->indexRelation; HashScanOpaque so = (HashScanOpaque) scan->opaque; ItemPointer current; Buffer buf; Page page; HashPageOpaque opaque; OffsetNumber maxoff; OffsetNumber offnum; BlockNumber blkno; IndexTuple itup; current = &(so->hashso_curpos); buf = *bufP; _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); page = BufferGetPage(buf); opaque = (HashPageOpaque) PageGetSpecialPointer(page); /* * If _hash_step is called from _hash_first, current will not be valid, so * we can't dereference it. However, in that case, we presumably want to * start at the beginning/end of the page... */ maxoff = PageGetMaxOffsetNumber(page); if (ItemPointerIsValid(current)) offnum = ItemPointerGetOffsetNumber(current); else offnum = InvalidOffsetNumber; /* * 'offnum' now points to the last tuple we examined (if any). * * continue to step through tuples until: 1) we get to the end of the * bucket chain or 2) we find a valid tuple. */ do { switch (dir) { case ForwardScanDirection: if (offnum != InvalidOffsetNumber) offnum = OffsetNumberNext(offnum); /* move forward */ else { /* new page, locate starting position by binary search */ offnum = _hash_binsearch(page, so->hashso_sk_hash); } for (;;) { /* * check if we're still in the range of items with the * target hash key */ if (offnum <= maxoff) { Assert(offnum >= FirstOffsetNumber); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } /* * ran off the end of this page, try the next */ _hash_readnext(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) { maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch(page, so->hashso_sk_hash); } else { /* end of bucket */ itup = NULL; break; /* exit for-loop */ } } break; case BackwardScanDirection: if (offnum != InvalidOffsetNumber) offnum = OffsetNumberPrev(offnum); /* move back */ else { /* new page, locate starting position by binary search */ offnum = _hash_binsearch_last(page, so->hashso_sk_hash); } for (;;) { /* * check if we're still in the range of items with the * target hash key */ if (offnum >= FirstOffsetNumber) { Assert(offnum <= maxoff); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup)) break; /* yes, so exit for-loop */ } /* * ran off the end of this page, try the next */ _hash_readprev(rel, &buf, &page, &opaque); if (BufferIsValid(buf)) { maxoff = PageGetMaxOffsetNumber(page); offnum = _hash_binsearch_last(page, so->hashso_sk_hash); } else { /* end of bucket */ itup = NULL; break; /* exit for-loop */ } } break; default: /* NoMovementScanDirection */ /* this should not be reached */ itup = NULL; break; } if (itup == NULL) { /* we ran off the end of the bucket without finding a match */ *bufP = so->hashso_curbuf = InvalidBuffer; ItemPointerSetInvalid(current); return false; } /* check the tuple quals, loop around if not met */ } while (!_hash_checkqual(scan, itup)); /* if we made it to 
here, we've found a valid tuple */ blkno = BufferGetBlockNumber(buf); *bufP = so->hashso_curbuf = buf; ItemPointerSet(current, blkno, offnum); return true; }
/*---------- * Add an item to a disk page from the sort output. * * We must be careful to observe the page layout conventions of nbtsearch.c: * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. * - on non-leaf pages, the key portion of the first item need not be * stored, we should store only the link. * * A leaf page being built looks like: * * +----------------+---------------------------------+ * | PageHeaderData | linp0 linp1 linp2 ... | * +-----------+----+---------------------------------+ * | ... linpN | | * +-----------+--------------------------------------+ * | ^ last | * | | * +-------------+------------------------------------+ * | | itemN ... | * +-------------+------------------+-----------------+ * | ... item3 item2 item1 | "special space" | * +--------------------------------+-----------------+ * * Contrast this with the diagram in bufpage.h; note the mismatch * between linps and items. This is because we reserve linp0 as a * placeholder for the pointer to the "high key" item; when we have * filled up the page, we will set linp0 to point to itemN and clear * linpN. On the other hand, if we find this is the last (rightmost) * page, we leave the items alone and slide the linp array over. * * 'last' pointer indicates the last offset added to the page. *---------- */ static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup) { Page npage; BlockNumber nblkno; OffsetNumber last_off; Size pgspc; Size itupsz; /* * This is a handy place to check for cancel interrupts during the btree * load phase of index creation. */ CHECK_FOR_INTERRUPTS(); npage = state->btps_page; nblkno = state->btps_blkno; last_off = state->btps_lastoff; pgspc = PageGetFreeSpace(npage); itupsz = IndexTupleDSize(*itup); itupsz = MAXALIGN(itupsz); /* * Check whether the item can fit on a btree page at all. (Eventually, we * ought to try to apply TOAST methods if not.) We actually need to be * able to fit three items on every page, so restrict any one item to 1/3 * the per-page available space. Note that at this point, itupsz doesn't * include the ItemId. * * NOTE: similar code appears in _bt_insertonpg() to defend against * oversize items being inserted into an already-existing index. But * during creation of an index, we don't go through there. */ if (itupsz > BTMaxItemSize(npage)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("index row size %zu exceeds maximum %zu for index \"%s\"", itupsz, BTMaxItemSize(npage), RelationGetRelationName(wstate->index)), errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" "Consider a function index of an MD5 hash of the value, " "or use full text indexing."), errtableconstraint(wstate->heap, RelationGetRelationName(wstate->index)))); /* * Check to see if page is "full". It's definitely full if the item won't * fit. Otherwise, compare to the target freespace derived from the * fillfactor. However, we must put at least two items on each page, so * disregard fillfactor if we don't have that many. */ if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY)) { /* * Finish off the page and write it out. 
*/ Page opage = npage; BlockNumber oblkno = nblkno; ItemId ii; ItemId hii; IndexTuple oitup; /* Create new page of same level */ npage = _bt_blnewpage(state->btps_level); /* and assign it a page position */ nblkno = wstate->btws_pages_alloced++; /* * We copy the last item on the page into the new page, and then * rearrange the old page so that the 'last item' becomes its high key * rather than a true data item. There had better be at least two * items on the page already, else the page would be empty of useful * data. */ Assert(last_off > P_FIRSTKEY); ii = PageGetItemId(opage, last_off); oitup = (IndexTuple) PageGetItem(opage, ii); _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY); /* * Move 'last' into the high key position on opage */ hii = PageGetItemId(opage, P_HIKEY); *hii = *ii; ItemIdSetUnused(ii); /* redundant */ ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); /* * Link the old page into its parent, using its minimum key. If we * don't have a parent, we have to create one; this adds a new btree * level. */ if (state->btps_next == NULL) state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); Assert(state->btps_minkey != NULL); ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY); _bt_buildadd(wstate, state->btps_next, state->btps_minkey); pfree(state->btps_minkey); /* * Save a copy of the minimum key for the new page. We have to copy * it off the old page, not the new one, in case we are not at leaf * level. */ state->btps_minkey = CopyIndexTuple(oitup); /* * Set the sibling links for both pages. */ { BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage); BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage); oopaque->btpo_next = nblkno; nopaque->btpo_prev = oblkno; nopaque->btpo_next = P_NONE; /* redundant */ } /* * Write out the old page. We never need to touch it again, so we can * free the opage workspace too. */ _bt_blwritepage(wstate, opage, oblkno); /* * Reset last_off to point to new page */ last_off = P_FIRSTKEY; } /* * If the new item is the first for its page, stash a copy for later. Note * this will only happen for the first item on a level; on later pages, * the first item for a page is copied from the prior page in the code * above. */ if (last_off == P_HIKEY) { Assert(state->btps_minkey == NULL); state->btps_minkey = CopyIndexTuple(itup); } /* * Add the new item into the current page. */ last_off = OffsetNumberNext(last_off); _bt_sortaddtup(npage, itupsz, itup, last_off); state->btps_page = npage; state->btps_blkno = nblkno; state->btps_lastoff = last_off; }
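/*
 * Added sketch: the "is the page full?" test above, restated as a predicate.
 * A page is closed out either when the incoming item cannot fit, or when the
 * remaining free space has dropped below the fillfactor-derived target
 * (state->btps_full) and the page already holds at least two items.  Helper
 * name and parameter names invented for illustration.
 */
static bool
bt_build_page_is_full(Size pgspc, Size itupsz, Size target_free,
					  OffsetNumber last_off)
{
	return pgspc < itupsz ||
		(pgspc < target_free && last_off > P_FIRSTKEY);
}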
/* * Returns a list of items whose visibility map information does not match * the status of the tuples on the page. * * If all_visible is passed as true, this will include all items which are * on pages marked as all-visible in the visibility map but which do not * seem to in fact be all-visible. * * If all_frozen is passed as true, this will include all items which are * on pages marked as all-frozen but which do not seem to in fact be frozen. */ static corrupt_items * collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) { Relation rel; BlockNumber nblocks; corrupt_items *items; BlockNumber blkno; Buffer vmbuffer = InvalidBuffer; BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); TransactionId OldestXmin = InvalidTransactionId; if (all_visible) { /* Don't pass rel; that will fail in recovery. */ OldestXmin = GetOldestXmin(NULL, true); } rel = relation_open(relid, AccessShareLock); if (rel->rd_rel->relkind != RELKIND_RELATION && rel->rd_rel->relkind != RELKIND_MATVIEW && rel->rd_rel->relkind != RELKIND_TOASTVALUE) ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), errmsg("\"%s\" is not a table, materialized view, or TOAST table", RelationGetRelationName(rel)))); nblocks = RelationGetNumberOfBlocks(rel); /* * Guess an initial array size. We don't expect many corrupted tuples, so * start with a small array. This function uses the "next" field to track * the next offset where we can store an item (which is the same thing as * the number of items found so far) and the "count" field to track the * number of entries allocated. We'll repurpose these fields before * returning. */ items = palloc0(sizeof(corrupt_items)); items->next = 0; items->count = 64; items->tids = palloc(items->count * sizeof(ItemPointerData)); /* Loop over every block in the relation. */ for (blkno = 0; blkno < nblocks; ++blkno) { bool check_frozen = false; bool check_visible = false; Buffer buffer; Page page; OffsetNumber offnum, maxoff; /* Make sure we are interruptible. */ CHECK_FOR_INTERRUPTS(); /* Use the visibility map to decide whether to check this page. */ if (all_frozen && VM_ALL_FROZEN(rel, blkno, &vmbuffer)) check_frozen = true; if (all_visible && VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) check_visible = true; if (!check_visible && !check_frozen) continue; /* Read and lock the page. */ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); maxoff = PageGetMaxOffsetNumber(page); /* * The visibility map bits might have changed while we were acquiring * the page lock. Recheck to avoid returning spurious results. */ if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer)) check_frozen = false; if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) check_visible = false; if (!check_visible && !check_frozen) { UnlockReleaseBuffer(buffer); continue; } /* Iterate over each tuple on the page. */ for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { HeapTupleData tuple; ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused or redirect line pointers are of no interest. */ if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid)) continue; /* Dead line pointers are neither all-visible nor frozen. */ if (ItemIdIsDead(itemid)) { ItemPointerSet(&(tuple.t_self), blkno, offnum); record_corrupt_item(items, &tuple.t_self); continue; } /* Initialize a HeapTupleData structure for checks below. 
*/ ItemPointerSet(&(tuple.t_self), blkno, offnum); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tuple.t_tableOid = relid; /* * If we're checking whether the page is all-visible, we expect * the tuple to be all-visible. */ if (check_visible && !tuple_all_visible(&tuple, OldestXmin, buffer)) { TransactionId RecomputedOldestXmin; /* * Time has passed since we computed OldestXmin, so it's * possible that this tuple is all-visible in reality even * though it doesn't appear so based on our * previously-computed value. Let's compute a new value so we * can be certain whether there is a problem. * * From a concurrency point of view, it sort of sucks to * retake ProcArrayLock here while we're holding the buffer * exclusively locked, but it should be safe against * deadlocks, because surely GetOldestXmin() should never take * a buffer lock. And this shouldn't happen often, so it's * worth being careful so as to avoid false positives. */ RecomputedOldestXmin = GetOldestXmin(NULL, true); if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin)) record_corrupt_item(items, &tuple.t_self); else { OldestXmin = RecomputedOldestXmin; if (!tuple_all_visible(&tuple, OldestXmin, buffer)) record_corrupt_item(items, &tuple.t_self); } } /* * If we're checking whether the page is all-frozen, we expect the * tuple to be in a state where it will never need freezing. */ if (check_frozen) { if (heap_tuple_needs_eventual_freeze(tuple.t_data)) record_corrupt_item(items, &tuple.t_self); } } UnlockReleaseBuffer(buffer); } /* Clean up. */ if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); relation_close(rel, AccessShareLock); /* * Before returning, repurpose the fields to match caller's expectations. * next is now the next item that should be read (rather than written) and * count is now the number of items we wrote (rather than the number we * allocated). */ items->count = items->next; items->next = 0; return items; }
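/*
 * Added sketch: what record_corrupt_item() presumably does, based on the
 * next/count bookkeeping described above -- double the array when it fills
 * up, then store a copy of the offending TID.  Not taken from the excerpt.
 */
static void
record_corrupt_item_sketch(corrupt_items *items, ItemPointer tid)
{
	if (items->next >= items->count)
	{
		items->count *= 2;
		items->tids = repalloc(items->tids,
							   items->count * sizeof(ItemPointerData));
	}
	items->tids[items->next++] = *tid;
}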
/* * Subroutine to pre-check whether a page deletion is safe, that is, its * parent page would be left in a valid or deletable state. * * "target" is the page we wish to delete, and "stack" is a search stack * leading to it (approximately). Note that we will update the stack * entry(s) to reflect current downlink positions --- this is harmless and * indeed saves later search effort in _bt_pagedel. * * Note: it's OK to release page locks after checking, because a safe * deletion can't become unsafe due to concurrent activity. A non-rightmost * page cannot become rightmost unless there's a concurrent page deletion, * but only VACUUM does page deletion and we only allow one VACUUM on an index * at a time. An only child could acquire a sibling (of the same parent) only * by being split ... but that would make it a non-rightmost child so the * deletion is still safe. */ static bool _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack) { BlockNumber parent; OffsetNumber poffset, maxoff; Buffer pbuf; Page page; BTPageOpaque opaque; MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; /* * In recovery mode, assume the deletion being replayed is valid. We * can't always check it because we won't have a full search stack, and we * should complain if there's a problem, anyway. */ if (InRecovery) return true; /* Locate the parent's downlink (updating the stack entry if needed) */ ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY); pbuf = _bt_getstackbuf(rel, stack, BT_READ); if (pbuf == InvalidBuffer) elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u", RelationGetRelationName(rel), target); parent = stack->bts_blkno; poffset = stack->bts_offset; page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); maxoff = PageGetMaxOffsetNumber(page); /* * If the target is the rightmost child of its parent, then we can't * delete, unless it's also the only child. */ if (poffset >= maxoff) { /* It's rightmost child... */ if (poffset == P_FIRSTDATAKEY(opaque)) { /* * It's only child, so safe if parent would itself be removable. * We have to check the parent itself, and then recurse to test * the conditions at the parent's parent. */ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque)) { _bt_relbuf(rel, pbuf); return false; } _bt_relbuf(rel, pbuf); return _bt_parent_deletion_safe(rel, parent, stack->bts_parent); } else { /* Unsafe to delete */ _bt_relbuf(rel, pbuf); return false; } } else { /* Not rightmost child, so safe to delete */ _bt_relbuf(rel, pbuf); return true; } }
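/*
 * Minimal sketch of the downlink-priming idiom used above; this helper is
 * hypothetical and not part of the source.  The child block number is stored
 * into the stack entry's index-tuple TID with ItemPointerSet() so that
 * _bt_getstackbuf() knows which downlink to re-locate in the parent page.
 * Whether BT_READ or BT_WRITE is appropriate depends on whether the caller
 * intends to modify the parent once it has been re-found.
 */
static Buffer
bt_refind_downlink(Relation rel, BTStack stack, BlockNumber child)
{
	ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY);
	return _bt_getstackbuf(rel, stack, BT_READ);
}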
static void btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_mark_page_halfdead *xlrec = (xl_btree_mark_page_halfdead *) XLogRecGetData(record); Buffer buffer; Page page; BTPageOpaque pageop; IndexTupleData trunctuple; /* * In normal operation, we would lock all the pages this WAL record * touches before changing any of them. In WAL replay, it should be okay * to lock just one page at a time, since no concurrent index updates can * be happening, and readers should not care whether they arrive at the * target page or not (since it's surely empty). */ /* parent page */ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { OffsetNumber poffset; ItemId itemid; IndexTuple itup; OffsetNumber nextoffset; BlockNumber rightsib; page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); poffset = xlrec->poffset; nextoffset = OffsetNumberNext(poffset); itemid = PageGetItemId(page, nextoffset); itup = (IndexTuple) PageGetItem(page, itemid); rightsib = ItemPointerGetBlockNumber(&itup->t_tid); itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY); nextoffset = OffsetNumberNext(poffset); PageIndexTupleDelete(page, nextoffset); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* Rewrite the leaf page as a halfdead page */ buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = xlrec->leftblk; pageop->btpo_next = xlrec->rightblk; pageop->btpo.level = 0; pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_cycleid = 0; /* * Construct a dummy hikey item that points to the next parent to be * deleted (if any). */ MemSet(&trunctuple, 0, sizeof(IndexTupleData)); trunctuple.t_info = sizeof(IndexTupleData); if (xlrec->topparent != InvalidBlockNumber) ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY); else ItemPointerSetInvalid(&trunctuple.t_tid); if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); }
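/*
 * Illustrative sketch (hypothetical helper, not in the source): recover the
 * "next parent to delete" that btree_xlog_mark_page_halfdead() stashes in the
 * half-dead leaf's dummy high key.  Since replay either sets the high key's
 * TID to (topparent, P_HIKEY) or marks it invalid, reading it back is just a
 * matter of checking validity and extracting the block number.
 */
static BlockNumber
bt_halfdead_topparent(Page leafpage)
{
	ItemId		itemid = PageGetItemId(leafpage, P_HIKEY);
	IndexTuple	itup = (IndexTuple) PageGetItem(leafpage, itemid);

	if (!ItemPointerIsValid(&itup->t_tid))
		return InvalidBlockNumber;
	return ItemPointerGetBlockNumber(&itup->t_tid);
}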
/* * lazy_scan_heap() -- scan an open heap relation * * This routine sets commit status bits, builds lists of dead tuples * and pages with free space, and calculates statistics on the number * of live tuples in the heap. When done, or when we run low on space * for dead-tuple TIDs, invoke vacuuming of indexes and heap. * * If there are no indexes then we just vacuum each dirty page as we * process it, since there's no point in gathering many tuples. */ static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats, Relation *Irel, int nindexes, bool scan_all) { BlockNumber nblocks, blkno; HeapTupleData tuple; char *relname; BlockNumber empty_pages, vacuumed_pages; double num_tuples, tups_vacuumed, nkeep, nunused; IndexBulkDeleteResult **indstats; int i; PGRUsage ru0; Buffer vmbuffer = InvalidBuffer; BlockNumber next_not_all_visible_block; bool skipping_all_visible_blocks; pg_rusage_init(&ru0); relname = RelationGetRelationName(onerel); ereport(elevel, (errmsg("vacuuming \"%s.%s\"", get_namespace_name(RelationGetNamespace(onerel)), relname))); empty_pages = vacuumed_pages = 0; num_tuples = tups_vacuumed = nkeep = nunused = 0; indstats = (IndexBulkDeleteResult **) palloc0(nindexes * sizeof(IndexBulkDeleteResult *)); nblocks = RelationGetNumberOfBlocks(onerel); vacrelstats->rel_pages = nblocks; vacrelstats->scanned_pages = 0; vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; lazy_space_alloc(vacrelstats, nblocks); /* * We want to skip pages that don't require vacuuming according to the * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD * consecutive pages. Since we're reading sequentially, the OS should be * doing readahead for us, so there's no gain in skipping a page now and * then; that's likely to disable readahead and so be counterproductive. * Also, skipping even a single page means that we can't update * relfrozenxid, so we only want to do it if we can skip a goodly number * of pages. * * Before entering the main loop, establish the invariant that * next_not_all_visible_block is the next block number >= blkno that's not * all-visible according to the visibility map, or nblocks if there's no * such block. Also, we set up the skipping_all_visible_blocks flag, * which is needed because we need hysteresis in the decision: once we've * started skipping blocks, we may as well skip everything up to the next * not-all-visible block. * * Note: if scan_all is true, we won't actually skip any pages; but we * maintain next_not_all_visible_block anyway, so as to set up the * all_visible_according_to_vm flag correctly for each page. 
*/ for (next_not_all_visible_block = 0; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; for (blkno = 0; blkno < nblocks; blkno++) { Buffer buf; Page page; OffsetNumber offnum, maxoff; bool tupgone, hastup; int prev_dead_count; OffsetNumber frozen[MaxOffsetNumber]; int nfrozen; Size freespace; bool all_visible_according_to_vm; bool all_visible; bool has_dead_tuples; if (blkno == next_not_all_visible_block) { /* Time to advance next_not_all_visible_block */ for (next_not_all_visible_block++; next_not_all_visible_block < nblocks; next_not_all_visible_block++) { if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer)) break; vacuum_delay_point(); } /* * We know we can't skip the current block. But set up * skipping_all_visible_blocks to do the right thing at the * following blocks. */ if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD) skipping_all_visible_blocks = true; else skipping_all_visible_blocks = false; all_visible_according_to_vm = false; } else { /* Current block is all-visible */ if (skipping_all_visible_blocks && !scan_all) continue; all_visible_according_to_vm = true; } vacuum_delay_point(); vacrelstats->scanned_pages++; /* * If we are close to overrunning the available space for dead-tuple * TIDs, pause and do a cycle of vacuuming before we tackle this page. */ if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage && vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacrelstats->num_index_scans++; } buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy); /* We need buffer cleanup lock so that we can prune HOT chains. */ LockBufferForCleanup(buf); page = BufferGetPage(buf); if (PageIsNew(page)) { /* * An all-zeroes page could be left over if a backend extends the * relation but crashes before initializing the page. Reclaim such * pages for use. * * We have to be careful here because we could be looking at a * page that someone has just added to the relation and not yet * been able to initialize (see RelationGetBufferForTuple). To * protect against that, release the buffer lock, grab the * relation extension lock momentarily, and re-lock the buffer. If * the page is still uninitialized by then, it must be left over * from a crashed backend, and we can initialize it. * * We don't really need the relation lock when this is a new or * temp relation, but it's probably not worth the code space to * check that, since this surely isn't a critical path. * * Note: the comparable code in vacuum.c need not worry because * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); LockRelationForExtension(onerel, ExclusiveLock); UnlockRelationForExtension(onerel, ExclusiveLock); LockBufferForCleanup(buf); if (PageIsNew(page)) { ereport(WARNING, (errmsg("relation \"%s\" page %u is uninitialized --- fixing", relname, blkno))); PageInit(page, BufferGetPageSize(buf), 0); empty_pages++; } freespace = PageGetHeapFreeSpace(page); MarkBufferDirty(buf); UnlockReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } if (PageIsEmpty(page)) { empty_pages++; freespace = PageGetHeapFreeSpace(page); if (!PageIsAllVisible(page)) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); RecordPageWithFreeSpace(onerel, blkno, freespace); continue; } /* * Prune all HOT-update chains in this page. * * We count tuples removed by the pruning step as removed by VACUUM. */ tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, &vacrelstats->latestRemovedXid); /* * Now scan the page to collect vacuumable items and check for tuples * requiring freezing. */ all_visible = true; has_dead_tuples = false; nfrozen = 0; hastup = false; prev_dead_count = vacrelstats->num_dead_tuples; maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId itemid; itemid = PageGetItemId(page, offnum); /* Unused items require no processing, but we count 'em */ if (!ItemIdIsUsed(itemid)) { nunused += 1; continue; } /* Redirect items mustn't be touched */ if (ItemIdIsRedirected(itemid)) { hastup = true; /* this page won't be truncatable */ continue; } ItemPointerSet(&(tuple.t_self), blkno, offnum); /* * DEAD item pointers are to be vacuumed normally; but we don't * count them in tups_vacuumed, else we'd be double-counting (at * least in the common case where heap_page_prune() just freed up * a non-HOT tuple). */ if (ItemIdIsDead(itemid)) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); all_visible = false; continue; } Assert(ItemIdIsNormal(itemid)); tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); tuple.t_len = ItemIdGetLength(itemid); tupgone = false; switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf)) { case HEAPTUPLE_DEAD: /* * Ordinarily, DEAD tuples would have been removed by * heap_page_prune(), but it's possible that the tuple * state changed since heap_page_prune() looked. In * particular an INSERT_IN_PROGRESS tuple could have * changed to DEAD if the inserter aborted. So this * cannot be considered an error condition. * * If the tuple is HOT-updated then it must only be * removed by a prune operation; so we keep it just as if * it were RECENTLY_DEAD. Also, if it's a heap-only * tuple, we choose to keep it, because it'll be a lot * cheaper to get rid of it in the next pruning pass than * to treat it like an indexed tuple. 
*/ if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple)) nkeep += 1; else tupgone = true; /* we can delete the tuple */ all_visible = false; break; case HEAPTUPLE_LIVE: /* Tuple is good --- but let's do some validity checks */ if (onerel->rd_rel->relhasoids && !OidIsValid(HeapTupleGetOid(&tuple))) elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", relname, blkno, offnum); /* * Is the tuple definitely visible to all transactions? * * NB: Like with per-tuple hint bits, we can't set the * PD_ALL_VISIBLE flag if the inserter committed * asynchronously. See SetHintBits for more info. Check * that the HEAP_XMIN_COMMITTED hint bit is set because of * that. */ if (all_visible) { TransactionId xmin; if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) { all_visible = false; break; } /* * The inserter definitely committed. But is it old * enough that everyone sees it as committed? */ xmin = HeapTupleHeaderGetXmin(tuple.t_data); if (!TransactionIdPrecedes(xmin, OldestXmin)) { all_visible = false; break; } } break; case HEAPTUPLE_RECENTLY_DEAD: /* * If tuple is recently deleted then we must not remove it * from relation. */ nkeep += 1; all_visible = false; break; case HEAPTUPLE_INSERT_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* This is an expected case during concurrent vacuum */ all_visible = false; break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } if (tupgone) { lazy_record_dead_tuple(vacrelstats, &(tuple.t_self)); HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data, &vacrelstats->latestRemovedXid); tups_vacuumed += 1; has_dead_tuples = true; } else { num_tuples += 1; hastup = true; /* * Each non-removable tuple must be checked to see if it needs * freezing. Note we already have exclusive buffer lock. */ if (heap_freeze_tuple(tuple.t_data, FreezeLimit, InvalidBuffer)) frozen[nfrozen++] = offnum; } } /* scan along page */ /* * If we froze any tuples, mark the buffer dirty, and write a WAL * record recording the changes. We must log the changes to be * crash-safe against future truncation of CLOG. */ if (nfrozen > 0) { MarkBufferDirty(buf); if (RelationNeedsWAL(onerel)) { XLogRecPtr recptr; recptr = log_heap_freeze(onerel, buf, FreezeLimit, frozen, nfrozen); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } /* * If there are no indexes then we can vacuum the page right now * instead of doing a second scan. */ if (nindexes == 0 && vacrelstats->num_dead_tuples > 0) { /* Remove tuples from heap */ lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats); /* * Forget the now-vacuumed tuples, and press on, but be careful * not to reset latestRemovedXid since we want that value to be * valid. */ vacrelstats->num_dead_tuples = 0; vacuumed_pages++; } freespace = PageGetHeapFreeSpace(page); /* Update the all-visible flag on the page */ if (!PageIsAllVisible(page) && all_visible) { PageSetAllVisible(page); SetBufferCommitInfoNeedsSave(buf); } /* * It's possible for the value returned by GetOldestXmin() to move * backwards, so it's not wrong for us to see tuples that appear to * not be visible to everyone yet, while PD_ALL_VISIBLE is already * set. The real safe xmin value never moves backwards, but * GetOldestXmin() is conservative and sometimes returns a value * that's unnecessarily small, so if we see that contradiction it just * means that the tuples that we think are not visible to everyone yet * actually are, and the PD_ALL_VISIBLE flag is correct. 
* * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. */ else if (PageIsAllVisible(page) && has_dead_tuples) { elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u", relname, blkno); PageClearAllVisible(page); SetBufferCommitInfoNeedsSave(buf); /* * Normally, we would drop the lock on the heap page before * updating the visibility map, but since this case shouldn't * happen anyway, don't worry about that. */ visibilitymap_clear(onerel, blkno); } LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* Update the visibility map */ if (!all_visible_according_to_vm && all_visible) { visibilitymap_pin(onerel, blkno, &vmbuffer); LockBuffer(buf, BUFFER_LOCK_SHARE); if (PageIsAllVisible(page)) visibilitymap_set(onerel, blkno, PageGetLSN(page), &vmbuffer); LockBuffer(buf, BUFFER_LOCK_UNLOCK); } ReleaseBuffer(buf); /* Remember the location of the last page with nonremovable tuples */ if (hastup) vacrelstats->nonempty_pages = blkno + 1; /* * If we remembered any tuples for deletion, then the page will be * visited again by lazy_vacuum_heap, which will compute and record * its post-compaction free space. If not, then we're done with this * page, so remember its free space as-is. (This path will always be * taken if there are no indexes.) */ if (vacrelstats->num_dead_tuples == prev_dead_count) RecordPageWithFreeSpace(onerel, blkno, freespace); } /* save stats for use later */ vacrelstats->scanned_tuples = num_tuples; vacrelstats->tuples_deleted = tups_vacuumed; /* now we can compute the new value for pg_class.reltuples */ vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false, nblocks, vacrelstats->scanned_pages, num_tuples); /* If any tuples need to be deleted, perform final vacuum cycle */ /* XXX put a threshold on min number of tuples here? */ if (vacrelstats->num_dead_tuples > 0) { /* Log cleanup info before we touch indexes */ vacuum_log_cleanup_info(onerel, vacrelstats); /* Remove index entries */ for (i = 0; i < nindexes; i++) lazy_vacuum_index(Irel[i], &indstats[i], vacrelstats); /* Remove tuples from heap */ lazy_vacuum_heap(onerel, vacrelstats); vacrelstats->num_index_scans++; } /* Release the pin on the visibility map page */ if (BufferIsValid(vmbuffer)) { ReleaseBuffer(vmbuffer); vmbuffer = InvalidBuffer; } /* Do post-vacuum cleanup and statistics update for each index */ for (i = 0; i < nindexes; i++) lazy_cleanup_index(Irel[i], indstats[i], vacrelstats); /* If no indexes, make log report that lazy_vacuum_heap would've made */ if (vacuumed_pages) ereport(elevel, (errmsg("\"%s\": removed %.0f row versions in %u pages", RelationGetRelationName(onerel), tups_vacuumed, vacuumed_pages))); ereport(elevel, (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages", RelationGetRelationName(onerel), tups_vacuumed, num_tuples, vacrelstats->scanned_pages, nblocks), errdetail("%.0f dead row versions cannot be removed yet.\n" "There were %.0f unused item pointers.\n" "%u pages are entirely empty.\n" "%s.", nkeep, nunused, empty_pages, pg_rusage_show(&ru0)))); }
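/*
 * Minimal sketch (hypothetical helper, not in the source): the predicate that
 * lazy_scan_heap() applies before each page to decide whether to pause and
 * run an index-vacuum cycle.  There must be fewer than one page's worth of
 * free dead-TID slots remaining, and pausing is only worthwhile if some dead
 * TIDs have actually been remembered so far.
 */
static bool
dead_tuple_space_is_low(const LVRelStats *vacrelstats)
{
	return (vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
		vacrelstats->num_dead_tuples > 0;
}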
static void btree_xlog_unlink_page(uint8 info, XLogReaderState *record) { XLogRecPtr lsn = record->EndRecPtr; xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); BlockNumber leftsib; BlockNumber rightsib; Buffer buffer; Page page; BTPageOpaque pageop; leftsib = xlrec->leftsib; rightsib = xlrec->rightsib; /* * In normal operation, we would lock all the pages this WAL record * touches before changing any of them. In WAL replay, it should be okay * to lock just one page at a time, since no concurrent index updates can * be happening, and readers should not care whether they arrive at the * target page or not (since it's surely empty). */ /* Fix left-link of right sibling */ if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = leftsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); /* Fix right-link of left sibling, if any */ if (leftsib != P_NONE) { if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) { page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_next = rightsib; PageSetLSN(page, lsn); MarkBufferDirty(buffer); } if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); } /* Rewrite target page as empty deleted page */ buffer = XLogInitBufferForRedo(record, 0); page = (Page) BufferGetPage(buffer); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = leftsib; pageop->btpo_next = rightsib; pageop->btpo.xact = xlrec->btpo_xact; pageop->btpo_flags = BTP_DELETED; pageop->btpo_cycleid = 0; PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); /* * If we deleted a parent of the targeted leaf page, instead of the leaf * itself, update the leaf to point to the next remaining child in the * branch. */ if (XLogRecHasBlockRef(record, 3)) { /* * There is no real data on the page, so we just re-create it from * scratch using the information from the WAL record. */ IndexTupleData trunctuple; buffer = XLogInitBufferForRedo(record, 3); page = (Page) BufferGetPage(buffer); pageop = (BTPageOpaque) PageGetSpecialPointer(page); _bt_pageinit(page, BufferGetPageSize(buffer)); pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; pageop->btpo_prev = xlrec->leafleftsib; pageop->btpo_next = xlrec->leafrightsib; pageop->btpo.level = 0; pageop->btpo_cycleid = 0; /* Add a dummy hikey item */ MemSet(&trunctuple, 0, sizeof(IndexTupleData)); trunctuple.t_info = sizeof(IndexTupleData); if (xlrec->topparent != InvalidBlockNumber) ItemPointerSet(&trunctuple.t_tid, xlrec->topparent, P_HIKEY); else ItemPointerSetInvalid(&trunctuple.t_tid); if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY, false, false) == InvalidOffsetNumber) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); MarkBufferDirty(buffer); UnlockReleaseBuffer(buffer); } /* Update metapage if needed */ if (info == XLOG_BTREE_UNLINK_PAGE_META) _bt_restore_meta(record, 4); }
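/*
 * Illustrative consistency check, e.g. for a debugging Assert; this helper is
 * hypothetical and not part of the source.  After btree_xlog_unlink_page()
 * has been replayed, the surviving siblings should point at each other and
 * bypass the unlinked page: the left sibling's right-link must name the right
 * sibling's block, and vice versa.  Assumes both siblings exist, i.e. the
 * unlinked page was neither leftmost nor rightmost on its level.
 */
static void
bt_check_unlinked_sibling_links(Page leftpage, Page rightpage,
								BlockNumber leftsib, BlockNumber rightsib)
{
	BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
	BTPageOpaque ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);

	Assert(lopaque->btpo_next == rightsib);
	Assert(ropaque->btpo_prev == leftsib);
}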