/* ---------------- * index_getnext - get the next heap tuple from a scan * * The result is the next heap tuple satisfying the scan keys and the * snapshot, or NULL if no more matching tuples exist. On success, * the buffer containing the heap tuple is pinned (the pin will be dropped * at the next index_getnext or index_endscan). * * Note: caller must check scan->xs_recheck, and perform rechecking of the * scan keys if required. We do not do that here because we don't have * enough information to do it efficiently in the general case. * ---------------- */ HeapTuple index_getnext(IndexScanDesc scan, ScanDirection direction) { HeapTuple heapTuple = &scan->xs_ctup; ItemPointer tid = &heapTuple->t_self; FmgrInfo *procedure; SCAN_CHECKS; GET_SCAN_PROCEDURE(amgettuple); Assert(TransactionIdIsValid(RecentGlobalXmin)); /* * We always reset xs_hot_dead; if we are here then either we are just * starting the scan, or we previously returned a visible tuple, and in * either case it's inappropriate to kill the prior index entry. */ scan->xs_hot_dead = false; for (;;) { OffsetNumber offnum; bool at_chain_start; Page dp; if (scan->xs_next_hot != InvalidOffsetNumber) { /* * We are resuming scan of a HOT chain after having returned an * earlier member. Must still hold pin on current heap page. */ Assert(BufferIsValid(scan->xs_cbuf)); Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(scan->xs_cbuf)); Assert(TransactionIdIsValid(scan->xs_prev_xmax)); offnum = scan->xs_next_hot; at_chain_start = false; scan->xs_next_hot = InvalidOffsetNumber; } else { bool found; Buffer prev_buf; /* * If we scanned a whole HOT chain and found only dead tuples, * tell index AM to kill its entry for that TID. We do not do this * when in recovery because it may violate MVCC to do so. see * comments in RelationGetIndexScan(). */ if (!scan->xactStartedInRecovery) scan->kill_prior_tuple = scan->xs_hot_dead; /* * The AM's gettuple proc finds the next index entry matching the * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid). It * should also set scan->xs_recheck, though we pay no attention to * that here. */ found = DatumGetBool(FunctionCall2(procedure, PointerGetDatum(scan), Int32GetDatum(direction))); /* Reset kill flag immediately for safety */ scan->kill_prior_tuple = false; /* If we're out of index entries, break out of outer loop */ if (!found) break; pgstat_count_index_tuples(scan->indexRelation, 1); /* Switch to correct buffer if we don't have it already */ prev_buf = scan->xs_cbuf; scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf, scan->heapRelation, ItemPointerGetBlockNumber(tid)); /* * Prune page, but only if we weren't already on this page */ if (prev_buf != scan->xs_cbuf) heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf, RecentGlobalXmin); /* Prepare to scan HOT chain starting at index-referenced offnum */ offnum = ItemPointerGetOffsetNumber(tid); at_chain_start = true; /* We don't know what the first tuple's xmin should be */ scan->xs_prev_xmax = InvalidTransactionId; /* Initialize flag to detect if all entries are dead */ scan->xs_hot_dead = true; } /* Obtain share-lock on the buffer so we can examine visibility */ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE); dp = (Page) BufferGetPage(scan->xs_cbuf); /* Scan through possible multiple members of HOT-chain */ for (;;) { ItemId lp; ItemPointer ctid; bool valid; /* check for bogus TID */ if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) break; lp = PageGetItemId(dp, offnum); /* check for unused, dead, or redirected items */ if (!ItemIdIsNormal(lp)) { /* We should only see a redirect at start of chain */ if (ItemIdIsRedirected(lp) && at_chain_start) { /* Follow the redirect */ offnum = ItemIdGetRedirect(lp); at_chain_start = false; continue; } /* else must be end of chain */ break; } /* * We must initialize all of *heapTuple (ie, scan->xs_ctup) since * it is returned to the executor on success. */ heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp); heapTuple->t_len = ItemIdGetLength(lp); ItemPointerSetOffsetNumber(tid, offnum); heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation); ctid = &heapTuple->t_data->t_ctid; /* * Shouldn't see a HEAP_ONLY tuple at chain start. (This test * should be unnecessary, since the chain root can't be removed * while we have pin on the index entry, but let's make it * anyway.) */ if (at_chain_start && HeapTupleIsHeapOnly(heapTuple)) break; /* * The xmin should match the previous xmax value, else chain is * broken. (Note: this test is not optional because it protects * us against the case where the prior chain member's xmax aborted * since we looked at it.) */ if (TransactionIdIsValid(scan->xs_prev_xmax) && !TransactionIdEquals(scan->xs_prev_xmax, HeapTupleHeaderGetXmin(heapTuple->t_data))) break; /* If it's visible per the snapshot, we must return it */ valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot, scan->xs_cbuf); CheckForSerializableConflictOut(valid, scan->heapRelation, heapTuple, scan->xs_cbuf); if (valid) { /* * If the snapshot is MVCC, we know that it could accept at * most one member of the HOT chain, so we can skip examining * any more members. Otherwise, check for continuation of the * HOT-chain, and set state for next time. */ if (IsMVCCSnapshot(scan->xs_snapshot)) scan->xs_next_hot = InvalidOffsetNumber; else if (HeapTupleIsHotUpdated(heapTuple)) { Assert(ItemPointerGetBlockNumber(ctid) == ItemPointerGetBlockNumber(tid)); scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid); scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); } else scan->xs_next_hot = InvalidOffsetNumber; PredicateLockTuple(scan->heapRelation, heapTuple); LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); pgstat_count_heap_fetch(scan->indexRelation); return heapTuple; } /* * If we can't see it, maybe no one else can either. Check to see * if the tuple is dead to all transactions. If we find that all * the tuples in the HOT chain are dead, we'll signal the index AM * to not return that TID on future indexscans. */ if (scan->xs_hot_dead && HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin, scan->xs_cbuf) != HEAPTUPLE_DEAD) scan->xs_hot_dead = false; /* * Check to see if HOT chain continues past this tuple; if so * fetch the next offnum (we don't bother storing it into * xs_next_hot, but must store xs_prev_xmax), and loop around. */ if (HeapTupleIsHotUpdated(heapTuple)) { Assert(ItemPointerGetBlockNumber(ctid) == ItemPointerGetBlockNumber(tid)); offnum = ItemPointerGetOffsetNumber(ctid); at_chain_start = false; scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data); } else break; /* end of chain */ } /* loop over a single HOT chain */ LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK); /* Loop around to ask index AM for another TID */ scan->xs_next_hot = InvalidOffsetNumber; } /* Release any held pin on a heap page */ if (BufferIsValid(scan->xs_cbuf)) { ReleaseBuffer(scan->xs_cbuf); scan->xs_cbuf = InvalidBuffer; } return NULL; /* failure exit */ }
void heap2_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; info &= XLOG_HEAP_OPMASK; if (info == XLOG_HEAP2_CLEAN) { xl_heap_clean *xlrec = (xl_heap_clean *) rec; appendStringInfo(buf, "remxid %u", xlrec->latestRemovedXid); } else if (info == XLOG_HEAP2_FREEZE_PAGE) { xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec; appendStringInfo(buf, "cutoff xid %u ntuples %u", xlrec->cutoff_xid, xlrec->ntuples); } else if (info == XLOG_HEAP2_CLEANUP_INFO) { xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec; appendStringInfo(buf, "remxid %u", xlrec->latestRemovedXid); } else if (info == XLOG_HEAP2_VISIBLE) { xl_heap_visible *xlrec = (xl_heap_visible *) rec; appendStringInfo(buf, "cutoff xid %u flags %d", xlrec->cutoff_xid, xlrec->flags); } else if (info == XLOG_HEAP2_MULTI_INSERT) { xl_heap_multi_insert *xlrec = (xl_heap_multi_insert *) rec; appendStringInfo(buf, "%d tuples", xlrec->ntuples); } else if (info == XLOG_HEAP2_LOCK_UPDATED) { xl_heap_lock_updated *xlrec = (xl_heap_lock_updated *) rec; appendStringInfo(buf, "off %u: xmax %u: flags %u ", xlrec->offnum, xlrec->xmax, xlrec->flags); out_infobits(buf, xlrec->infobits_set); } else if (info == XLOG_HEAP2_NEW_CID) { xl_heap_new_cid *xlrec = (xl_heap_new_cid *) rec; appendStringInfo(buf, "rel %u/%u/%u; tid %u/%u", xlrec->target_node.spcNode, xlrec->target_node.dbNode, xlrec->target_node.relNode, ItemPointerGetBlockNumber(&(xlrec->target_tid)), ItemPointerGetOffsetNumber(&(xlrec->target_tid))); appendStringInfo(buf, "; cmin: %u, cmax: %u, combo: %u", xlrec->cmin, xlrec->cmax, xlrec->combocid); } }
/* ---------------------------------------------------------------- * IndexOnlyNext * * Retrieve a tuple from the IndexOnlyScan node's index. * ---------------------------------------------------------------- */ static TupleTableSlot * IndexOnlyNext(IndexOnlyScanState *node) { EState *estate; ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid; /* * extract necessary information from index scan node */ estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; else if (ScanDirectionIsBackward(direction)) direction = ForwardScanDirection; } scandesc = node->ioss_ScanDesc; econtext = node->ss.ps.ps_ExprContext; slot = node->ss.ss_ScanTupleSlot; /* * OK, now that we have what we need, fetch the next tuple. */ while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { HeapTuple tuple = NULL; /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, * we'll use the index tuple not the heap tuple as the data source. * * Note on Memory Ordering Effects: visibilitymap_test does not lock * the visibility map buffer, and therefore the result we read here * could be slightly stale. However, it can't be stale enough to * matter. * * We need to detect clearing a VM bit due to an insert right away, * because the tuple is present in the index page but not visible. The * reading of the TID by this scan (using a shared lock on the index * buffer) is serialized with the insert of the TID into the index * (using an exclusive lock on the index buffer). Because the VM bit * is cleared before updating the index, and locking/unlocking of the * index page acts as a full memory barrier, we are sure to see the * cleared bit if we see a recently-inserted TID. * * Deletes do not update the index page (only VACUUM will clear out * the TID), so the clearing of the VM bit by a delete is not * serialized with this test below, and we may see a value that is * significantly stale. However, we don't care about the delete right * away, because the tuple is still visible until the deleting * transaction commits or the statement ends (if it's our * transaction). In either case, the lock on the VM buffer will have * been released (acting as a write barrier) after clearing the bit. * And for us to have a snapshot that includes the deleting * transaction (making the tuple invisible), we must have acquired * ProcArrayLock after that time, acting as a read barrier. * * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. */ if (!visibilitymap_test(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), &node->ioss_VMBuffer)) { /* * Rats, we have to visit the heap to check visibility. */ node->ioss_HeapFetches++; tuple = index_fetch_heap(scandesc); if (tuple == NULL) continue; /* no visible tuple, try next index entry */ /* * Only MVCC snapshots are supported here, so there should be no * need to keep following the HOT chain once a visible entry has * been found. If we did want to allow that, we'd need to keep * more state to remember not to call index_getnext_tid next time. */ if (scandesc->xs_continue_hot) elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); /* * Note: at this point we are holding a pin on the heap page, as * recorded in scandesc->xs_cbuf. We could release that pin now, * but it's not clear whether it's a win to do so. The next index * entry might require a visit to the same heap page. */ } /* * Fill the scan tuple slot with data from the index. */ StoreIndexTuple(slot, scandesc->xs_itup, scandesc->xs_itupdesc); /* * If the index was lossy, we have to recheck the index quals. * (Currently, this can never happen, but we should support the case * for possible future use, eg with GiST indexes.) */ if (scandesc->xs_recheck) { econtext->ecxt_scantuple = slot; ResetExprContext(econtext); if (!ExecQual(node->indexqual, econtext, false)) { /* Fails recheck, so drop it and loop back for another */ InstrCountFiltered2(node, 1); continue; } } /* * We don't currently support rechecking ORDER BY distances. (In * principle, if the index can support retrieval of the originally * indexed value, it should be able to produce an exact distance * calculation too. So it's not clear that adding code here for * recheck/re-sort would be worth the trouble. But we should at least * throw an error if someone tries it.) */ if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("lossy distance functions are not supported in index-only scans"))); /* * Predicate locks for index-only scans must be acquired at the page * level when the heap is not accessed, since tuple-level predicate * locks need the tuple's xmin value. If we had to visit the tuple * anyway, then we already have the tuple-level lock and can skip the * page lock. */ if (tuple == NULL) PredicateLockPage(scandesc->heapRelation, ItemPointerGetBlockNumber(tid), estate->es_snapshot); return slot; } /* * if we get here it means the index scan failed so we are at the end of * the scan.. */ return ExecClearTuple(slot); }
/* * Bulk deletion of all index entries pointing to a set of heap tuples and * check invalid tuples left after upgrade. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum gistbulkdelete(PG_FUNCTION_ARGS) { IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2); void *callback_state = (void *) PG_GETARG_POINTER(3); Relation rel = info->index; GistBDItem *stack, *ptr; /* first time through? */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); /* we'll re-count the tuples each time */ stats->estimated_count = false; stats->num_index_tuples = 0; stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; while (stack) { Buffer buffer; Page page; OffsetNumber i, maxoff; IndexTuple idxtuple; ItemId iid; buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_SHARE); gistcheckpage(rel, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { OffsetNumber todelete[MaxOffsetNumber]; int ntodelete = 0; LockBuffer(buffer, GIST_UNLOCK); LockBuffer(buffer, GIST_EXCLUSIVE); page = (Page) BufferGetPage(buffer); if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page)) { /* only the root can become non-leaf during relock */ UnlockReleaseBuffer(buffer); /* one more check */ continue; } /* * check for split proceeded after look at parent, we should check * it after relock */ pushStackIfSplited(page, stack); /* * Remove deletable tuples from page */ maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); if (callback(&(idxtuple->t_tid), callback_state)) { todelete[ntodelete] = i - ntodelete; ntodelete++; stats->tuples_removed += 1; } else stats->num_index_tuples += 1; } if (ntodelete) { START_CRIT_SECTION(); MarkBufferDirty(buffer); for (i = 0; i < ntodelete; i++) PageIndexTupleDelete(page, todelete[i]); GistMarkTuplesDeleted(page); if (RelationNeedsWAL(rel)) { XLogRecPtr recptr; recptr = gistXLogUpdate(rel->rd_node, buffer, todelete, ntodelete, NULL, 0, InvalidBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } else PageSetLSN(page, GetXLogRecPtrForTemp()); END_CRIT_SECTION(); } } else { /* check for split proceeded after look at parent */ pushStackIfSplited(page, stack); maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); ptr = (GistBDItem *) palloc(sizeof(GistBDItem)); ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = PageGetLSN(page); ptr->next = stack->next; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) ereport(LOG, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(rel)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); } } UnlockReleaseBuffer(buffer); ptr = stack->next; pfree(stack); stack = ptr; vacuum_delay_point(); } PG_RETURN_POINTER(stats); }
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If autosummarization is enabled, check if we need to summarize the previous * page range. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do for this tuple. */ bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique, IndexInfo *indexInfo) { BlockNumber pagesPerRange; BlockNumber origHeapBlk; BlockNumber heapBlk; BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = CurrentMemoryContext; bool autosummarize = BrinGetAutoSummarize(idxRel); revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); /* * origHeapBlk is the block number where the insertion occurred. heapBlk * is the first block in the corresponding page range. */ origHeapBlk = ItemPointerGetBlockNumber(heaptid); heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange; for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; int keyno; CHECK_FOR_INTERRUPTS(); /* * If auto-summarization is enabled and we just inserted the first * tuple into the first block of a new non-first page range, request a * summarization run of the previous range. */ if (autosummarize && heapBlk > 0 && heapBlk == origHeapBlk && ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber) { BlockNumber lastPageRange = heapBlk - 1; BrinTuple *lastPageTuple; lastPageTuple = brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off, NULL, BUFFER_LOCK_SHARE, NULL); if (!lastPageTuple) { bool recorded; recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange, RelationGetRelid(idxRel), lastPageRange); if (!recorded) ereport(LOG, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded", RelationGetRelationName(idxRel), lastPageRange))); } else LockBuffer(buf, BUFFER_LOCK_UNLOCK); } brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE, NULL); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through in this statement? */ if (bdesc == NULL) { MemoryContextSwitchTo(indexInfo->ii_Context); bdesc = brin_build_desc(idxRel); indexInfo->ii_AmCache = (void *) bdesc; MemoryContextSwitchTo(oldcxt); } /* First time through in this brininsert call? */ if (tupcxt == NULL) { tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup, NULL); /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. */ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz, NULL, NULL); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); MemoryContextSwitchTo(oldcxt); if (tupcxt != NULL) MemoryContextDelete(tupcxt); return false; }
void heap_desc(StringInfo buf, XLogRecord *record) { char *rec = XLogRecGetData(record); uint8 info = record->xl_info & ~XLR_INFO_MASK; info &= XLOG_HEAP_OPMASK; if (info == XLOG_HEAP_INSERT) { xl_heap_insert *xlrec = (xl_heap_insert *) rec; out_target(buf, &(xlrec->target)); } else if (info == XLOG_HEAP_DELETE) { xl_heap_delete *xlrec = (xl_heap_delete *) rec; out_target(buf, &(xlrec->target)); appendStringInfoChar(buf, ' '); out_infobits(buf, xlrec->infobits_set); } else if (info == XLOG_HEAP_UPDATE) { xl_heap_update *xlrec = (xl_heap_update *) rec; out_target(buf, &(xlrec->target)); appendStringInfo(buf, " xmax %u ", xlrec->old_xmax); out_infobits(buf, xlrec->old_infobits_set); appendStringInfo(buf, "; new tid %u/%u xmax %u", ItemPointerGetBlockNumber(&(xlrec->newtid)), ItemPointerGetOffsetNumber(&(xlrec->newtid)), xlrec->new_xmax); } else if (info == XLOG_HEAP_HOT_UPDATE) { xl_heap_update *xlrec = (xl_heap_update *) rec; out_target(buf, &(xlrec->target)); appendStringInfo(buf, " xmax %u ", xlrec->old_xmax); out_infobits(buf, xlrec->old_infobits_set); appendStringInfo(buf, "; new tid %u/%u xmax %u", ItemPointerGetBlockNumber(&(xlrec->newtid)), ItemPointerGetOffsetNumber(&(xlrec->newtid)), xlrec->new_xmax); } else if (info == XLOG_HEAP_LOCK) { xl_heap_lock *xlrec = (xl_heap_lock *) rec; appendStringInfo(buf, "xid %u: ", xlrec->locking_xid); out_target(buf, &(xlrec->target)); appendStringInfoChar(buf, ' '); out_infobits(buf, xlrec->infobits_set); } else if (info == XLOG_HEAP_INPLACE) { xl_heap_inplace *xlrec = (xl_heap_inplace *) rec; out_target(buf, &(xlrec->target)); } }
/* * Updates the stack so that child->parent is the correct parent of the * child. child->parent must be exclusively locked on entry, and will * remain so at exit, but it might not be the same page anymore. */ static void gistFindCorrectParent(Relation r, GISTInsertStack *child) { GISTInsertStack *parent = child->parent; gistcheckpage(r, parent->buffer); parent->page = (Page) BufferGetPage(parent->buffer); /* here we don't need to distinguish between split and page update */ if (child->downlinkoffnum == InvalidOffsetNumber || parent->lsn != PageGetLSN(parent->page)) { /* parent is changed, look child in right links until found */ OffsetNumber i, maxoff; ItemId iid; IndexTuple idxtuple; GISTInsertStack *ptr; while (true) { maxoff = PageGetMaxOffsetNumber(parent->page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(parent->page, i); idxtuple = (IndexTuple) PageGetItem(parent->page, iid); if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno) { /* yes!!, found */ child->downlinkoffnum = i; return; } } parent->blkno = GistPageGetOpaque(parent->page)->rightlink; UnlockReleaseBuffer(parent->buffer); if (parent->blkno == InvalidBlockNumber) { /* * End of chain and still didn't find parent. It's a very-very * rare situation when root splited. */ break; } parent->buffer = ReadBuffer(r, parent->blkno); LockBuffer(parent->buffer, GIST_EXCLUSIVE); gistcheckpage(r, parent->buffer); parent->page = (Page) BufferGetPage(parent->buffer); } /* * awful!!, we need search tree to find parent ... , but before we * should release all old parent */ ptr = child->parent->parent; /* child->parent already released * above */ while (ptr) { ReleaseBuffer(ptr->buffer); ptr = ptr->parent; } /* ok, find new path */ ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum); /* read all buffers as expected by caller */ /* note we don't lock them or gistcheckpage them here! */ while (ptr) { ptr->buffer = ReadBuffer(r, ptr->blkno); ptr->page = (Page) BufferGetPage(ptr->buffer); ptr = ptr->parent; } /* install new chain of parents to stack */ child->parent = parent; /* make recursive call to normal processing */ LockBuffer(child->parent->buffer, GIST_EXCLUSIVE); gistFindCorrectParent(r, child); } return; }
/* * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so. * * This action unlinks the page from the b-tree structure, removing all * pointers leading to it --- but not touching its own left and right links. * The page cannot be physically reclaimed right away, since other processes * may currently be trying to follow links leading to the page; they have to * be allowed to use its right-link to recover. See nbtree/README. * * On entry, the target buffer must be pinned and locked (either read or write * lock is OK). This lock and pin will be dropped before exiting. * * The "stack" argument can be a search stack leading (approximately) to the * target page, or NULL --- outside callers typically pass NULL since they * have not done such a search, but internal recursion cases pass the stack * to avoid duplicated search effort. * * Returns the number of pages successfully deleted (zero if page cannot * be deleted now; could be more than one if parent pages were deleted too). * * NOTE: this leaks memory. Rather than trying to clean up everything * carefully, it's better to run it in a temp context that can be reset * frequently. */ int _bt_pagedel(Relation rel, Buffer buf, BTStack stack) { int result; BlockNumber target, leftsib, rightsib, parent; OffsetNumber poffset, maxoff; uint32 targetlevel, ilevel; ItemId itemid; IndexTuple targetkey, itup; ScanKey itup_scankey; Buffer lbuf, rbuf, pbuf; bool parent_half_dead; bool parent_one_child; bool rightsib_empty; Buffer metabuf = InvalidBuffer; Page metapg = NULL; BTMetaPageData *metad = NULL; Page page; BTPageOpaque opaque; /* * We can never delete rightmost pages nor root pages. While at it, check * that page is not already deleted and is empty. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) { /* Should never fail to delete a half-dead page */ Assert(!P_ISHALFDEAD(opaque)); _bt_relbuf(rel, buf); return 0; } /* * Save info about page, including a copy of its high key (it must have * one, being non-rightmost). */ target = BufferGetBlockNumber(buf); targetlevel = opaque->btpo.level; leftsib = opaque->btpo_prev; itemid = PageGetItemId(page, P_HIKEY); targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid)); /* * To avoid deadlocks, we'd better drop the target page lock before going * further. */ _bt_relbuf(rel, buf); /* * We need an approximate pointer to the page's parent page. We use the * standard search mechanism to search for the page's high key; this will * give us a link to either the current parent or someplace to its left * (if there are multiple equal high keys). In recursion cases, the * caller already generated a search stack and we can just re-use that * work. */ if (stack == NULL) { if (!InRecovery) { /* we need an insertion scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, targetkey); /* find the leftmost leaf page containing this key */ stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false, &lbuf, BT_READ); /* don't need a pin on that either */ _bt_relbuf(rel, lbuf); /* * If we are trying to delete an interior page, _bt_search did * more than we needed. Locate the stack item pointing to our * parent level. */ ilevel = 0; for (;;) { if (stack == NULL) elog(ERROR, "not enough stack items"); if (ilevel == targetlevel) break; stack = stack->bts_parent; ilevel++; } } else { /* * During WAL recovery, we can't use _bt_search (for one reason, * it might invoke user-defined comparison functions that expect * facilities not available in recovery mode). Instead, just set * up a dummy stack pointing to the left end of the parent tree * level, from which _bt_getstackbuf will walk right to the parent * page. Painful, but we don't care too much about performance in * this scenario. */ pbuf = _bt_get_endpoint(rel, targetlevel + 1, false); stack = (BTStack) palloc(sizeof(BTStackData)); stack->bts_blkno = BufferGetBlockNumber(pbuf); stack->bts_offset = InvalidOffsetNumber; /* bts_btentry will be initialized below */ stack->bts_parent = NULL; _bt_relbuf(rel, pbuf); } } /* * We cannot delete a page that is the rightmost child of its immediate * parent, unless it is the only child --- in which case the parent has to * be deleted too, and the same condition applies recursively to it. We * have to check this condition all the way up before trying to delete. We * don't need to re-test when deleting a non-leaf page, though. */ if (targetlevel == 0 && !_bt_parent_deletion_safe(rel, target, stack)) return 0; /* * We have to lock the pages we need to modify in the standard order: * moving right, then up. Else we will deadlock against other writers. * * So, we need to find and write-lock the current left sibling of the * target page. The sibling that was current a moment ago could have * split, so we may have to move right. This search could fail if either * the sibling or the target page was deleted by someone else meanwhile; * if so, give up. (Right now, that should never happen, since page * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs * concurrently on the same table.) */ if (leftsib != P_NONE) { lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); while (P_ISDELETED(opaque) || opaque->btpo_next != target) { /* step right one page */ leftsib = opaque->btpo_next; _bt_relbuf(rel, lbuf); if (leftsib == P_NONE) { elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"", RelationGetRelationName(rel)); return 0; } lbuf = _bt_getbuf(rel, leftsib, BT_WRITE); page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } } else lbuf = InvalidBuffer; /* * Next write-lock the target page itself. It should be okay to take just * a write lock not a superexclusive lock, since no scans would stop on an * empty page. */ buf = _bt_getbuf(rel, target, BT_WRITE); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* * Check page is still empty etc, else abandon deletion. The empty check * is necessary since someone else might have inserted into it while we * didn't have it locked; the others are just for paranoia's sake. */ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) || P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page)) { _bt_relbuf(rel, buf); if (BufferIsValid(lbuf)) _bt_relbuf(rel, lbuf); return 0; } if (opaque->btpo_prev != leftsib) elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"", target, RelationGetRelationName(rel)); /* * And next write-lock the (current) right sibling. */ rightsib = opaque->btpo_next; rbuf = _bt_getbuf(rel, rightsib, BT_WRITE); page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (opaque->btpo_prev != target) elog(ERROR, "right sibling's left-link doesn't match: " "block %u links to %u instead of expected %u in index \"%s\"", rightsib, opaque->btpo_prev, target, RelationGetRelationName(rel)); /* * Next find and write-lock the current parent of the target page. This is * essentially the same as the corresponding step of splitting. */ ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY); pbuf = _bt_getstackbuf(rel, stack, BT_WRITE); if (pbuf == InvalidBuffer) elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u", RelationGetRelationName(rel), target); parent = stack->bts_blkno; poffset = stack->bts_offset; /* * If the target is the rightmost child of its parent, then we can't * delete, unless it's also the only child --- in which case the parent * changes to half-dead status. The "can't delete" case should have been * detected by _bt_parent_deletion_safe, so complain if we see it now. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); maxoff = PageGetMaxOffsetNumber(page); parent_half_dead = false; parent_one_child = false; if (poffset >= maxoff) { if (poffset == P_FIRSTDATAKEY(opaque)) parent_half_dead = true; else elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"", target, parent, RelationGetRelationName(rel)); } else { /* Will there be exactly one child left in this parent? */ if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff) parent_one_child = true; } /* * If we are deleting the next-to-last page on the target's level, then * the rightsib is a candidate to become the new fast root. (In theory, it * might be possible to push the fast root even further down, but the odds * of doing so are slim, and the locking considerations daunting.) * * We don't support handling this in the case where the parent is becoming * half-dead, even though it theoretically could occur. * * We can safely acquire a lock on the metapage here --- see comments for * _bt_newroot(). */ if (leftsib == P_NONE && !parent_half_dead) { page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo.level == targetlevel); if (P_RIGHTMOST(opaque)) { /* rightsib will be the only one left on the level */ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); /* * The expected case here is btm_fastlevel == targetlevel+1; if * the fastlevel is <= targetlevel, something is wrong, and we * choose to overwrite it to fix it. */ if (metad->btm_fastlevel > targetlevel + 1) { /* no update wanted */ _bt_relbuf(rel, metabuf); metabuf = InvalidBuffer; } } } /* * Check that the parent-page index items we're about to delete/overwrite * contain what we expect. This can fail if the index has become * corrupt for some reason. We want to throw any error before entering * the critical section --- otherwise it'd be a PANIC. * * The test on the target item is just an Assert because _bt_getstackbuf * should have guaranteed it has the expected contents. The test on the * next-child downlink is known to sometimes fail in the field, though. */ page = BufferGetPage(pbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); #ifdef USE_ASSERT_CHECKING itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target); #endif if (!parent_half_dead) { OffsetNumber nextoffset; nextoffset = OffsetNumberNext(poffset); itemid = PageGetItemId(page, nextoffset); itup = (IndexTuple) PageGetItem(page, itemid); if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib) elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"", rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)), parent, RelationGetRelationName(rel)); } /* * Here we begin doing the deletion. */ /* No ereport(ERROR) until changes are logged */ START_CRIT_SECTION(); /* * Update parent. The normal case is a tad tricky because we want to * delete the target's downlink and the *following* key. Easiest way is * to copy the right sibling's downlink over the target downlink, and then * delete the following item. */ if (parent_half_dead) { PageIndexTupleDelete(page, poffset); opaque->btpo_flags |= BTP_HALF_DEAD; } else { OffsetNumber nextoffset; itemid = PageGetItemId(page, poffset); itup = (IndexTuple) PageGetItem(page, itemid); ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY); nextoffset = OffsetNumberNext(poffset); PageIndexTupleDelete(page, nextoffset); } /* * Update siblings' side-links. Note the target page's side-links will * continue to point to the siblings. Asserts here are just rechecking * things we already verified above. */ if (BufferIsValid(lbuf)) { page = BufferGetPage(lbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo_next == target); opaque->btpo_next = rightsib; } page = BufferGetPage(rbuf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); Assert(opaque->btpo_prev == target); opaque->btpo_prev = leftsib; rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)); /* * Mark the page itself deleted. It can be recycled when all current * transactions are gone. */ page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); opaque->btpo_flags &= ~BTP_HALF_DEAD; opaque->btpo_flags |= BTP_DELETED; opaque->btpo.xact = ReadNewTransactionId(); /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) { metad->btm_fastroot = rightsib; metad->btm_fastlevel = targetlevel; MarkBufferDirty(metabuf); } /* Must mark buffers dirty before XLogInsert */ MarkBufferDirty(pbuf); MarkBufferDirty(rbuf); MarkBufferDirty(buf); if (BufferIsValid(lbuf)) MarkBufferDirty(lbuf); /* XLOG stuff */ if (!rel->rd_istemp) { xl_btree_delete_page xlrec; xl_btree_metadata xlmeta; uint8 xlinfo; XLogRecPtr recptr; XLogRecData rdata[5]; XLogRecData *nextrdata; xlrec.target.node = rel->rd_node; ItemPointerSet(&(xlrec.target.tid), parent, poffset); xlrec.deadblk = target; xlrec.leftblk = leftsib; xlrec.rightblk = rightsib; xlrec.btpo_xact = opaque->btpo.xact; rdata[0].data = (char *) &xlrec; rdata[0].len = SizeOfBtreeDeletePage; rdata[0].buffer = InvalidBuffer; rdata[0].next = nextrdata = &(rdata[1]); if (BufferIsValid(metabuf)) { xlmeta.root = metad->btm_root; xlmeta.level = metad->btm_level; xlmeta.fastroot = metad->btm_fastroot; xlmeta.fastlevel = metad->btm_fastlevel; nextrdata->data = (char *) &xlmeta; nextrdata->len = sizeof(xl_btree_metadata); nextrdata->buffer = InvalidBuffer; nextrdata->next = nextrdata + 1; nextrdata++; xlinfo = XLOG_BTREE_DELETE_PAGE_META; } else if (parent_half_dead) xlinfo = XLOG_BTREE_DELETE_PAGE_HALF; else xlinfo = XLOG_BTREE_DELETE_PAGE; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->next = nextrdata + 1; nextrdata->buffer = pbuf; nextrdata->buffer_std = true; nextrdata++; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->buffer = rbuf; nextrdata->buffer_std = true; nextrdata->next = NULL; if (BufferIsValid(lbuf)) { nextrdata->next = nextrdata + 1; nextrdata++; nextrdata->data = NULL; nextrdata->len = 0; nextrdata->buffer = lbuf; nextrdata->buffer_std = true; nextrdata->next = NULL; } recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata); if (BufferIsValid(metabuf)) { PageSetLSN(metapg, recptr); PageSetTLI(metapg, ThisTimeLineID); } page = BufferGetPage(pbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(rbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); page = BufferGetPage(buf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); if (BufferIsValid(lbuf)) { page = BufferGetPage(lbuf); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); } } END_CRIT_SECTION(); /* release metapage; send out relcache inval if metapage changed */ if (BufferIsValid(metabuf)) { CacheInvalidateRelcache(rel); _bt_relbuf(rel, metabuf); } /* can always release leftsib immediately */ if (BufferIsValid(lbuf)) _bt_relbuf(rel, lbuf); /* * If parent became half dead, recurse to delete it. Otherwise, if right * sibling is empty and is now the last child of the parent, recurse to * try to delete it. (These cases cannot apply at the same time, though * the second case might itself recurse to the first.) * * When recursing to parent, we hold the lock on the target page until * done. This delays any insertions into the keyspace that was just * effectively reassigned to the parent's right sibling. If we allowed * that, and there were enough such insertions before we finish deleting * the parent, page splits within that keyspace could lead to inserting * out-of-order keys into the grandparent level. It is thought that that * wouldn't have any serious consequences, but it still seems like a * pretty bad idea. */ if (parent_half_dead) { /* recursive call will release pbuf */ _bt_relbuf(rel, rbuf); result = _bt_pagedel(rel, pbuf, stack->bts_parent) + 1; _bt_relbuf(rel, buf); } else if (parent_one_child && rightsib_empty) { _bt_relbuf(rel, pbuf); _bt_relbuf(rel, buf); /* recursive call will release rbuf */ result = _bt_pagedel(rel, rbuf, stack) + 1; } else { _bt_relbuf(rel, pbuf); _bt_relbuf(rel, buf); _bt_relbuf(rel, rbuf); result = 1; } return result; }
/* * Workhouse routine for doing insertion into a GiST index. Note that * this routine assumes it is invoked in a short-lived memory context, * so it does not bother releasing palloc'd allocations. */ void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate) { ItemId iid; IndexTuple idxtuple; GISTInsertStack firststack; GISTInsertStack *stack; GISTInsertState state; bool xlocked = false; memset(&state, 0, sizeof(GISTInsertState)); state.freespace = freespace; state.r = r; /* Start from the root */ firststack.blkno = GIST_ROOT_BLKNO; firststack.lsn = 0; firststack.parent = NULL; firststack.downlinkoffnum = InvalidOffsetNumber; state.stack = stack = &firststack; /* * Walk down along the path of smallest penalty, updating the parent * pointers with the key we're inserting as we go. If we crash in the * middle, the tree is consistent, although the possible parent updates * were a waste. */ for (;;) { if (XLogRecPtrIsInvalid(stack->lsn)) stack->buffer = ReadBuffer(state.r, stack->blkno); /* * Be optimistic and grab shared lock first. Swap it for an exclusive * lock later if we need to update the page. */ if (!xlocked) { LockBuffer(stack->buffer, GIST_SHARE); gistcheckpage(state.r, stack->buffer); } stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn)); /* * If this page was split but the downlink was never inserted to the * parent because the inserting backend crashed before doing that, fix * that now. */ if (GistFollowRight(stack->page)) { if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; /* someone might've completed the split when we unlocked */ if (!GistFollowRight(stack->page)) continue; } gistfixsplit(&state, giststate); UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (stack->blkno != GIST_ROOT_BLKNO && stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * Concurrent split detected. There's no guarantee that the * downlink for this page is consistent with the tuple we're * inserting anymore, so go back to parent and rechoose the best * child. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } if (!GistPageIsLeaf(stack->page)) { /* * This is an internal page so continue to walk down the tree. * Find the child node that has the minimum insertion penalty. */ BlockNumber childblkno; IndexTuple newtup; GISTInsertStack *item; OffsetNumber downlinkoffnum; downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate); iid = PageGetItemId(stack->page, downlinkoffnum); idxtuple = (IndexTuple) PageGetItem(stack->page, iid); childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); /* * Check that it's not a leftover invalid tuple from pre-9.1 */ if (GistTupleIsInvalid(idxtuple)) ereport(ERROR, (errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(r)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it."))); /* * Check that the key representing the target child node is * consistent with the key we're inserting. Update it if it's not. */ newtup = gistgetadjusted(state.r, idxtuple, itup, giststate); if (newtup) { /* * Swap shared lock for an exclusive one. Beware, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); if (PageGetLSN(stack->page) != stack->lsn) { /* the page was changed while we unlocked it, retry */ continue; } } /* * Update the tuple. * * We still hold the lock after gistinserttuple(), but it * might have to split the page to make the updated tuple fit. * In that case the updated tuple might migrate to the other * half of the split, so we have to go back to the parent and * descend back to the half that's a better fit for the new * tuple. */ if (gistinserttuple(&state, stack, giststate, newtup, downlinkoffnum)) { /* * If this was a root split, the root page continues to be * the parent and the updated tuple went to one of the * child pages, so we just need to retry from the root * page. */ if (stack->blkno != GIST_ROOT_BLKNO) { UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; } continue; } } LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; /* descend to the chosen child */ item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); item->blkno = childblkno; item->parent = stack; item->downlinkoffnum = downlinkoffnum; state.stack = stack = item; } else { /* * Leaf page. Insert the new key. We've already updated all the * parents on the way down, but we might have to split the page if * it doesn't fit. gistinserthere() will take care of that. */ /* * Swap shared lock for an exclusive one. Be careful, the page may * change while we unlock/lock the page... */ if (!xlocked) { LockBuffer(stack->buffer, GIST_UNLOCK); LockBuffer(stack->buffer, GIST_EXCLUSIVE); xlocked = true; stack->page = (Page) BufferGetPage(stack->buffer); stack->lsn = PageGetLSN(stack->page); if (stack->blkno == GIST_ROOT_BLKNO) { /* * the only page that can become inner instead of leaf is * the root page, so for root we should recheck it */ if (!GistPageIsLeaf(stack->page)) { /* * very rare situation: during unlock/lock index with * number of pages = 1 was increased */ LockBuffer(stack->buffer, GIST_UNLOCK); xlocked = false; continue; } /* * we don't need to check root split, because checking * leaf/inner is enough to recognize split for root */ } else if (GistFollowRight(stack->page) || stack->parent->lsn < GistPageGetNSN(stack->page)) { /* * The page was split while we momentarily unlocked the * page. Go back to parent. */ UnlockReleaseBuffer(stack->buffer); xlocked = false; state.stack = stack = stack->parent; continue; } } /* now state.stack->(page, buffer and blkno) points to leaf page */ gistinserttuple(&state, stack, giststate, itup, InvalidOffsetNumber); LockBuffer(stack->buffer, GIST_UNLOCK); /* Release any pins we might still hold before exiting */ for (; stack; stack = stack->parent) ReleaseBuffer(stack->buffer); break; } } }
void heap_desc(StringInfo buf, uint8 xl_info, char *rec) { uint8 info = xl_info & ~XLR_INFO_MASK; info &= XLOG_HEAP_OPMASK; if (info == XLOG_HEAP_INSERT) { xl_heap_insert *xlrec = (xl_heap_insert *) rec; if (xl_info & XLOG_HEAP_INIT_PAGE) appendStringInfoString(buf, "insert(init): "); else appendStringInfoString(buf, "insert: "); out_target(buf, &(xlrec->target)); } else if (info == XLOG_HEAP_DELETE) { xl_heap_delete *xlrec = (xl_heap_delete *) rec; appendStringInfoString(buf, "delete: "); out_target(buf, &(xlrec->target)); appendStringInfoChar(buf, ' '); out_infobits(buf, xlrec->infobits_set); } else if (info == XLOG_HEAP_UPDATE) { xl_heap_update *xlrec = (xl_heap_update *) rec; if (xl_info & XLOG_HEAP_INIT_PAGE) appendStringInfoString(buf, "update(init): "); else appendStringInfoString(buf, "update: "); out_target(buf, &(xlrec->target)); appendStringInfo(buf, " xmax %u ", xlrec->old_xmax); out_infobits(buf, xlrec->old_infobits_set); appendStringInfo(buf, "; new tid %u/%u xmax %u", ItemPointerGetBlockNumber(&(xlrec->newtid)), ItemPointerGetOffsetNumber(&(xlrec->newtid)), xlrec->new_xmax); } else if (info == XLOG_HEAP_HOT_UPDATE) { xl_heap_update *xlrec = (xl_heap_update *) rec; if (xl_info & XLOG_HEAP_INIT_PAGE) /* can this case happen? */ appendStringInfoString(buf, "hot_update(init): "); else appendStringInfoString(buf, "hot_update: "); out_target(buf, &(xlrec->target)); appendStringInfo(buf, " xmax %u ", xlrec->old_xmax); out_infobits(buf, xlrec->old_infobits_set); appendStringInfo(buf, "; new tid %u/%u xmax %u", ItemPointerGetBlockNumber(&(xlrec->newtid)), ItemPointerGetOffsetNumber(&(xlrec->newtid)), xlrec->new_xmax); } else if (info == XLOG_HEAP_NEWPAGE) { xl_heap_newpage *xlrec = (xl_heap_newpage *) rec; appendStringInfo(buf, "newpage: rel %u/%u/%u; fork %u, blk %u", xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode, xlrec->forknum, xlrec->blkno); } else if (info == XLOG_HEAP_LOCK) { xl_heap_lock *xlrec = (xl_heap_lock *) rec; appendStringInfo(buf, "lock %u: ", xlrec->locking_xid); out_target(buf, &(xlrec->target)); appendStringInfoChar(buf, ' '); out_infobits(buf, xlrec->infobits_set); } else if (info == XLOG_HEAP_INPLACE) { xl_heap_inplace *xlrec = (xl_heap_inplace *) rec; appendStringInfoString(buf, "inplace: "); out_target(buf, &(xlrec->target)); } else appendStringInfoString(buf, "UNKNOWN"); }
/* ---------------------------------------------------------------- * FunctionNext * * This is a workhorse for ExecFunctionScan * ---------------------------------------------------------------- */ static TupleTableSlot * FunctionNext(FunctionScanState *node) { TupleTableSlot *slot; EState *estate; ScanDirection direction; Tuplestorestate *tuplestorestate; /* * get information from the estate and scan state */ estate = node->ss.ps.state; direction = estate->es_direction; tuplestorestate = node->tuplestorestate; /* * If first time through, read all tuples from function and put them in a * tuplestore. Subsequent calls just fetch tuples from tuplestore. */ if (tuplestorestate == NULL) { tuplestorestate = ExecMakeTableFunctionResult( node->funcexpr, node->ss.ps.ps_ExprContext, node->tupdesc, PlanStateOperatorMemKB( (PlanState *) node)); node->tuplestorestate = tuplestorestate; /* CDB: Offer extra info for EXPLAIN ANALYZE. */ if (node->ss.ps.instrument) { /* Let the tuplestore share our Instrumentation object. */ tuplestore_set_instrument(tuplestorestate, node->ss.ps.instrument); /* Request a callback at end of query. */ node->ss.ps.cdbexplainfun = ExecFunctionScanExplainEnd; } } /* * Get the next tuple from tuplestore. Return NULL if no more tuples. */ slot = node->ss.ss_ScanTupleSlot; if (tuplestore_gettupleslot(tuplestorestate, ScanDirectionIsForward(direction), slot)) { /* CDB: Label each row with a synthetic ctid for subquery dedup. */ if (node->cdb_want_ctid) { HeapTuple tuple = ExecFetchSlotHeapTuple(slot); /* Increment 48-bit row count */ node->cdb_fake_ctid.ip_posid++; if (node->cdb_fake_ctid.ip_posid == 0) ItemPointerSetBlockNumber(&node->cdb_fake_ctid, 1 + ItemPointerGetBlockNumber(&node->cdb_fake_ctid)); tuple->t_self = node->cdb_fake_ctid; } } if (!TupIsNull(slot)) { Gpmon_M_Incr_Rows_Out(GpmonPktFromFuncScanState(node)); CheckSendPlanStateGpmonPkt(&node->ss.ps); } else if (!node->ss.ps.delayEagerFree) { ExecEagerFreeFunctionScan((FunctionScanState *)(&node->ss.ps)); } return slot; }
/* * A tuple in the heap is being inserted. To keep a brin index up to date, * we need to obtain the relevant index tuple and compare its stored values * with those of the new tuple. If the tuple values are not consistent with * the summary tuple, we need to update the index tuple. * * If the range is not currently summarized (i.e. the revmap returns NULL for * it), there's nothing to do. */ Datum brininsert(PG_FUNCTION_ARGS) { Relation idxRel = (Relation) PG_GETARG_POINTER(0); Datum *values = (Datum *) PG_GETARG_POINTER(1); bool *nulls = (bool *) PG_GETARG_POINTER(2); ItemPointer heaptid = (ItemPointer) PG_GETARG_POINTER(3); /* we ignore the rest of our arguments */ BlockNumber pagesPerRange; BrinDesc *bdesc = NULL; BrinRevmap *revmap; Buffer buf = InvalidBuffer; MemoryContext tupcxt = NULL; MemoryContext oldcxt = NULL; revmap = brinRevmapInitialize(idxRel, &pagesPerRange); for (;;) { bool need_insert = false; OffsetNumber off; BrinTuple *brtup; BrinMemTuple *dtup; BlockNumber heapBlk; int keyno; #ifdef USE_ASSERT_CHECKING BrinTuple *tmptup; BrinMemTuple *tmpdtup; Size tmpsiz; #endif CHECK_FOR_INTERRUPTS(); heapBlk = ItemPointerGetBlockNumber(heaptid); /* normalize the block number to be the first block in the range */ heapBlk = (heapBlk / pagesPerRange) * pagesPerRange; brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL, BUFFER_LOCK_SHARE); /* if range is unsummarized, there's nothing to do */ if (!brtup) break; /* First time through? */ if (bdesc == NULL) { bdesc = brin_build_desc(idxRel); tupcxt = AllocSetContextCreate(CurrentMemoryContext, "brininsert cxt", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); oldcxt = MemoryContextSwitchTo(tupcxt); } dtup = brin_deform_tuple(bdesc, brtup); #ifdef USE_ASSERT_CHECKING { /* * When assertions are enabled, we use this as an opportunity to * test the "union" method, which would otherwise be used very * rarely: first create a placeholder tuple, and addValue the * value we just got into it. Then union the existing index tuple * with the updated placeholder tuple. The tuple resulting from * that union should be identical to the one resulting from the * regular operation (straight addValue) below. * * Here we create the tuple to compare with; the actual comparison * is below. */ tmptup = brin_form_placeholder_tuple(bdesc, heapBlk, &tmpsiz); tmpdtup = brin_deform_tuple(bdesc, tmptup); for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { BrinValues *bval; FmgrInfo *addValue; bval = &tmpdtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); } union_tuples(bdesc, tmpdtup, brtup); tmpdtup->bt_placeholder = dtup->bt_placeholder; tmptup = brin_form_tuple(bdesc, heapBlk, tmpdtup, &tmpsiz); } #endif /* * Compare the key values of the new tuple to the stored index values; * our deformed tuple will get updated if the new tuple doesn't fit * the original range (note this means we can't break out of the loop * early). Make a note of whether this happens, so that we know to * insert the modified tuple later. */ for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) { Datum result; BrinValues *bval; FmgrInfo *addValue; bval = &dtup->bt_columns[keyno]; addValue = index_getprocinfo(idxRel, keyno + 1, BRIN_PROCNUM_ADDVALUE); result = FunctionCall4Coll(addValue, idxRel->rd_indcollation[keyno], PointerGetDatum(bdesc), PointerGetDatum(bval), values[keyno], nulls[keyno]); /* if that returned true, we need to insert the updated tuple */ need_insert |= DatumGetBool(result); } #ifdef USE_ASSERT_CHECKING { /* * Now we can compare the tuple produced by the union function * with the one from plain addValue. */ BrinTuple *cmptup; Size cmpsz; cmptup = brin_form_tuple(bdesc, heapBlk, dtup, &cmpsz); Assert(brin_tuples_equal(tmptup, tmpsiz, cmptup, cmpsz)); } #endif if (!need_insert) { /* * The tuple is consistent with the new values, so there's nothing * to do. */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); } else { Page page = BufferGetPage(buf); ItemId lp = PageGetItemId(page, off); Size origsz; BrinTuple *origtup; Size newsz; BrinTuple *newtup; bool samepage; /* * Make a copy of the old tuple, so that we can compare it after * re-acquiring the lock. */ origsz = ItemIdGetLength(lp); origtup = brin_copy_tuple(brtup, origsz); /* * Before releasing the lock, check if we can attempt a same-page * update. Another process could insert a tuple concurrently in * the same page though, so downstream we must be prepared to cope * if this turns out to not be possible after all. */ newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); samepage = brin_can_do_samepage_update(buf, origsz, newsz); LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* * Try to update the tuple. If this doesn't work for whatever * reason, we need to restart from the top; the revmap might be * pointing at a different tuple for this block now, so we need to * recompute to ensure both our new heap tuple and the other * inserter's are covered by the combined tuple. It might be that * we don't need to update at all. */ if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, buf, off, origtup, origsz, newtup, newsz, samepage)) { /* no luck; start over */ MemoryContextResetAndDeleteChildren(tupcxt); continue; } } /* success! */ break; } brinRevmapTerminate(revmap); if (BufferIsValid(buf)) ReleaseBuffer(buf); if (bdesc != NULL) { brin_free_desc(bdesc); MemoryContextSwitchTo(oldcxt); MemoryContextDelete(tupcxt); } return BoolGetDatum(false); }
/* * _bt_search() -- Search the tree for a particular scankey, * or more precisely for the first leaf page it could be on. * * The passed scankey must be an insertion-type scankey (see nbtree/README), * but it can omit the rightmost column(s) of the index. * * When nextkey is false (the usual case), we are looking for the first * item >= scankey. When nextkey is true, we are looking for the first * item strictly greater than scankey. * * Return value is a stack of parent-page pointers. *bufP is set to the * address of the leaf-page buffer, which is read-locked and pinned. * No locks are held on the parent pages, however! * * NOTE that the returned buffer is read-locked regardless of the access * parameter. However, access = BT_WRITE will allow an empty root page * to be created and returned. When access = BT_READ, an empty index * will result in *bufP being set to InvalidBuffer. */ BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access) { BTStack stack_in = NULL; /* Get the root page to start with */ *bufP = _bt_getroot(rel, access); /* If index is empty and access = BT_READ, no root page is created. */ if (!BufferIsValid(*bufP)) return (BTStack) NULL; /* Loop iterates once per level descended in the tree */ for (;;) { Page page; BTPageOpaque opaque; OffsetNumber offnum; ItemId itemid; IndexTuple itup; BlockNumber blkno; BlockNumber par_blkno; BTStack new_stack; /* * Race -- the page we just grabbed may have split since we read its * pointer in the parent (or metapage). If it has, we may need to * move right to its new sibling. Do that. */ *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISLEAF(opaque)) break; /* * Find the appropriate item on the internal page, and get the child * page that it points to. */ offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey); itemid = PageGetItemId(page, offnum); itup = (IndexTuple) PageGetItem(page, itemid); blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); par_blkno = BufferGetBlockNumber(*bufP); /* * We need to save the location of the index entry we chose in the * parent page on a stack. In case we split the tree, we'll use the * stack to work back up to the parent page. We also save the actual * downlink (TID) to uniquely identify the index entry, in case it * moves right while we're working lower in the tree. See the paper * by Lehman and Yao for how this is detected and handled. (We use the * child link to disambiguate duplicate keys in the index -- Lehman * and Yao disallow duplicate keys.) */ new_stack = (BTStack) palloc(sizeof(BTStackData)); new_stack->bts_blkno = par_blkno; new_stack->bts_offset = offnum; memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData)); new_stack->bts_parent = stack_in; /* drop the read lock on the parent page, acquire one on the child */ *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ); /* okay, all set to move down a level */ stack_in = new_stack; } return stack_in; }
/* * _bt_get_endpoint() -- Find the first or last page on a given tree level * * If the index is empty, we will return InvalidBuffer; any other failure * condition causes ereport(). We will not return a dead page. * * The returned buffer is pinned and read-locked. */ Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost) { Buffer buf; Page page; BTPageOpaque opaque; OffsetNumber offnum; BlockNumber blkno; IndexTuple itup; /* * If we are looking for a leaf page, okay to descend from fast root; * otherwise better descend from true root. (There is no point in being * smarter about intermediate levels.) */ if (level == 0) buf = _bt_getroot(rel, BT_READ); else buf = _bt_gettrueroot(rel); if (!BufferIsValid(buf)) { /* empty index... */ return InvalidBuffer; } page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); for (;;) { /* * If we landed on a deleted page, step right to find a live page * (there must be one). Also, if we want the rightmost page, step * right if needed to get to it (this could happen if the page split * since we obtained a pointer to it). */ while (P_IGNORE(opaque) || (rightmost && !P_RIGHTMOST(opaque))) { blkno = opaque->btpo_next; if (blkno == P_NONE) elog(ERROR, "fell off the end of \"%s\"", RelationGetRelationName(rel)); buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } /* Done? */ if (opaque->btpo.level == level) break; if (opaque->btpo.level < level) elog(ERROR, "btree level %u not found", level); /* Descend to leftmost or rightmost child page */ if (rightmost) offnum = PageGetMaxOffsetNumber(page); else offnum = P_FIRSTDATAKEY(opaque); itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); } return buf; }
/* * Prune specified item pointer or a HOT chain originating at that item. * * If the item is an index-referenced tuple (i.e. not a heap-only tuple), * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really * DEAD, the OldestXmin test is just too coarse to detect it. * * The root line pointer is redirected to the tuple immediately after the * latest DEAD tuple. If all tuples in the chain are DEAD, the root line * pointer is marked LP_DEAD. (This includes the case of a DEAD simple * tuple, which we treat as a chain of length 1.) * * OldestXmin is the cutoff XID used to identify dead tuples. * * We don't actually change the page here, except perhaps for hint-bit updates * caused by HeapTupleSatisfiesVacuum. We just add entries to the arrays in * prstate showing the changes to be made. Items to be redirected are added * to the redirected[] array (two entries per redirection); items to be set to * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED * state are added to nowunused[]. * * If redirect_move is true, we intend to get rid of redirecting line pointers, * not just make redirection entries. * * Returns the number of tuples (to be) deleted from the page. */ static int heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, TransactionId OldestXmin, PruneState *prstate, bool redirect_move) { int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); TransactionId priorXmax = InvalidTransactionId; ItemId rootlp; HeapTupleHeader htup; OffsetNumber latestdead = InvalidOffsetNumber, redirect_target = InvalidOffsetNumber, maxoff = PageGetMaxOffsetNumber(dp), offnum; OffsetNumber chainitems[MaxHeapTuplesPerPage]; int nchain = 0, i; rootlp = PageGetItemId(dp, rootoffnum); /* * If it's a heap-only tuple, then it is not the start of a HOT chain. */ if (ItemIdIsNormal(rootlp)) { htup = (HeapTupleHeader) PageGetItem(dp, rootlp); if (HeapTupleHeaderIsHeapOnly(htup)) { /* * If the tuple is DEAD and doesn't chain to anything else, mark * it unused immediately. (If it does chain, we can only remove * it as part of pruning its chain.) * * We need this primarily to handle aborted HOT updates, that is, * XMIN_INVALID heap-only tuples. Those might not be linked to by * any chain, since the parent tuple might be re-updated before * any pruning occurs. So we have to be able to reap them * separately from chain-pruning. (Note that * HeapTupleHeaderIsHotUpdated will never return true for an * XMIN_INVALID tuple, so this code will work even when there were * sequential updates within the aborted transaction.) * * Note that we might first arrive at a dead heap-only tuple * either here or while following a chain below. Whichever path * gets there first will mark the tuple unused. */ if (HeapTupleSatisfiesVacuum(relation, htup, OldestXmin, buffer) == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); ndeleted++; } /* Nothing more to do */ return ndeleted; } } /* Start from the root tuple */ offnum = rootoffnum; /* while not end of the chain */ for (;;) { ItemId lp; bool tupdead, recent_dead; /* Some sanity checks */ if (offnum < FirstOffsetNumber || offnum > maxoff) break; /* If item is already processed, stop --- it must not be same chain */ if (prstate->marked[offnum]) break; lp = PageGetItemId(dp, offnum); /* Unused item obviously isn't part of the chain */ if (!ItemIdIsUsed(lp)) break; /* * If we are looking at the redirected root line pointer, jump to the * first normal tuple in the chain. If we find a redirect somewhere * else, stop --- it must not be same chain. */ if (ItemIdIsRedirected(lp)) { if (nchain > 0) break; /* not at start of chain */ chainitems[nchain++] = offnum; offnum = ItemIdGetRedirect(rootlp); continue; } /* * Likewise, a dead item pointer can't be part of the chain. (We * already eliminated the case of dead root tuple outside this * function.) */ if (ItemIdIsDead(lp)) break; Assert(ItemIdIsNormal(lp)); htup = (HeapTupleHeader) PageGetItem(dp, lp); /* * Check the tuple XMIN against prior XMAX, if any */ if (TransactionIdIsValid(priorXmax) && !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax)) break; /* * OK, this tuple is indeed a member of the chain. */ chainitems[nchain++] = offnum; /* * Check tuple's visibility status. */ tupdead = recent_dead = false; switch (HeapTupleSatisfiesVacuum(relation, htup, OldestXmin, buffer)) { case HEAPTUPLE_DEAD: tupdead = true; break; case HEAPTUPLE_RECENTLY_DEAD: recent_dead = true; /* * This tuple may soon become DEAD. Update the hint field so * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, HeapTupleHeaderGetXmax(htup)); break; case HEAPTUPLE_DELETE_IN_PROGRESS: /* * This tuple may soon become DEAD. Update the hint field so * that the page is reconsidered for pruning in future. */ heap_prune_record_prunable(prstate, HeapTupleHeaderGetXmax(htup)); break; case HEAPTUPLE_LIVE: case HEAPTUPLE_INSERT_IN_PROGRESS: /* * If we wanted to optimize for aborts, we might consider * marking the page prunable when we see INSERT_IN_PROGRESS. * But we don't. See related decisions about when to mark the * page prunable in heapam.c. */ break; default: elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); break; } /* * Remember the last DEAD tuple seen. We will advance past * RECENTLY_DEAD tuples just in case there's a DEAD one after them; * but we can't advance past anything else. (XXX is it really worth * continuing to scan beyond RECENTLY_DEAD? The case where we will * find another DEAD tuple is a fairly unusual corner case.) */ if (tupdead) latestdead = offnum; else if (!recent_dead) break; /* * If the tuple is not HOT-updated, then we are at the end of this * HOT-update chain. */ if (!HeapTupleHeaderIsHotUpdated(htup)) break; /* * Advance to next chain member. */ Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetXmax(htup); } /* * If we found a DEAD tuple in the chain, adjust the HOT chain so that all * the DEAD tuples at the start of the chain are removed and the root line * pointer is appropriately redirected. */ if (OffsetNumberIsValid(latestdead)) { /* * Mark as unused each intermediate item that we are able to remove * from the chain. * * When the previous item is the last dead tuple seen, we are at the * right candidate for redirection. */ for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) { heap_prune_record_unused(prstate, chainitems[i]); ndeleted++; } /* * If the root entry had been a normal tuple, we are deleting it, so * count it in the result. But changing a redirect (even to DEAD * state) doesn't count. */ if (ItemIdIsNormal(rootlp)) ndeleted++; /* * If the DEAD tuple is at the end of the chain, the entire chain is * dead and the root line pointer can be marked dead. Otherwise just * redirect the root to the correct chain member. */ if (i >= nchain) heap_prune_record_dead(prstate, rootoffnum); else { heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); /* If the redirection will be a move, need more processing */ if (redirect_move) redirect_target = chainitems[i]; } } else if (nchain < 2 && ItemIdIsRedirected(rootlp)) { /* * We found a redirect item that doesn't point to a valid follow-on * item. This can happen if the loop in heap_page_prune caused us to * visit the dead successor of a redirect item before visiting the * redirect item. We can clean up by setting the redirect item to * DEAD state. */ heap_prune_record_dead(prstate, rootoffnum); } else if (redirect_move && ItemIdIsRedirected(rootlp)) { /* * If we desire to eliminate LP_REDIRECT items by moving tuples, * make a redirection entry for each redirected root item; this * will cause heap_page_prune_execute to actually do the move. * (We get here only when there are no DEAD tuples in the chain; * otherwise the redirection entry was made above.) */ heap_prune_record_redirect(prstate, rootoffnum, chainitems[1]); redirect_target = chainitems[1]; } /* * If we are going to implement a redirect by moving tuples, we have * to issue a cache invalidation against the redirection target tuple, * because its CTID will be effectively changed by the move. Note that * CacheInvalidateHeapTuple only queues the request, it doesn't send it; * if we fail before reaching EndNonTransactionalInvalidation, nothing * happens and no harm is done. */ if (OffsetNumberIsValid(redirect_target)) { ItemId firstlp = PageGetItemId(dp, redirect_target); HeapTupleData firsttup; Assert(ItemIdIsNormal(firstlp)); /* Set up firsttup to reference the tuple at its existing CTID */ firsttup.t_data = (HeapTupleHeader) PageGetItem(dp, firstlp); firsttup.t_len = ItemIdGetLength(firstlp); ItemPointerSet(&firsttup.t_self, BufferGetBlockNumber(buffer), redirect_target); CacheInvalidateHeapTuple(relation, &firsttup); } return ndeleted; }
/* * Traverse the tree to find path from root page to specified "child" block. * * returns a new insertion stack, starting from the parent of "child", up * to the root. *downlinkoffnum is set to the offset of the downlink in the * direct parent of child. * * To prevent deadlocks, this should lock only one page at a time. */ static GISTInsertStack * gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum) { Page page; Buffer buffer; OffsetNumber i, maxoff; ItemId iid; IndexTuple idxtuple; List *fifo; GISTInsertStack *top, *ptr; BlockNumber blkno; top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); top->blkno = GIST_ROOT_BLKNO; top->downlinkoffnum = InvalidOffsetNumber; fifo = list_make1(top); while (fifo != NIL) { /* Get next page to visit */ top = linitial(fifo); fifo = list_delete_first(fifo); buffer = ReadBuffer(r, top->blkno); LockBuffer(buffer, GIST_SHARE); gistcheckpage(r, buffer); page = (Page) BufferGetPage(buffer); if (GistPageIsLeaf(page)) { /* * Because we scan the index top-down, all the rest of the pages * in the queue must be leaf pages as well. */ UnlockReleaseBuffer(buffer); break; } top->lsn = PageGetLSN(page); /* * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a * downlink. This should not normally happen.. */ if (GistFollowRight(page)) elog(ERROR, "concurrent GiST page split was incomplete"); if (top->parent && top->parent->lsn < GistPageGetNSN(page) && GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ ) { /* * Page was split while we looked elsewhere. We didn't see the * downlink to the right page when we scanned the parent, so add * it to the queue now. * * Put the right page ahead of the queue, so that we visit it * next. That's important, because if this is the lowest internal * level, just above leaves, we might already have queued up some * leaf pages, and we assume that there can't be any non-leaf * pages behind leaf pages. */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = GistPageGetOpaque(page)->rightlink; ptr->downlinkoffnum = InvalidOffsetNumber; ptr->parent = top->parent; fifo = lcons(ptr, fifo); } maxoff = PageGetMaxOffsetNumber(page); for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) { iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); if (blkno == child) { /* Found it! */ UnlockReleaseBuffer(buffer); *downlinkoffnum = i; return top; } else { /* Append this child to the list of pages to visit later */ ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack)); ptr->blkno = blkno; ptr->downlinkoffnum = i; ptr->parent = top; fifo = lappend(fifo, ptr); } } UnlockReleaseBuffer(buffer); } elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u", RelationGetRelationName(r), child); return NULL; /* keep compiler quiet */ }
/* * Compute the list of TIDs to be visited, by evaluating the expressions * for them. * * (The result is actually an array, not a list.) */ static void TidListCreate(TidScanState *tidstate) { List *evalList = tidstate->tss_tidquals; ExprContext *econtext = tidstate->ss.ps.ps_ExprContext; BlockNumber nblocks; ItemPointerData *tidList; int numAllocTids; int numTids; ListCell *l; /* * We silently discard any TIDs that are out of range at the time of scan * start. (Since we hold at least AccessShareLock on the table, it won't * be possible for someone to truncate away the blocks we intend to * visit.) */ nblocks = RelationGetNumberOfBlocks(tidstate->ss.ss_currentRelation); /* * We initialize the array with enough slots for the case that all quals * are simple OpExprs or CurrentOfExprs. If there are any * ScalarArrayOpExprs, we may have to enlarge the array. */ numAllocTids = list_length(evalList); tidList = (ItemPointerData *) palloc(numAllocTids * sizeof(ItemPointerData)); numTids = 0; tidstate->tss_isCurrentOf = false; foreach(l, evalList) { ExprState *exstate = (ExprState *) lfirst(l); Expr *expr = exstate->expr; ItemPointer itemptr; bool isNull; if (is_opclause(expr)) { FuncExprState *fexstate = (FuncExprState *) exstate; Node *arg1; Node *arg2; arg1 = get_leftop(expr); arg2 = get_rightop(expr); if (IsCTIDVar(arg1)) exstate = (ExprState *) lsecond(fexstate->args); else if (IsCTIDVar(arg2)) exstate = (ExprState *) linitial(fexstate->args); else elog(ERROR, "could not identify CTID variable"); itemptr = (ItemPointer) DatumGetPointer(ExecEvalExprSwitchContext(exstate, econtext, &isNull, NULL)); if (!isNull && ItemPointerIsValid(itemptr) && ItemPointerGetBlockNumber(itemptr) < nblocks) { if (numTids >= numAllocTids) { numAllocTids *= 2; tidList = (ItemPointerData *) repalloc(tidList, numAllocTids * sizeof(ItemPointerData)); } tidList[numTids++] = *itemptr; } } else if (expr && IsA(expr, ScalarArrayOpExpr)) { ScalarArrayOpExprState *saexstate = (ScalarArrayOpExprState *) exstate; Datum arraydatum; ArrayType *itemarray; Datum *ipdatums; bool *ipnulls; int ndatums; int i; exstate = (ExprState *) lsecond(saexstate->fxprstate.args); arraydatum = ExecEvalExprSwitchContext(exstate, econtext, &isNull, NULL); if (isNull) continue; itemarray = DatumGetArrayTypeP(arraydatum); deconstruct_array(itemarray, TIDOID, SizeOfIptrData, false, 's', &ipdatums, &ipnulls, &ndatums); if (numTids + ndatums > numAllocTids) { numAllocTids = numTids + ndatums; tidList = (ItemPointerData *) repalloc(tidList, numAllocTids * sizeof(ItemPointerData)); } for (i = 0; i < ndatums; i++) { if (!ipnulls[i]) { itemptr = (ItemPointer) DatumGetPointer(ipdatums[i]); if (ItemPointerIsValid(itemptr) && ItemPointerGetBlockNumber(itemptr) < nblocks) tidList[numTids++] = *itemptr; } } pfree(ipdatums); pfree(ipnulls); } else if (expr && IsA(expr, CurrentOfExpr)) { CurrentOfExpr *cexpr = (CurrentOfExpr *) expr; ItemPointerData cursor_tid; if (execCurrentOf(cexpr, econtext, RelationGetRelid(tidstate->ss.ss_currentRelation), &cursor_tid)) { if (numTids >= numAllocTids) { numAllocTids *= 2; tidList = (ItemPointerData *) repalloc(tidList, numAllocTids * sizeof(ItemPointerData)); } tidList[numTids++] = cursor_tid; tidstate->tss_isCurrentOf = true; } } else elog(ERROR, "could not identify CTID expression"); }
/* * Get the latestRemovedXid from the heap pages pointed at by the index * tuples being deleted. See also btree_xlog_delete_get_latestRemovedXid, * on which this function is based. */ static TransactionId gistRedoDeleteRecordGetLatestRemovedXid(XLogReaderState *record) { gistxlogDelete *xlrec = (gistxlogDelete *) XLogRecGetData(record); OffsetNumber *todelete; Buffer ibuffer, hbuffer; Page ipage, hpage; RelFileNode rnode; BlockNumber blkno; ItemId iitemid, hitemid; IndexTuple itup; HeapTupleHeader htuphdr; BlockNumber hblkno; OffsetNumber hoffnum; TransactionId latestRemovedXid = InvalidTransactionId; int i; /* * If there's nothing running on the standby we don't need to derive a * full latestRemovedXid value, so use a fast path out of here. This * returns InvalidTransactionId, and so will conflict with all HS * transactions; but since we just worked out that that's zero people, * it's OK. * * XXX There is a race condition here, which is that a new backend might * start just after we look. If so, it cannot need to conflict, but this * coding will result in throwing a conflict anyway. */ if (CountDBBackends(InvalidOid) == 0) return latestRemovedXid; /* * In what follows, we have to examine the previous state of the index * page, as well as the heap page(s) it points to. This is only valid if * WAL replay has reached a consistent database state; which means that * the preceding check is not just an optimization, but is *necessary*. We * won't have let in any user sessions before we reach consistency. */ if (!reachedConsistency) elog(PANIC, "gistRedoDeleteRecordGetLatestRemovedXid: cannot operate with inconsistent data"); /* * Get index page. If the DB is consistent, this should not fail, nor * should any of the heap page fetches below. If one does, we return * InvalidTransactionId to cancel all HS transactions. That's probably * overkill, but it's safe, and certainly better than panicking here. */ XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); ibuffer = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, RBM_NORMAL); if (!BufferIsValid(ibuffer)) return InvalidTransactionId; LockBuffer(ibuffer, BUFFER_LOCK_EXCLUSIVE); ipage = (Page) BufferGetPage(ibuffer); /* * Loop through the deleted index items to obtain the TransactionId from * the heap items they point to. */ todelete = (OffsetNumber *) ((char *) xlrec + SizeOfGistxlogDelete); for (i = 0; i < xlrec->ntodelete; i++) { /* * Identify the index tuple about to be deleted */ iitemid = PageGetItemId(ipage, todelete[i]); itup = (IndexTuple) PageGetItem(ipage, iitemid); /* * Locate the heap page that the index tuple points at */ hblkno = ItemPointerGetBlockNumber(&(itup->t_tid)); hbuffer = XLogReadBufferExtended(xlrec->hnode, MAIN_FORKNUM, hblkno, RBM_NORMAL); if (!BufferIsValid(hbuffer)) { UnlockReleaseBuffer(ibuffer); return InvalidTransactionId; } LockBuffer(hbuffer, BUFFER_LOCK_SHARE); hpage = (Page) BufferGetPage(hbuffer); /* * Look up the heap tuple header that the index tuple points at by * using the heap node supplied with the xlrec. We can't use * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer. * Note that we are not looking at tuple data here, just headers. */ hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid)); hitemid = PageGetItemId(hpage, hoffnum); /* * Follow any redirections until we find something useful. */ while (ItemIdIsRedirected(hitemid)) { hoffnum = ItemIdGetRedirect(hitemid); hitemid = PageGetItemId(hpage, hoffnum); CHECK_FOR_INTERRUPTS(); } /* * If the heap item has storage, then read the header and use that to * set latestRemovedXid. * * Some LP_DEAD items may not be accessible, so we ignore them. */ if (ItemIdHasStorage(hitemid)) { htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid); HeapTupleHeaderAdvanceLatestRemovedXid(htuphdr, &latestRemovedXid); } else if (ItemIdIsDead(hitemid)) { /* * Conjecture: if hitemid is dead then it had xids before the xids * marked on LP_NORMAL items. So we just ignore this item and move * onto the next, for the purposes of calculating * latestRemovedxids. */ } else Assert(!ItemIdIsUsed(hitemid)); UnlockReleaseBuffer(hbuffer); } UnlockReleaseBuffer(ibuffer); /* * If all heap tuples were LP_DEAD then we will be returning * InvalidTransactionId here, which avoids conflicts. This matches * existing logic which assumes that LP_DEAD tuples must already be older * than the latestRemovedXid on the cleanup record that set them as * LP_DEAD, hence must already have generated a conflict. */ return latestRemovedXid; }