/*
 * systable_recheck_tuple --- recheck visibility of most-recently-fetched tuple
 *
 * This is useful to test whether an object was deleted while we waited to
 * acquire lock on it.
 *
 * Note: we don't actually *need* the tuple to be passed in, but it's a
 * good crosscheck that the caller is interested in the right tuple.
 */
bool
systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup)
{
    bool        result;

    if (sysscan->irel)
    {
        IndexScanDesc scan = sysscan->iscan;

        Assert(tup == &scan->xs_ctup);
        Assert(BufferIsValid(scan->xs_cbuf));
        /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */
        LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
        result = HeapTupleSatisfiesVisibility(tup, scan->xs_snapshot,
                                              scan->xs_cbuf);
        LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
    }
    else
    {
        HeapScanDesc scan = sysscan->scan;

        Assert(tup == &scan->rs_ctup);
        Assert(BufferIsValid(scan->rs_cbuf));
        /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
        result = HeapTupleSatisfiesVisibility(tup, scan->rs_snapshot,
                                              scan->rs_cbuf);
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
    }
    return result;
}
/*
 * systable_recheck_tuple --- recheck visibility of most-recently-fetched tuple
 *
 * In particular, determine if this tuple would be visible to a catalog scan
 * that started now.  We don't handle the case of a non-MVCC scan snapshot,
 * because no caller needs that yet.
 *
 * This is useful to test whether an object was deleted while we waited to
 * acquire lock on it.
 *
 * Note: we don't actually *need* the tuple to be passed in, but it's a
 * good crosscheck that the caller is interested in the right tuple.
 */
bool
systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup)
{
    Snapshot    freshsnap;
    bool        result;

    /*
     * Trust that LockBuffer() and HeapTupleSatisfiesMVCC() do not themselves
     * acquire snapshots, so we need not register the snapshot.  Those
     * facilities are too low-level to have any business scanning tables.
     */
    freshsnap = GetCatalogSnapshot(RelationGetRelid(sysscan->heap_rel));

    if (sysscan->irel)
    {
        IndexScanDesc scan = sysscan->iscan;

        Assert(IsMVCCSnapshot(scan->xs_snapshot));
        Assert(tup == &scan->xs_ctup);
        Assert(BufferIsValid(scan->xs_cbuf));
        /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */
        LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);
        result = HeapTupleSatisfiesVisibility(tup, freshsnap, scan->xs_cbuf);
        LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);
    }
    else
    {
        HeapScanDesc scan = sysscan->scan;

        Assert(IsMVCCSnapshot(scan->rs_snapshot));
        Assert(tup == &scan->rs_ctup);
        Assert(BufferIsValid(scan->rs_cbuf));
        /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
        result = HeapTupleSatisfiesVisibility(tup, freshsnap, scan->rs_cbuf);
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
    }
    return result;
}
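/*
 * A hedged usage sketch (not part of the sources above): the lock-then-
 * recheck pattern the header comment describes.  After a catalog scan
 * returns a candidate row, the caller sleeps on the lock for the object
 * the row represents, then rechecks the row; a false result means the
 * object was dropped while we waited.  The helper name and the
 * AccessExclusiveLock choice are illustrative assumptions, not taken
 * from the code above.
 */
static bool
lock_object_and_recheck(SysScanDesc sysscan, Relation catrel,
                        HeapTuple tup, Oid objid)
{
    /* may sleep if another backend holds a conflicting lock */
    LockDatabaseObject(RelationGetRelid(catrel), objid, 0,
                       AccessExclusiveLock);

    if (!systable_recheck_tuple(sysscan, tup))
    {
        /* object vanished while we slept; the lock is now useless */
        UnlockDatabaseObject(RelationGetRelid(catrel), objid, 0,
                             AccessExclusiveLock);
        return false;
    }
    return true;
}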
/*
 * pgstat_heap -- returns live/dead tuples info in a heap
 */
static Datum
pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
{
    HeapScanDesc scan;
    HeapTuple   tuple;
    BlockNumber nblocks;
    BlockNumber block = 0;      /* next block to count free space in */
    BlockNumber tupblock;
    Buffer      buffer;
    pgstattuple_type stat = {0};

    /* Disable syncscan because we assume we scan from block zero upwards */
    scan = heap_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false);

    nblocks = scan->rs_nblocks; /* # blocks to be scanned */

    /* scan the relation */
    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        CHECK_FOR_INTERRUPTS();

        /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);

        if (HeapTupleSatisfiesVisibility(tuple, SnapshotNow, scan->rs_cbuf))
        {
            stat.tuple_len += tuple->t_len;
            stat.tuple_count++;
        }
        else
        {
            stat.dead_tuple_len += tuple->t_len;
            stat.dead_tuple_count++;
        }

        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

        /*
         * To avoid physically reading the table twice, try to do the
         * free-space scan in parallel with the heap scan.  However,
         * heap_getnext may find no tuples on a given page, so we cannot
         * simply examine the pages returned by the heap scan.
         */
        tupblock = BlockIdGetBlockNumber(&tuple->t_self.ip_blkid);

        while (block <= tupblock)
        {
            CHECK_FOR_INTERRUPTS();

            buffer = ReadBuffer(rel, block);
            LockBuffer(buffer, BUFFER_LOCK_SHARE);
            stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer));
            UnlockReleaseBuffer(buffer);
            block++;
        }
    }
    heap_endscan(scan);

    while (block < nblocks)
    {
        CHECK_FOR_INTERRUPTS();

        buffer = ReadBuffer(rel, block);
        LockBuffer(buffer, BUFFER_LOCK_SHARE);
        stat.free_space += PageGetHeapFreeSpace((Page) BufferGetPage(buffer));
        UnlockReleaseBuffer(buffer);
        block++;
    }

    relation_close(rel, AccessShareLock);

    stat.table_len = (uint64) nblocks * BLCKSZ;

    return build_pgstattuple_type(&stat, fcinfo);
}
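/*
 * A minimal sketch (our helper name; assumes the same era's heapam API and
 * the same SnapshotAny + SnapshotNow combination as pgstat_heap above)
 * isolating the core counting pattern: scan every tuple, dead or alive, and
 * classify each one under a shared buffer content lock.
 */
static void
count_live_dead(Relation rel, uint64 *live, uint64 *dead)
{
    HeapScanDesc scan = heap_beginscan(rel, SnapshotAny, 0, NULL);
    HeapTuple   tuple;

    *live = *dead = 0;
    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* must hold a buffer lock to call HeapTupleSatisfiesVisibility */
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
        if (HeapTupleSatisfiesVisibility(tuple, SnapshotNow, scan->rs_cbuf))
            (*live)++;
        else
            (*dead)++;
        LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
    }
    heap_endscan(scan);
}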
/*
 * bitgetpage - subroutine for BitmapHeapNext()
 *
 * This routine reads and pins the specified page of the relation, then
 * builds an array indicating which tuples on the page are both potentially
 * interesting according to the bitmap, and visible according to the snapshot.
 */
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
    BlockNumber page = tbmres->blockno;
    Buffer      buffer;
    Snapshot    snapshot;
    int         ntup;

    /*
     * Acquire pin on the target heap page, trading in any pin we held before.
     */
    Assert(page < scan->rs_nblocks);
    scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
                                         scan->rs_rd,
                                         page);
    buffer = scan->rs_cbuf;
    snapshot = scan->rs_snapshot;

    ntup = 0;

    /*
     * Prune and repair fragmentation for the whole page, if possible.
     */
    Assert(TransactionIdIsValid(RecentGlobalXmin));
    heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);

    /*
     * We must hold share lock on the buffer content while examining tuple
     * visibility.  Afterwards, however, the tuples we have found to be
     * visible are guaranteed good as long as we hold the buffer pin.
     */
    LockBuffer(buffer, BUFFER_LOCK_SHARE);

    /*
     * We need two separate strategies for lossy and non-lossy cases.
     */
    if (tbmres->ntuples >= 0)
    {
        /*
         * Bitmap is non-lossy, so we just look through the offsets listed in
         * tbmres; but we have to follow any HOT chain starting at each such
         * offset.
         */
        int         curslot;

        for (curslot = 0; curslot < tbmres->ntuples; curslot++)
        {
            OffsetNumber offnum = tbmres->offsets[curslot];
            ItemPointerData tid;

            ItemPointerSet(&tid, page, offnum);
            if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
                                       NULL))
                scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
        }
    }
    else
    {
        /*
         * Bitmap is lossy, so we must examine each item pointer on the page.
         * But we can ignore HOT chains, since we'll check each tuple anyway.
         */
        Page        dp = (Page) BufferGetPage(buffer);
        OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
        OffsetNumber offnum;

        for (offnum = FirstOffsetNumber; offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      lp;
            HeapTupleData loctup;
            bool        valid;

            lp = PageGetItemId(dp, offnum);
            if (!ItemIdIsNormal(lp))
                continue;
            loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
            loctup.t_len = ItemIdGetLength(lp);
            loctup.t_tableOid = scan->rs_rd->rd_id;
            ItemPointerSet(&loctup.t_self, page, offnum);
            valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
            if (valid)
            {
                scan->rs_vistuples[ntup++] = offnum;
                PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
            }
            CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
                                            buffer, snapshot);
        }
    }

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

    Assert(ntup <= MaxHeapTuplesPerPage);
    scan->rs_ntuples = ntup;
}
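/*
 * A hedged sketch of the primitive the non-lossy branch above leans on:
 * test one bitmap-supplied TID for a visible HOT-chain member.  Assumes
 * the five-argument heap_hot_search_buffer() used above and a caller that
 * has already pinned (but not locked) the right page; the helper name is
 * ours, not from the source.
 */
static bool
tid_has_visible_member(Relation rel, Buffer buf,
                       BlockNumber blkno, OffsetNumber offnum,
                       Snapshot snapshot)
{
    ItemPointerData tid;
    bool        found;

    ItemPointerSet(&tid, blkno, offnum);
    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* follows the HOT chain; on success, tid points at the visible member */
    found = heap_hot_search_buffer(&tid, rel, buf, snapshot, NULL);
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    return found;
}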
/* ----------------
 * index_getnext - get the next heap tuple from a scan
 *
 * The result is the next heap tuple satisfying the scan keys and the
 * snapshot, or NULL if no more matching tuples exist.  On success,
 * the buffer containing the heap tuple is pinned (the pin will be dropped
 * at the next index_getnext or index_endscan).
 *
 * Note: caller must check scan->xs_recheck, and perform rechecking of the
 * scan keys if required.  We do not do that here because we don't have
 * enough information to do it efficiently in the general case.
 * ----------------
 */
HeapTuple
index_getnext(IndexScanDesc scan, ScanDirection direction)
{
    HeapTuple   heapTuple = &scan->xs_ctup;
    ItemPointer tid = &heapTuple->t_self;
    FmgrInfo   *procedure;

    SCAN_CHECKS;
    GET_SCAN_PROCEDURE(amgettuple);

    Assert(TransactionIdIsValid(RecentGlobalXmin));

    /*
     * We always reset xs_hot_dead; if we are here then either we are just
     * starting the scan, or we previously returned a visible tuple, and in
     * either case it's inappropriate to kill the prior index entry.
     */
    scan->xs_hot_dead = false;

    for (;;)
    {
        OffsetNumber offnum;
        bool        at_chain_start;
        Page        dp;

        if (scan->xs_next_hot != InvalidOffsetNumber)
        {
            /*
             * We are resuming scan of a HOT chain after having returned an
             * earlier member.  Must still hold pin on current heap page.
             */
            Assert(BufferIsValid(scan->xs_cbuf));
            Assert(ItemPointerGetBlockNumber(tid) ==
                   BufferGetBlockNumber(scan->xs_cbuf));
            Assert(TransactionIdIsValid(scan->xs_prev_xmax));
            offnum = scan->xs_next_hot;
            at_chain_start = false;
            scan->xs_next_hot = InvalidOffsetNumber;
        }
        else
        {
            bool        found;
            Buffer      prev_buf;

            /*
             * If we scanned a whole HOT chain and found only dead tuples,
             * tell index AM to kill its entry for that TID.  We do not do
             * this when in recovery because it may violate MVCC to do so;
             * see comments in RelationGetIndexScan().
             */
            if (!scan->xactStartedInRecovery)
                scan->kill_prior_tuple = scan->xs_hot_dead;

            /*
             * The AM's gettuple proc finds the next index entry matching the
             * scan keys, and puts the TID in xs_ctup.t_self (ie, *tid).  It
             * should also set scan->xs_recheck, though we pay no attention
             * to that here.
             */
            found = DatumGetBool(FunctionCall2(procedure,
                                               PointerGetDatum(scan),
                                               Int32GetDatum(direction)));

            /* Reset kill flag immediately for safety */
            scan->kill_prior_tuple = false;

            /* If we're out of index entries, break out of outer loop */
            if (!found)
                break;

            pgstat_count_index_tuples(scan->indexRelation, 1);

            /* Switch to correct buffer if we don't have it already */
            prev_buf = scan->xs_cbuf;
            scan->xs_cbuf = ReleaseAndReadBuffer(scan->xs_cbuf,
                                                 scan->heapRelation,
                                                 ItemPointerGetBlockNumber(tid));

            /*
             * Prune page, but only if we weren't already on this page
             */
            if (prev_buf != scan->xs_cbuf)
                heap_page_prune_opt(scan->heapRelation, scan->xs_cbuf,
                                    RecentGlobalXmin);

            /* Prepare to scan HOT chain starting at index-referenced offnum */
            offnum = ItemPointerGetOffsetNumber(tid);
            at_chain_start = true;

            /* We don't know what the first tuple's xmin should be */
            scan->xs_prev_xmax = InvalidTransactionId;

            /* Initialize flag to detect if all entries are dead */
            scan->xs_hot_dead = true;
        }

        /* Obtain share-lock on the buffer so we can examine visibility */
        LockBuffer(scan->xs_cbuf, BUFFER_LOCK_SHARE);

        dp = (Page) BufferGetPage(scan->xs_cbuf);

        /* Scan through possible multiple members of HOT-chain */
        for (;;)
        {
            ItemId      lp;
            ItemPointer ctid;
            bool        valid;

            /* check for bogus TID */
            if (offnum < FirstOffsetNumber ||
                offnum > PageGetMaxOffsetNumber(dp))
                break;

            lp = PageGetItemId(dp, offnum);

            /* check for unused, dead, or redirected items */
            if (!ItemIdIsNormal(lp))
            {
                /* We should only see a redirect at start of chain */
                if (ItemIdIsRedirected(lp) && at_chain_start)
                {
                    /* Follow the redirect */
                    offnum = ItemIdGetRedirect(lp);
                    at_chain_start = false;
                    continue;
                }
                /* else must be end of chain */
                break;
            }

            /*
             * We must initialize all of *heapTuple (ie, scan->xs_ctup) since
             * it is returned to the executor on success.
             */
            heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
            heapTuple->t_len = ItemIdGetLength(lp);
            ItemPointerSetOffsetNumber(tid, offnum);
            heapTuple->t_tableOid = RelationGetRelid(scan->heapRelation);
            ctid = &heapTuple->t_data->t_ctid;

            /*
             * Shouldn't see a HEAP_ONLY tuple at chain start.  (This test
             * should be unnecessary, since the chain root can't be removed
             * while we have pin on the index entry, but let's make it
             * anyway.)
             */
            if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
                break;

            /*
             * The xmin should match the previous xmax value, else chain is
             * broken.  (Note: this test is not optional because it protects
             * us against the case where the prior chain member's xmax
             * aborted since we looked at it.)
             */
            if (TransactionIdIsValid(scan->xs_prev_xmax) &&
                !TransactionIdEquals(scan->xs_prev_xmax,
                                     HeapTupleHeaderGetXmin(heapTuple->t_data)))
                break;

            /* If it's visible per the snapshot, we must return it */
            valid = HeapTupleSatisfiesVisibility(heapTuple, scan->xs_snapshot,
                                                 scan->xs_cbuf);

            CheckForSerializableConflictOut(valid, scan->heapRelation,
                                            heapTuple, scan->xs_cbuf);

            if (valid)
            {
                /*
                 * If the snapshot is MVCC, we know that it could accept at
                 * most one member of the HOT chain, so we can skip examining
                 * any more members.  Otherwise, check for continuation of
                 * the HOT-chain, and set state for next time.
                 */
                if (IsMVCCSnapshot(scan->xs_snapshot)
                    && !IsolationIsSerializable())
                    scan->xs_next_hot = InvalidOffsetNumber;
                else if (HeapTupleIsHotUpdated(heapTuple))
                {
                    Assert(ItemPointerGetBlockNumber(ctid) ==
                           ItemPointerGetBlockNumber(tid));
                    scan->xs_next_hot = ItemPointerGetOffsetNumber(ctid);
                    scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
                }
                else
                    scan->xs_next_hot = InvalidOffsetNumber;

                PredicateLockTuple(scan->heapRelation, heapTuple);

                LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

                pgstat_count_heap_fetch(scan->indexRelation);

                return heapTuple;
            }

            /*
             * If we can't see it, maybe no one else can either.  Check to
             * see if the tuple is dead to all transactions.  If we find that
             * all the tuples in the HOT chain are dead, we'll signal the
             * index AM to not return that TID on future indexscans.
             */
            if (scan->xs_hot_dead &&
                HeapTupleSatisfiesVacuum(heapTuple->t_data, RecentGlobalXmin,
                                         scan->xs_cbuf) != HEAPTUPLE_DEAD)
                scan->xs_hot_dead = false;

            /*
             * Check to see if HOT chain continues past this tuple; if so
             * fetch the next offnum (we don't bother storing it into
             * xs_next_hot, but must store xs_prev_xmax), and loop around.
             */
            if (HeapTupleIsHotUpdated(heapTuple))
            {
                Assert(ItemPointerGetBlockNumber(ctid) ==
                       ItemPointerGetBlockNumber(tid));
                offnum = ItemPointerGetOffsetNumber(ctid);
                at_chain_start = false;
                scan->xs_prev_xmax = HeapTupleHeaderGetXmax(heapTuple->t_data);
            }
            else
                break;          /* end of chain */
        }                       /* loop over a single HOT chain */

        LockBuffer(scan->xs_cbuf, BUFFER_LOCK_UNLOCK);

        /* Loop around to ask index AM for another TID */
        scan->xs_next_hot = InvalidOffsetNumber;
    }

    /* Release any held pin on a heap page */
    if (BufferIsValid(scan->xs_cbuf))
    {
        ReleaseBuffer(scan->xs_cbuf);
        scan->xs_cbuf = InvalidBuffer;
    }

    return NULL;                /* failure exit */
}
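/*
 * A hedged caller sketch for index_getnext().  API details vary across
 * releases; this assumes the 8.4/9.0-era index_beginscan() that takes the
 * scan keys directly (later releases pass keys via index_rescan() instead).
 * recheck_scan_keys() is hypothetical, and the point of the sketch is the
 * one contract the header comment insists on: the caller, not
 * index_getnext(), must honor scan->xs_recheck for lossy index quals.
 */
static void
scan_index_example(Relation heaprel, Relation indexrel, Snapshot snapshot,
                   ScanKey key, int nkeys)
{
    IndexScanDesc scan;
    HeapTuple   tup;

    scan = index_beginscan(heaprel, indexrel, snapshot, nkeys, key);
    while ((tup = index_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* lossy index quals must be re-evaluated against the heap tuple */
        if (scan->xs_recheck && !recheck_scan_keys(tup, key, nkeys))
            continue;
        /* ... process tup; its buffer pin is held until the next call ... */
    }
    index_endscan(scan);
}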
/**
 * This method estimates the number of tuples and pages in a heap relation.
 * Getting the number of blocks is straightforward. Estimating the number of
 * tuples is a little trickier. There are two factors that complicate this:
 * 	1. Tuples may be of variable length.
 * 	2. There may be dead tuples lying around.
 * To do this, it chooses a certain number of blocks (as determined by a GUC)
 * randomly. The process of choosing is not strictly uniformly random, since
 * we have a target number of blocks in mind. We start processing blocks in
 * order and choose a block with a probability p determined by the ratio of
 * target to total blocks. It is possible that we get really unlucky and
 * reject a large number of blocks up front. We compensate for this by
 * increasing p dynamically. Thus, we are guaranteed to choose the target
 * number of blocks. We read all heap tuples from these blocks and keep a
 * count of the number of live tuples. We scale up this count to estimate
 * reltuples. Relpages is an exact value.
 *
 * Input:
 * 	rel - Relation. Must be a heap table.
 *
 * Output:
 * 	reltuples - estimated number of tuples in relation.
 * 	relpages - exact number of pages.
 */
static void
gp_statistics_estimate_reltuples_relpages_heap(Relation rel,
                                               float4 *reltuples,
                                               float4 *relpages)
{
    MIRROREDLOCK_BUFMGR_DECLARE;

    float4      nrowsseen = 0;      /* # rows seen (including dead rows) */
    float4      nrowsdead = 0;      /* # rows dead */
    float4      totalEmptyPages = 0;    /* # of empty pages with only dead rows */
    float4      totalSamplePages = 0;   /* # of pages sampled */

    BlockNumber nblockstotal = 0;   /* nblocks in relation */
    BlockNumber nblockstarget = (BlockNumber) gp_statistics_blocks_target;
    BlockNumber nblocksseen = 0;
    int         j = 0;          /* counter */

    /**
     * Ensure that the right kind of relation with the right kind of storage
     * is passed to us.
     */
    Assert(rel->rd_rel->relkind == RELKIND_RELATION);
    Assert(RelationIsHeap(rel));

    nblockstotal = RelationGetNumberOfBlocks(rel);

    if (nblockstotal == 0 || nblockstarget == 0)
    {
        /**
         * If there are no blocks, there cannot be tuples.
         */
        *reltuples = 0.0;
        *relpages = 0.0;
        return;
    }

    for (j = 0; j < nblockstotal; j++)
    {
        /**
         * Threshold is dynamically adjusted based on how many blocks we need
         * to examine and how many blocks are left.
         */
        double      threshold = ((double) nblockstarget - nblocksseen) /
                                ((double) nblockstotal - j);

        /**
         * Random dice thrown to determine if current block is chosen.
         */
        double      diceValue = ((double) random()) /
                                ((double) MAX_RANDOM_VALUE);

        if (threshold >= 1.0 || diceValue <= threshold)
        {
            totalSamplePages++;

            /**
             * Block j shall be examined!
             */
            BlockNumber targblock = j;
            Buffer      targbuffer;
            Page        targpage;
            OffsetNumber targoffset,
                        maxoffset;

            /**
             * Check for cancellations.
             */
            CHECK_FOR_INTERRUPTS();

            /*
             * We must maintain a pin on the target page's buffer to ensure
             * that the maxoffset value stays good (else concurrent VACUUM
             * might delete tuples out from under us).  Hence, pin the page
             * until we are done looking at it.  We don't maintain a lock on
             * the page, so tuples could get added to it, but we ignore such
             * tuples.
             */

            // -------- MirroredLock ----------
            MIRROREDLOCK_BUFMGR_LOCK;

            targbuffer = ReadBuffer(rel, targblock);
            LockBuffer(targbuffer, BUFFER_LOCK_SHARE);
            targpage = BufferGetPage(targbuffer);
            maxoffset = PageGetMaxOffsetNumber(targpage);

            /* Figure out overall nrowsdead/nrowsseen ratio */
            /* Figure out # of empty pages based on page-level #rowsseen and #rowsdead. */
            float4      pageRowsSeen = 0.0;
            float4      pageRowsDead = 0.0;

            /* Inner loop over all tuples on the selected block. */
            for (targoffset = FirstOffsetNumber; targoffset <= maxoffset;
                 targoffset++)
            {
                ItemId      itemid;

                itemid = PageGetItemId(targpage, targoffset);
                nrowsseen++;
                pageRowsSeen++;
                if (!ItemIdIsNormal(itemid))
                {
                    nrowsdead += 1;
                    pageRowsDead++;
                }
                else
                {
                    HeapTupleData targtuple;

                    ItemPointerSet(&targtuple.t_self, targblock, targoffset);
                    targtuple.t_data = (HeapTupleHeader) PageGetItem(targpage,
                                                                     itemid);
                    targtuple.t_len = ItemIdGetLength(itemid);

                    if (!HeapTupleSatisfiesVisibility(rel, &targtuple,
                                                      SnapshotNow, targbuffer))
                    {
                        nrowsdead += 1;
                        pageRowsDead++;
                    }
                }
            }

            /* Now release the pin on the page */
            UnlockReleaseBuffer(targbuffer);

            MIRROREDLOCK_BUFMGR_UNLOCK;
            // -------- MirroredLock ----------

            /*
             * Detect empty pages: pageRowsSeen == pageRowsDead.  Also log
             * the nrowsseen (total) and nrowsdead (total).
             */
            if (pageRowsSeen == pageRowsDead && pageRowsSeen > 0)
            {
                totalEmptyPages++;
            }

            nblocksseen++;
        }
    }

    Assert(nblocksseen > 0);

    /**
     * To calculate reltuples, scale up the number of live rows per block
     * seen to the total number of blocks.
     */
    *reltuples = ceil((nrowsseen - nrowsdead) * nblockstotal / nblocksseen);
    *relpages = nblockstotal;

    if (totalSamplePages * 0.5 <= totalEmptyPages && totalSamplePages != 0)
    {
        /*
         * LOG empty pages of bloated table for each segment.
         */
        elog(DEBUG1, "ANALYZE detected 50%% or more empty pages (%f empty out of %f pages), please run VACUUM FULL for accurate estimation.",
             totalEmptyPages, totalSamplePages);
    }

    return;
}
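/*
 * A hedged sketch isolating the block sampler described in the comment
 * above (essentially selection sampling): accept block j with probability
 * (target - chosen) / (total - j), which drives the chosen count to the
 * target.  The helper name and the output array are illustrative
 * assumptions; the accept test mirrors the original logic exactly.
 */
static int
sample_block_numbers(BlockNumber nblockstotal, BlockNumber nblockstarget,
                     BlockNumber *chosen)
{
    int         nchosen = 0;
    BlockNumber j;

    for (j = 0; j < nblockstotal; j++)
    {
        double      threshold = ((double) nblockstarget - nchosen) /
                                ((double) nblockstotal - j);
        double      dice = ((double) random()) / ((double) MAX_RANDOM_VALUE);

        if (threshold >= 1.0 || dice <= threshold)
            chosen[nchosen++] = j;
    }
    return nchosen;
}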
/*
 * bitgetpage - subroutine for BitmapHeapNext()
 *
 * This routine reads and pins the specified page of the relation, then
 * builds an array indicating which tuples on the page are both potentially
 * interesting according to the bitmap, and visible according to the snapshot.
 */
static void
bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
{
    BlockNumber page = tbmres->blockno;
    Buffer      buffer;
    Snapshot    snapshot;
    Page        dp;
    int         ntup;
    int         curslot;
    int         minslot;
    int         maxslot;
    int         maxoff;

    /*
     * Acquire pin on the target heap page, trading in any pin we held before.
     */
    Assert(page < scan->rs_nblocks);
    scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
                                         scan->rs_rd,
                                         page);
    buffer = scan->rs_cbuf;
    snapshot = scan->rs_snapshot;

    /*
     * We must hold share lock on the buffer content while examining tuple
     * visibility.  Afterwards, however, the tuples we have found to be
     * visible are guaranteed good as long as we hold the buffer pin.
     */
    LockBuffer(buffer, BUFFER_LOCK_SHARE);

    dp = (Page) BufferGetPage(buffer);
    maxoff = PageGetMaxOffsetNumber(dp);

    /*
     * Determine how many entries we need to look at on this page.  If the
     * bitmap is lossy then we need to look at each physical item pointer;
     * otherwise we just look through the offsets listed in tbmres.
     */
    if (tbmres->ntuples >= 0)
    {
        /* non-lossy case */
        minslot = 0;
        maxslot = tbmres->ntuples - 1;
    }
    else
    {
        /* lossy case */
        minslot = FirstOffsetNumber;
        maxslot = maxoff;
    }

    ntup = 0;
    for (curslot = minslot; curslot <= maxslot; curslot++)
    {
        OffsetNumber targoffset;
        ItemId      lp;
        HeapTupleData loctup;
        bool        valid;

        if (tbmres->ntuples >= 0)
        {
            /* non-lossy case */
            targoffset = tbmres->offsets[curslot];
        }
        else
        {
            /* lossy case */
            targoffset = (OffsetNumber) curslot;
        }

        /*
         * We'd better check for out-of-range offnum in case of VACUUM since
         * the TID was obtained.
         */
        if (targoffset < FirstOffsetNumber || targoffset > maxoff)
            continue;

        lp = PageGetItemId(dp, targoffset);

        /*
         * Must check for deleted tuple.
         */
        if (!ItemIdIsUsed(lp))
            continue;

        /*
         * check time qualification of tuple, remember it if valid
         */
        loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
        loctup.t_len = ItemIdGetLength(lp);
        ItemPointerSet(&(loctup.t_self), page, targoffset);

        valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
        if (valid)
            scan->rs_vistuples[ntup++] = targoffset;
    }

    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

    Assert(ntup <= MaxHeapTuplesPerPage);
    scan->rs_ntuples = ntup;
}
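/*
 * A hedged consumer sketch (modeled on, but not copied from, the executor's
 * BitmapHeapNext()): once bitgetpage() has filled rs_vistuples[], a caller
 * can walk the array holding only the buffer pin, since the visibility
 * results stay good for as long as the pin is held.  The helper name is an
 * illustrative assumption.
 */
static void
visit_visible_tuples(HeapScanDesc scan, TBMIterateResult *tbmres)
{
    Page        dp = (Page) BufferGetPage(scan->rs_cbuf);
    int         i;

    for (i = 0; i < scan->rs_ntuples; i++)
    {
        OffsetNumber offnum = scan->rs_vistuples[i];
        ItemId      lp = PageGetItemId(dp, offnum);
        HeapTupleData tup;

        tup.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
        tup.t_len = ItemIdGetLength(lp);
        ItemPointerSet(&tup.t_self, tbmres->blockno, offnum);
        /* ... hand tup to the executor, e.g. store it in a slot ... */
    }
}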