/*
 * Hot Standby feedback
 */
static void
ProcessStandbyHSFeedbackMessage(void)
{
    StandbyHSFeedbackMessage reply;
    TransactionId newxmin = InvalidTransactionId;

    pq_copymsgbytes(&reply_message, (char *) &reply,
                    sizeof(StandbyHSFeedbackMessage));

    elog(DEBUG2, "hot standby feedback xmin %u epoch %u",
         reply.xmin,
         reply.epoch);

    /*
     * Update the WalSender's proc xmin to allow it to be visible to
     * snapshots. This will hold back the removal of dead rows and thereby
     * prevent the generation of cleanup conflicts on the standby server.
     */
    if (TransactionIdIsValid(reply.xmin))
    {
        TransactionId nextXid;
        uint32      nextEpoch;
        bool        epochOK = false;

        GetNextXidAndEpoch(&nextXid, &nextEpoch);

        /*
         * The reply's epoch should be the same as the epoch of nextXid, or,
         * if the xid counter has wrapped since the standby computed its
         * xmin, exactly one less than it.
         */
        if (reply.xmin <= nextXid)
        {
            if (reply.epoch == nextEpoch)
                epochOK = true;
        }
        else
        {
            if (nextEpoch > 0 && reply.epoch == nextEpoch - 1)
                epochOK = true;
        }

        /*
         * Feedback from the standby must not go backwards, nor should it go
         * forwards further than our most recent xid.
         */
        if (epochOK && TransactionIdPrecedesOrEquals(reply.xmin, nextXid))
        {
            if (!TransactionIdIsValid(MyProc->xmin))
            {
                TransactionId oldestXmin = GetOldestXmin(true, true);

                if (TransactionIdPrecedes(oldestXmin, reply.xmin))
                    newxmin = reply.xmin;
                else
                    newxmin = oldestXmin;
            }
            else
            {
                if (TransactionIdPrecedes(MyProc->xmin, reply.xmin))
                    newxmin = reply.xmin;
                else
                    newxmin = MyProc->xmin; /* stay the same */
            }
        }
    }

    /*
     * Grab the ProcArrayLock to set xmin, or to invalidate it for a bad
     * reply.
     */
    if (MyProc->xmin != newxmin)
    {
        LWLockAcquire(ProcArrayLock, LW_SHARED);
        MyProc->xmin = newxmin;
        LWLockRelease(ProcArrayLock);
    }
}
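/*
 * Illustrative sketch only (not part of the original file): the epoch
 * sanity check performed above can be read as the following hypothetical
 * helper.  The name HSFeedbackEpochIsSane is an assumption made for this
 * example.
 */
static bool
HSFeedbackEpochIsSane(TransactionId replyXmin, uint32 replyEpoch,
                      TransactionId nextXid, uint32 nextEpoch)
{
    /*
     * If the standby's xmin does not exceed our next xid, the reply was
     * computed within the current epoch, so the epochs must match.
     * Otherwise the xid counter must have wrapped since the standby
     * computed its xmin, so the reply's epoch should be exactly one behind
     * ours.
     */
    if (replyXmin <= nextXid)
        return replyEpoch == nextEpoch;

    return nextEpoch > 0 && replyEpoch == nextEpoch - 1;
}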
/*
 * This function takes an already open relation and scans its pages,
 * skipping those that have the corresponding visibility map bit set.
 * For pages we skip, we find the free space from the free space map
 * and approximate tuple_len on that basis. For the others, we count
 * the exact number of dead tuples etc.
 *
 * This scan is loosely based on vacuumlazy.c:lazy_scan_heap(), but
 * we do not try to avoid skipping single pages.
 */
static void
statapprox_heap(Relation rel, output_type *stat)
{
    BlockNumber scanned,
                nblocks,
                blkno;
    Buffer      vmbuffer = InvalidBuffer;
    BufferAccessStrategy bstrategy;
    TransactionId OldestXmin;
    uint64      misc_count = 0;

    OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM);
    bstrategy = GetAccessStrategy(BAS_BULKREAD);

    nblocks = RelationGetNumberOfBlocks(rel);
    scanned = 0;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf;
        Page        page;
        OffsetNumber offnum,
                    maxoff;
        Size        freespace;

        CHECK_FOR_INTERRUPTS();

        /*
         * If the page has only visible tuples, then we can find out the free
         * space from the FSM and move on.
         */
        if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
        {
            freespace = GetRecordedFreeSpace(rel, blkno);
            stat->tuple_len += BLCKSZ - freespace;
            stat->free_space += freespace;
            continue;
        }

        buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                 RBM_NORMAL, bstrategy);

        LockBuffer(buf, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buf);

        /*
         * It's not safe to call PageGetHeapFreeSpace() on new pages, so we
         * treat them as being free space for our purposes.
         */
        if (!PageIsNew(page))
            stat->free_space += PageGetHeapFreeSpace(page);
        else
            stat->free_space += BLCKSZ - SizeOfPageHeaderData;

        if (PageIsNew(page) || PageIsEmpty(page))
        {
            UnlockReleaseBuffer(buf);
            continue;
        }

        scanned++;

        /*
         * Look at each tuple on the page and decide whether it's live or
         * dead, then count it and its size. Unlike lazy_scan_heap, we can
         * afford to ignore problems and special cases.
         */
        maxoff = PageGetMaxOffsetNumber(page);

        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            ItemId      itemid;
            HeapTupleData tuple;

            itemid = PageGetItemId(page, offnum);

            if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) ||
                ItemIdIsDead(itemid))
            {
                continue;
            }

            Assert(ItemIdIsNormal(itemid));

            ItemPointerSet(&(tuple.t_self), blkno, offnum);

            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple.t_len = ItemIdGetLength(itemid);
            tuple.t_tableOid = RelationGetRelid(rel);

            /*
             * We count live and dead tuples, but we also need to add up
             * others in order to feed vac_estimate_reltuples.
             */
            switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
            {
                case HEAPTUPLE_RECENTLY_DEAD:
                    misc_count++;
                    /* Fall through */
                case HEAPTUPLE_DEAD:
                    stat->dead_tuple_len += tuple.t_len;
                    stat->dead_tuple_count++;
                    break;
                case HEAPTUPLE_LIVE:
                    stat->tuple_len += tuple.t_len;
                    stat->tuple_count++;
                    break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:
                case HEAPTUPLE_DELETE_IN_PROGRESS:
                    misc_count++;
                    break;
                default:
                    elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                    break;
            }
        }

        UnlockReleaseBuffer(buf);
    }

    stat->table_len = (uint64) nblocks * BLCKSZ;

    stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned,
                                               stat->tuple_count + misc_count);

    /*
     * Calculate percentages if the relation has one or more pages.
     */
    if (nblocks != 0)
    {
        stat->scanned_percent = 100 * scanned / nblocks;
        stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
        stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
        stat->free_percent = 100.0 * stat->free_space / stat->table_len;
    }

    if (BufferIsValid(vmbuffer))
    {
        ReleaseBuffer(vmbuffer);
        vmbuffer = InvalidBuffer;
    }
}
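/*
 * Illustrative sketch only (not part of the original file): for a page
 * skipped because its visibility map bit is set, statapprox_heap()
 * approximates the page's contents from the free space map alone, as
 * factored out in this hypothetical helper.
 */
static void
approx_account_skipped_page(Relation rel, BlockNumber blkno,
                            output_type *stat)
{
    Size        freespace = GetRecordedFreeSpace(rel, blkno);

    /* Everything not recorded as free is assumed to be live tuple data. */
    stat->tuple_len += BLCKSZ - freespace;
    stat->free_space += freespace;
}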
/*
 * IndexBuildHeapScan - scan the heap relation to find tuples to be indexed
 *
 * This is called back from an access-method-specific index build procedure
 * after the AM has done whatever setup it needs. The parent heap relation
 * is scanned to find tuples that should be entered into the index. Each
 * such tuple is passed to the AM's callback routine, which does the right
 * things to add it to the new index. After we return, the AM's index
 * build procedure does whatever cleanup is needed; in particular, it should
 * close the heap and index relations.
 *
 * The total count of heap tuples is returned. This is for updating pg_class
 * statistics. (It's annoying not to be able to do that here, but we can't
 * do it until after the relation is closed.) Note that the index AM itself
 * must keep track of the number of index tuples; we don't do so here because
 * the AM might reject some of the tuples for its own reasons, such as being
 * unable to store NULLs.
 */
double
IndexBuildHeapScan(Relation heapRelation,
                   Relation indexRelation,
                   IndexInfo *indexInfo,
                   IndexBuildCallback callback,
                   void *callback_state)
{
    HeapScanDesc scan;
    HeapTuple   heapTuple;
    TupleDesc   heapDescriptor;
    Datum       attdata[INDEX_MAX_KEYS];
    char        nulls[INDEX_MAX_KEYS];
    double      reltuples;
    List       *predicate;
    TupleTable  tupleTable;
    TupleTableSlot *slot;
    EState     *estate;
    ExprContext *econtext;
    Snapshot    snapshot;
    TransactionId OldestXmin;

    /*
     * sanity checks
     */
    Assert(OidIsValid(indexRelation->rd_rel->relam));

    heapDescriptor = RelationGetDescr(heapRelation);

    /*
     * Need an EState for evaluation of index expressions and
     * partial-index predicates.
     */
    estate = CreateExecutorState();
    econtext = GetPerTupleExprContext(estate);

    /*
     * If this is a predicate (partial) index, we will need to evaluate
     * the predicate using ExecQual, which requires the current tuple to
     * be in a slot of a TupleTable. Likewise if there are any
     * expressions.
     */
    if (indexInfo->ii_Predicate != NIL || indexInfo->ii_Expressions != NIL)
    {
        tupleTable = ExecCreateTupleTable(1);
        slot = ExecAllocTableSlot(tupleTable);
        ExecSetSlotDescriptor(slot, heapDescriptor, false);

        /* Arrange for econtext's scan tuple to be the tuple under test */
        econtext->ecxt_scantuple = slot;

        /* Set up execution state for predicate. */
        predicate = (List *)
            ExecPrepareExpr((Expr *) indexInfo->ii_Predicate,
                            estate);
    }
    else
    {
        tupleTable = NULL;
        slot = NULL;
        predicate = NIL;
    }

    /*
     * Ok, begin our scan of the base relation. We use SnapshotAny
     * because we must retrieve all tuples and do our own time qual
     * checks.
     */
    if (IsBootstrapProcessingMode())
    {
        snapshot = SnapshotNow;
        OldestXmin = InvalidTransactionId;
    }
    else
    {
        snapshot = SnapshotAny;
        OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared);
    }

    scan = heap_beginscan(heapRelation,     /* relation */
                          snapshot,         /* seeself */
                          0,                /* number of keys */
                          (ScanKey) NULL);  /* scan key */

    reltuples = 0;

    /*
     * Scan all tuples in the base relation.
     */
    while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        bool        tupleIsAlive;

        CHECK_FOR_INTERRUPTS();

        if (snapshot == SnapshotAny)
        {
            /* do our own time qual check */
            bool        indexIt;
            uint16      sv_infomask;

            /*
             * HeapTupleSatisfiesVacuum may update tuple's hint status
             * bits. We could possibly get away with not locking the
             * buffer here, since caller should hold ShareLock on the
             * relation, but let's be conservative about it.
             */
            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
            sv_infomask = heapTuple->t_data->t_infomask;

            switch (HeapTupleSatisfiesVacuum(heapTuple->t_data, OldestXmin))
            {
                case HEAPTUPLE_DEAD:
                    indexIt = false;
                    tupleIsAlive = false;
                    break;
                case HEAPTUPLE_LIVE:
                    indexIt = true;
                    tupleIsAlive = true;
                    break;
                case HEAPTUPLE_RECENTLY_DEAD:

                    /*
                     * If tuple is recently deleted then we must index it
                     * anyway to keep VACUUM from complaining.
                     */
                    indexIt = true;
                    tupleIsAlive = false;
                    break;
                case HEAPTUPLE_INSERT_IN_PROGRESS:

                    /*
                     * Since caller should hold ShareLock or better, we
                     * should not see any tuples inserted by open
                     * transactions --- unless it's our own transaction.
                     * (Consider INSERT followed by CREATE INDEX within a
                     * transaction.) An exception occurs when reindexing
                     * a system catalog, because we often release lock on
                     * system catalogs before committing.
                     */
                    if (!TransactionIdIsCurrentTransactionId(
                            HeapTupleHeaderGetXmin(heapTuple->t_data))
                        && !IsSystemRelation(heapRelation))
                        elog(ERROR, "concurrent insert in progress");
                    indexIt = true;
                    tupleIsAlive = true;
                    break;
                case HEAPTUPLE_DELETE_IN_PROGRESS:

                    /*
                     * Since caller should hold ShareLock or better, we
                     * should not see any tuples deleted by open
                     * transactions --- unless it's our own transaction.
                     * (Consider DELETE followed by CREATE INDEX within a
                     * transaction.) An exception occurs when reindexing
                     * a system catalog, because we often release lock on
                     * system catalogs before committing.
                     */
                    if (!TransactionIdIsCurrentTransactionId(
                            HeapTupleHeaderGetXmax(heapTuple->t_data))
                        && !IsSystemRelation(heapRelation))
                        elog(ERROR, "concurrent delete in progress");
                    indexIt = true;
                    tupleIsAlive = false;
                    break;
                default:
                    elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                    indexIt = tupleIsAlive = false; /* keep compiler quiet */
                    break;
            }

            /* check for hint-bit update by HeapTupleSatisfiesVacuum */
            if (sv_infomask != heapTuple->t_data->t_infomask)
                SetBufferCommitInfoNeedsSave(scan->rs_cbuf);

            LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

            if (!indexIt)
                continue;
        }
        else
        {
            /* heap_getnext did the time qual check */
            tupleIsAlive = true;
        }

        reltuples += 1;

        MemoryContextReset(econtext->ecxt_per_tuple_memory);

        /* Set up for predicate or expression evaluation */
        if (slot)
            ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);

        /*
         * In a partial index, discard tuples that don't satisfy the
         * predicate. We can also discard recently-dead tuples, since
         * VACUUM doesn't complain about tuple count mismatch for partial
         * indexes.
         */
        if (predicate != NIL)
        {
            if (!tupleIsAlive)
                continue;
            if (!ExecQual(predicate, econtext, false))
                continue;
        }

        /*
         * For the current heap tuple, extract all the attributes we use
         * in this index, and note which are null. This also performs
         * evaluation of any expressions needed.
         */
        FormIndexDatum(indexInfo,
                       heapTuple,
                       heapDescriptor,
                       estate,
                       attdata,
                       nulls);

        /*
         * You'd think we should go ahead and build the index tuple here,
         * but some index AMs want to do further processing on the data
         * first. So pass the attdata and nulls arrays, instead.
         */

        /* Call the AM's callback routine to process the tuple */
        callback(indexRelation, heapTuple, attdata, nulls, tupleIsAlive,
                 callback_state);
    }

    heap_endscan(scan);

    if (tupleTable)
        ExecDropTupleTable(tupleTable, true);

    FreeExecutorState(estate);

    /* These may have been pointing to the now-gone estate */
    indexInfo->ii_ExpressionsState = NIL;
    indexInfo->ii_PredicateState = NIL;

    return reltuples;
}
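/*
 * Illustrative sketch only (not part of the original file): an index AM's
 * build routine drives IndexBuildHeapScan() by supplying a callback with
 * the signature used above.  The names ExampleBuildState and
 * example_build_callback are assumptions made for this example.
 */
typedef struct ExampleBuildState
{
    double      indtuples;      /* tuples actually added to the index */
} ExampleBuildState;

static void
example_build_callback(Relation indexRelation,
                       HeapTuple heapTuple,
                       Datum *attdata,
                       char *nulls,
                       bool tupleIsAlive,
                       void *state)
{
    ExampleBuildState *buildstate = (ExampleBuildState *) state;

    /* A real AM would form an index tuple from attdata/nulls and insert it. */
    buildstate->indtuples += 1;
}

/*
 * A caller would then do something like:
 *
 *     ExampleBuildState buildstate = {0};
 *     double reltuples = IndexBuildHeapScan(heap, index, indexInfo,
 *                                           example_build_callback,
 *                                           (void *) &buildstate);
 */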
/*
 * Returns a list of items whose visibility map information does not match
 * the status of the tuples on the page.
 *
 * If all_visible is passed as true, this will include all items which are
 * on pages marked as all-visible in the visibility map but which do not
 * seem to in fact be all-visible.
 *
 * If all_frozen is passed as true, this will include all items which are
 * on pages marked as all-frozen but which do not seem to in fact be frozen.
 */
static corrupt_items *
collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
{
    Relation    rel;
    BlockNumber nblocks;
    corrupt_items *items;
    BlockNumber blkno;
    Buffer      vmbuffer = InvalidBuffer;
    BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
    TransactionId OldestXmin = InvalidTransactionId;

    if (all_visible)
    {
        /* Don't pass rel; that will fail in recovery. */
        OldestXmin = GetOldestXmin(NULL, true);
    }

    rel = relation_open(relid, AccessShareLock);

    if (rel->rd_rel->relkind != RELKIND_RELATION &&
        rel->rd_rel->relkind != RELKIND_MATVIEW &&
        rel->rd_rel->relkind != RELKIND_TOASTVALUE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is not a table, materialized view, or TOAST table",
                        RelationGetRelationName(rel))));

    nblocks = RelationGetNumberOfBlocks(rel);

    /*
     * Guess an initial array size. We don't expect many corrupted tuples, so
     * start with a small array. This function uses the "next" field to track
     * the next offset where we can store an item (which is the same thing as
     * the number of items found so far) and the "count" field to track the
     * number of entries allocated. We'll repurpose these fields before
     * returning.
     */
    items = palloc0(sizeof(corrupt_items));
    items->next = 0;
    items->count = 64;
    items->tids = palloc(items->count * sizeof(ItemPointerData));

    /* Loop over every block in the relation. */
    for (blkno = 0; blkno < nblocks; ++blkno)
    {
        bool        check_frozen = false;
        bool        check_visible = false;
        Buffer      buffer;
        Page        page;
        OffsetNumber offnum,
                    maxoff;

        /* Make sure we are interruptible. */
        CHECK_FOR_INTERRUPTS();

        /* Use the visibility map to decide whether to check this page. */
        if (all_frozen && VM_ALL_FROZEN(rel, blkno, &vmbuffer))
            check_frozen = true;
        if (all_visible && VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
            check_visible = true;
        if (!check_visible && !check_frozen)
            continue;

        /* Read and lock the page. */
        buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                                    bstrategy);
        LockBuffer(buffer, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buffer);
        maxoff = PageGetMaxOffsetNumber(page);

        /*
         * The visibility map bits might have changed while we were acquiring
         * the page lock. Recheck to avoid returning spurious results.
         */
        if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
            check_frozen = false;
        if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
            check_visible = false;
        if (!check_visible && !check_frozen)
        {
            UnlockReleaseBuffer(buffer);
            continue;
        }

        /* Iterate over each tuple on the page. */
        for (offnum = FirstOffsetNumber;
             offnum <= maxoff;
             offnum = OffsetNumberNext(offnum))
        {
            HeapTupleData tuple;
            ItemId      itemid;

            itemid = PageGetItemId(page, offnum);

            /* Unused or redirect line pointers are of no interest. */
            if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
                continue;

            /* Dead line pointers are neither all-visible nor frozen. */
            if (ItemIdIsDead(itemid))
            {
                ItemPointerSet(&(tuple.t_self), blkno, offnum);
                record_corrupt_item(items, &tuple.t_self);
                continue;
            }

            /*
             * Initialize a HeapTupleData structure for checks below.
             */
            ItemPointerSet(&(tuple.t_self), blkno, offnum);
            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple.t_len = ItemIdGetLength(itemid);
            tuple.t_tableOid = relid;

            /*
             * If we're checking whether the page is all-visible, we expect
             * the tuple to be all-visible.
             */
            if (check_visible &&
                !tuple_all_visible(&tuple, OldestXmin, buffer))
            {
                TransactionId RecomputedOldestXmin;

                /*
                 * Time has passed since we computed OldestXmin, so it's
                 * possible that this tuple is all-visible in reality even
                 * though it doesn't appear so based on our
                 * previously-computed value. Let's compute a new value so we
                 * can be certain whether there is a problem.
                 *
                 * From a concurrency point of view, it sort of sucks to
                 * retake ProcArrayLock here while we're holding the buffer
                 * exclusively locked, but it should be safe against
                 * deadlocks, because surely GetOldestXmin() should never take
                 * a buffer lock. And this shouldn't happen often, so it's
                 * worth being careful so as to avoid false positives.
                 */
                RecomputedOldestXmin = GetOldestXmin(NULL, true);

                if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
                    record_corrupt_item(items, &tuple.t_self);
                else
                {
                    OldestXmin = RecomputedOldestXmin;
                    if (!tuple_all_visible(&tuple, OldestXmin, buffer))
                        record_corrupt_item(items, &tuple.t_self);
                }
            }

            /*
             * If we're checking whether the page is all-frozen, we expect the
             * tuple to be in a state where it will never need freezing.
             */
            if (check_frozen)
            {
                if (heap_tuple_needs_eventual_freeze(tuple.t_data))
                    record_corrupt_item(items, &tuple.t_self);
            }
        }

        UnlockReleaseBuffer(buffer);
    }

    /* Clean up. */
    if (vmbuffer != InvalidBuffer)
        ReleaseBuffer(vmbuffer);
    relation_close(rel, AccessShareLock);

    /*
     * Before returning, repurpose the fields to match caller's expectations.
     * next is now the next item that should be read (rather than written)
     * and count is now the number of items we wrote (rather than the number
     * we allocated).
     */
    items->count = items->next;
    items->next = 0;

    return items;
}
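/*
 * Illustrative sketch only (not part of the original file): once
 * collect_corrupt_items() returns, "count" holds the number of TIDs that
 * were recorded and "next" is reset to zero, so a caller can walk the array
 * one item at a time, roughly as in this hypothetical helper.
 */
static bool
fetch_next_corrupt_tid(corrupt_items *items, ItemPointer tid)
{
    if (items->next >= items->count)
        return false;           /* no more corrupt items to return */

    *tid = items->tids[items->next];
    items->next++;
    return true;
}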
/*
 * Send hot standby feedback message to primary, plus the current time,
 * in case they don't have a watch.
 */
static void
XLogWalRcvSendHSFeedback(void)
{
    char        buf[sizeof(StandbyHSFeedbackMessage) + 1];
    TimestampTz now;
    TransactionId nextXid;
    uint32      nextEpoch;
    TransactionId xmin;

    /*
     * If the user doesn't want status to be reported to the master, be sure
     * to exit before doing anything at all.
     */
    if (wal_receiver_status_interval <= 0 || !hot_standby_feedback)
        return;

    /* Get current timestamp. */
    now = GetCurrentTimestamp();

    /*
     * Send feedback at most once per wal_receiver_status_interval.
     */
    if (!TimestampDifferenceExceeds(feedback_message.sendTime, now,
                                    wal_receiver_status_interval * 1000))
        return;

    /*
     * If Hot Standby is not yet active there is nothing to send. Check this
     * after the interval has expired to reduce number of calls.
     */
    if (!HotStandbyActive())
        return;

    /*
     * Make the expensive call to get the oldest xmin once we are certain
     * everything else has been checked.
     */
    xmin = GetOldestXmin(true, false);

    /*
     * Get epoch and adjust if nextXid and oldestXmin are on different sides
     * of the epoch boundary.
     */
    GetNextXidAndEpoch(&nextXid, &nextEpoch);
    if (nextXid < xmin)
        nextEpoch--;

    /*
     * Always send feedback message.
     */
    feedback_message.sendTime = now;
    feedback_message.xmin = xmin;
    feedback_message.epoch = nextEpoch;

    elog(DEBUG2, "sending hot standby feedback xmin %u epoch %u",
         feedback_message.xmin,
         feedback_message.epoch);

    /* Prepend with the message type and send it. */
    buf[0] = 'h';
    memcpy(&buf[1], &feedback_message, sizeof(StandbyHSFeedbackMessage));
    walrcv_send(buf, sizeof(StandbyHSFeedbackMessage) + 1);
}
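/*
 * Illustrative worked example only (values are made up, not from the
 * original file): if the xid counter wraps between computing xmin and
 * calling GetNextXidAndEpoch(), nextXid can be numerically smaller than
 * xmin even though it is logically newer, so xmin still belongs to the
 * previous epoch:
 *
 *     xmin      = 4294967000   (assigned just before the wraparound)
 *     nextXid   = 150          (counter has wrapped)
 *     nextEpoch = 7            (epoch of nextXid)
 *
 * Since nextXid < xmin, the feedback message is sent with epoch 6, the
 * epoch in which xmin was assigned; the primary's
 * ProcessStandbyHSFeedbackMessage() accepts it via its nextEpoch - 1 check.
 */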