Example #1
0
/*
 * Hot Standby feedback
 */
static void
ProcessStandbyHSFeedbackMessage(void)
{
	StandbyHSFeedbackMessage reply;
	TransactionId newxmin = InvalidTransactionId;

	pq_copymsgbytes(&reply_message, (char *) &reply, sizeof(StandbyHSFeedbackMessage));

	elog(DEBUG2, "hot standby feedback xmin %u epoch %u",
		 reply.xmin,
		 reply.epoch);

	/*
	 * Update the WalSender's proc xmin to allow it to be visible to
	 * snapshots. This will hold back the removal of dead rows and thereby
	 * prevent the generation of cleanup conflicts on the standby server.
	 */
	if (TransactionIdIsValid(reply.xmin))
	{
		TransactionId nextXid;
		uint32		nextEpoch;
		bool		epochOK = false;

		GetNextXidAndEpoch(&nextXid, &nextEpoch);

		/*
		 * The reply's epoch should be the same as ours, or, if the xid
		 * counter has wrapped since the standby computed its xmin, one less
		 * than ours.
		 */
		if (reply.xmin <= nextXid)
		{
			if (reply.epoch == nextEpoch)
				epochOK = true;
		}
		else
		{
			if (nextEpoch > 0 && reply.epoch == nextEpoch - 1)
				epochOK = true;
		}

		/*
		 * Feedback from standby must not go backwards, nor should it go
		 * forwards further than our most recent xid.
		 */
		if (epochOK && TransactionIdPrecedesOrEquals(reply.xmin, nextXid))
		{
			if (!TransactionIdIsValid(MyProc->xmin))
			{
				TransactionId oldestXmin = GetOldestXmin(true, true);

				if (TransactionIdPrecedes(oldestXmin, reply.xmin))
					newxmin = reply.xmin;
				else
					newxmin = oldestXmin;
			}
			else
			{
				if (TransactionIdPrecedes(MyProc->xmin, reply.xmin))
					newxmin = reply.xmin;
				else
					newxmin = MyProc->xmin;		/* stay the same */
			}
		}
	}

	/*
	 * Grab the ProcArrayLock to set xmin, or invalidate for bad reply
	 */
	if (MyProc->xmin != newxmin)
	{
		LWLockAcquire(ProcArrayLock, LW_SHARED);
		MyProc->xmin = newxmin;
		LWLockRelease(ProcArrayLock);
	}
}
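
For reference, here is a minimal standalone sketch that isolates the epoch check performed above. The helper name epoch_matches and the simplified typedef are illustrative only, not part of the PostgreSQL sources.

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t TransactionId;

/* Hypothetical helper mirroring the epoch check in the function above. */
static bool
epoch_matches(TransactionId xmin, uint32_t epoch,
			  TransactionId nextXid, uint32_t nextEpoch)
{
	/* Same epoch: the reported xmin has not passed our next xid. */
	if (xmin <= nextXid)
		return epoch == nextEpoch;

	/* xmin numerically ahead of nextXid: only valid if the counter wrapped. */
	return nextEpoch > 0 && epoch == nextEpoch - 1;
}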
Example #2
0
/*
 * This function takes an already open relation and scans its pages,
 * skipping those that have the corresponding visibility map bit set.
 * For pages we skip, we find the free space from the free space map
 * and approximate tuple_len on that basis. For the others, we count
 * the exact number of dead tuples etc.
 *
 * This scan is loosely based on vacuumlazy.c:lazy_scan_heap(), but
 * we do not try to avoid skipping single pages.
 */
static void
statapprox_heap(Relation rel, output_type *stat)
{
	BlockNumber scanned,
				nblocks,
				blkno;
	Buffer		vmbuffer = InvalidBuffer;
	BufferAccessStrategy bstrategy;
	TransactionId OldestXmin;
	uint64		misc_count = 0;

	OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM);
	bstrategy = GetAccessStrategy(BAS_BULKREAD);

	nblocks = RelationGetNumberOfBlocks(rel);
	scanned = 0;

	for (blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf;
		Page		page;
		OffsetNumber offnum,
					maxoff;
		Size		freespace;

		CHECK_FOR_INTERRUPTS();

		/*
		 * If the page has only visible tuples, then we can find out the free
		 * space from the FSM and move on.
		 */
		if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
		{
			freespace = GetRecordedFreeSpace(rel, blkno);
			stat->tuple_len += BLCKSZ - freespace;
			stat->free_space += freespace;
			continue;
		}

		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
								 RBM_NORMAL, bstrategy);

		LockBuffer(buf, BUFFER_LOCK_SHARE);

		page = BufferGetPage(buf);

		/*
		 * It's not safe to call PageGetHeapFreeSpace() on new pages, so we
		 * treat them as being free space for our purposes.
		 */
		if (!PageIsNew(page))
			stat->free_space += PageGetHeapFreeSpace(page);
		else
			stat->free_space += BLCKSZ - SizeOfPageHeaderData;

		if (PageIsNew(page) || PageIsEmpty(page))
		{
			UnlockReleaseBuffer(buf);
			continue;
		}

		scanned++;

		/*
		 * Look at each tuple on the page and decide whether it's live or
		 * dead, then count it and its size. Unlike lazy_scan_heap, we can
		 * afford to ignore problems and special cases.
		 */
		maxoff = PageGetMaxOffsetNumber(page);

		for (offnum = FirstOffsetNumber;
			 offnum <= maxoff;
			 offnum = OffsetNumberNext(offnum))
		{
			ItemId		itemid;
			HeapTupleData tuple;

			itemid = PageGetItemId(page, offnum);

			if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid) ||
				ItemIdIsDead(itemid))
			{
				continue;
			}

			Assert(ItemIdIsNormal(itemid));

			ItemPointerSet(&(tuple.t_self), blkno, offnum);

			tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
			tuple.t_len = ItemIdGetLength(itemid);
			tuple.t_tableOid = RelationGetRelid(rel);

			/*
			 * We count live and dead tuples, but we also need to add up
			 * others in order to feed vac_estimate_reltuples.
			 */
			switch (HeapTupleSatisfiesVacuum(&tuple, OldestXmin, buf))
			{
				case HEAPTUPLE_RECENTLY_DEAD:
					misc_count++;
					/* Fall through */
				case HEAPTUPLE_DEAD:
					stat->dead_tuple_len += tuple.t_len;
					stat->dead_tuple_count++;
					break;
				case HEAPTUPLE_LIVE:
					stat->tuple_len += tuple.t_len;
					stat->tuple_count++;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:
				case HEAPTUPLE_DELETE_IN_PROGRESS:
					misc_count++;
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					break;
			}
		}

		UnlockReleaseBuffer(buf);
	}

	stat->table_len = (uint64) nblocks * BLCKSZ;

	stat->tuple_count = vac_estimate_reltuples(rel, false, nblocks, scanned,
											 stat->tuple_count + misc_count);

	/*
	 * Calculate percentages if the relation has one or more pages.
	 */
	if (nblocks != 0)
	{
		stat->scanned_percent = 100 * scanned / nblocks;
		stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len;
		stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len;
		stat->free_percent = 100.0 * stat->free_space / stat->table_len;
	}

	if (BufferIsValid(vmbuffer))
	{
		ReleaseBuffer(vmbuffer);
		vmbuffer = InvalidBuffer;
	}
}
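
A minimal sketch of the accounting applied to the skipped (all-visible) pages above, assuming only the FSM-recorded free space is known for such a page; the struct and function names are illustrative, not taken from the original file.

#include <stddef.h>
#include <stdint.h>

typedef struct
{
	uint64_t	tuple_len;
	uint64_t	free_space;
} approx_stat;

/* Illustration only: whatever is not recorded as free is assumed to hold tuple data. */
static void
account_skipped_page(approx_stat *stat, size_t recorded_free, size_t blcksz)
{
	stat->tuple_len += blcksz - recorded_free;
	stat->free_space += recorded_free;
}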
Example #3
0
/*
 * IndexBuildHeapScan - scan the heap relation to find tuples to be indexed
 *
 * This is called back from an access-method-specific index build procedure
 * after the AM has done whatever setup it needs.  The parent heap relation
 * is scanned to find tuples that should be entered into the index.  Each
 * such tuple is passed to the AM's callback routine, which does the right
 * things to add it to the new index.  After we return, the AM's index
 * build procedure does whatever cleanup is needed; in particular, it should
 * close the heap and index relations.
 *
 * The total count of heap tuples is returned.	This is for updating pg_class
 * statistics.	(It's annoying not to be able to do that here, but we can't
 * do it until after the relation is closed.)  Note that the index AM itself
 * must keep track of the number of index tuples; we don't do so here because
 * the AM might reject some of the tuples for its own reasons, such as being
 * unable to store NULLs.
 */
double
IndexBuildHeapScan(Relation heapRelation,
				   Relation indexRelation,
				   IndexInfo *indexInfo,
				   IndexBuildCallback callback,
				   void *callback_state)
{
	HeapScanDesc scan;
	HeapTuple	heapTuple;
	TupleDesc	heapDescriptor;
	Datum		attdata[INDEX_MAX_KEYS];
	char		nulls[INDEX_MAX_KEYS];
	double		reltuples;
	List	   *predicate;
	TupleTable	tupleTable;
	TupleTableSlot *slot;
	EState	   *estate;
	ExprContext *econtext;
	Snapshot	snapshot;
	TransactionId OldestXmin;

	/*
	 * sanity checks
	 */
	Assert(OidIsValid(indexRelation->rd_rel->relam));

	heapDescriptor = RelationGetDescr(heapRelation);

	/*
	 * Need an EState for evaluation of index expressions and
	 * partial-index predicates.
	 */
	estate = CreateExecutorState();
	econtext = GetPerTupleExprContext(estate);

	/*
	 * If this is a predicate (partial) index, we will need to evaluate
	 * the predicate using ExecQual, which requires the current tuple to
	 * be in a slot of a TupleTable.  Likewise if there are any
	 * expressions.
	 */
	if (indexInfo->ii_Predicate != NIL || indexInfo->ii_Expressions != NIL)
	{
		tupleTable = ExecCreateTupleTable(1);
		slot = ExecAllocTableSlot(tupleTable);
		ExecSetSlotDescriptor(slot, heapDescriptor, false);

		/* Arrange for econtext's scan tuple to be the tuple under test */
		econtext->ecxt_scantuple = slot;

		/* Set up execution state for predicate. */
		predicate = (List *)
			ExecPrepareExpr((Expr *) indexInfo->ii_Predicate,
							estate);
	}
	else
	{
		tupleTable = NULL;
		slot = NULL;
		predicate = NIL;
	}

	/*
	 * Ok, begin our scan of the base relation.  We use SnapshotAny
	 * because we must retrieve all tuples and do our own time qual
	 * checks.
	 */
	if (IsBootstrapProcessingMode())
	{
		snapshot = SnapshotNow;
		OldestXmin = InvalidTransactionId;
	}
	else
	{
		snapshot = SnapshotAny;
		OldestXmin = GetOldestXmin(heapRelation->rd_rel->relisshared);
	}

	scan = heap_beginscan(heapRelation, /* relation */
						  snapshot,		/* seeself */
						  0,	/* number of keys */
						  (ScanKey) NULL);		/* scan key */

	reltuples = 0;

	/*
	 * Scan all tuples in the base relation.
	 */
	while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		bool		tupleIsAlive;

		CHECK_FOR_INTERRUPTS();

		if (snapshot == SnapshotAny)
		{
			/* do our own time qual check */
			bool		indexIt;
			uint16		sv_infomask;

			/*
			 * HeapTupleSatisfiesVacuum may update tuple's hint status
			 * bits. We could possibly get away with not locking the
			 * buffer here, since caller should hold ShareLock on the
			 * relation, but let's be conservative about it.
			 */
			LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
			sv_infomask = heapTuple->t_data->t_infomask;

			switch (HeapTupleSatisfiesVacuum(heapTuple->t_data, OldestXmin))
			{
				case HEAPTUPLE_DEAD:
					indexIt = false;
					tupleIsAlive = false;
					break;
				case HEAPTUPLE_LIVE:
					indexIt = true;
					tupleIsAlive = true;
					break;
				case HEAPTUPLE_RECENTLY_DEAD:

					/*
					 * If tuple is recently deleted then we must index it
					 * anyway to keep VACUUM from complaining.
					 */
					indexIt = true;
					tupleIsAlive = false;
					break;
				case HEAPTUPLE_INSERT_IN_PROGRESS:

					/*
					 * Since caller should hold ShareLock or better, we
					 * should not see any tuples inserted by open
					 * transactions --- unless it's our own transaction.
					 * (Consider INSERT followed by CREATE INDEX within a
					 * transaction.)  An exception occurs when reindexing
					 * a system catalog, because we often release lock on
					 * system catalogs before committing.
					 */
					if (!TransactionIdIsCurrentTransactionId(
							  HeapTupleHeaderGetXmin(heapTuple->t_data))
						&& !IsSystemRelation(heapRelation))
						elog(ERROR, "concurrent insert in progress");
					indexIt = true;
					tupleIsAlive = true;
					break;
				case HEAPTUPLE_DELETE_IN_PROGRESS:

					/*
					 * Since caller should hold ShareLock or better, we
					 * should not see any tuples deleted by open
					 * transactions --- unless it's our own transaction.
					 * (Consider DELETE followed by CREATE INDEX within a
					 * transaction.)  An exception occurs when reindexing
					 * a system catalog, because we often release lock on
					 * system catalogs before committing.
					 */
					if (!TransactionIdIsCurrentTransactionId(
							  HeapTupleHeaderGetXmax(heapTuple->t_data))
						&& !IsSystemRelation(heapRelation))
						elog(ERROR, "concurrent delete in progress");
					indexIt = true;
					tupleIsAlive = false;
					break;
				default:
					elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
					indexIt = tupleIsAlive = false;		/* keep compiler quiet */
					break;
			}

			/* check for hint-bit update by HeapTupleSatisfiesVacuum */
			if (sv_infomask != heapTuple->t_data->t_infomask)
				SetBufferCommitInfoNeedsSave(scan->rs_cbuf);

			LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);

			if (!indexIt)
				continue;
		}
		else
		{
			/* heap_getnext did the time qual check */
			tupleIsAlive = true;
		}

		reltuples += 1;

		MemoryContextReset(econtext->ecxt_per_tuple_memory);

		/* Set up for predicate or expression evaluation */
		if (slot)
			ExecStoreTuple(heapTuple, slot, InvalidBuffer, false);

		/*
		 * In a partial index, discard tuples that don't satisfy the
		 * predicate.  We can also discard recently-dead tuples, since
		 * VACUUM doesn't complain about tuple count mismatch for partial
		 * indexes.
		 */
		if (predicate != NIL)
		{
			if (!tupleIsAlive)
				continue;
			if (!ExecQual(predicate, econtext, false))
				continue;
		}

		/*
		 * For the current heap tuple, extract all the attributes we use
		 * in this index, and note which are null.	This also performs
		 * evaluation of any expressions needed.
		 */
		FormIndexDatum(indexInfo,
					   heapTuple,
					   heapDescriptor,
					   estate,
					   attdata,
					   nulls);

		/*
		 * You'd think we should go ahead and build the index tuple here,
		 * but some index AMs want to do further processing on the data
		 * first.  So pass the attdata and nulls arrays, instead.
		 */

		/* Call the AM's callback routine to process the tuple */
		callback(indexRelation, heapTuple, attdata, nulls, tupleIsAlive,
				 callback_state);
	}

	heap_endscan(scan);

	if (tupleTable)
		ExecDropTupleTable(tupleTable, true);

	FreeExecutorState(estate);

	/* These may have been pointing to the now-gone estate */
	indexInfo->ii_ExpressionsState = NIL;
	indexInfo->ii_PredicateState = NIL;

	return reltuples;
}
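
The shape of the AM callback is not shown in this excerpt. Inferred from the call callback(indexRelation, heapTuple, attdata, nulls, tupleIsAlive, callback_state) above, a hedged sketch of such a callback could look as follows; the names my_am_build_callback and MyAMBuildState are hypothetical, and the headers assume an extension-style build against the same PostgreSQL version.

#include "postgres.h"
#include "access/htup.h"
#include "utils/rel.h"

typedef struct
{
	double		indtuples;		/* tuples actually added to the index */
} MyAMBuildState;

/* Hypothetical callback; a real AM would form and insert an index entry here. */
static void
my_am_build_callback(Relation indexRelation,
					 HeapTuple heapTuple,
					 Datum *attdata,
					 char *nulls,
					 bool tupleIsAlive,
					 void *callback_state)
{
	MyAMBuildState *state = (MyAMBuildState *) callback_state;

	/* ... form and insert an index entry from attdata/nulls here ... */
	/* tupleIsAlive is available for AM-specific bookkeeping. */
	state->indtuples += 1;
}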
Example #4
0
/*
 * Returns a list of items whose visibility map information does not match
 * the status of the tuples on the page.
 *
 * If all_visible is passed as true, this will include all items which are
 * on pages marked as all-visible in the visibility map but which do not
 * seem to in fact be all-visible.
 *
 * If all_frozen is passed as true, this will include all items which are
 * on pages marked as all-frozen but which do not seem to in fact be frozen.
 */
static corrupt_items *
collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
{
    Relation	rel;
    BlockNumber nblocks;
    corrupt_items *items;
    BlockNumber blkno;
    Buffer		vmbuffer = InvalidBuffer;
    BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
    TransactionId OldestXmin = InvalidTransactionId;

    if (all_visible)
    {
        /* Don't pass rel; that will fail in recovery. */
        OldestXmin = GetOldestXmin(NULL, true);
    }

    rel = relation_open(relid, AccessShareLock);

    if (rel->rd_rel->relkind != RELKIND_RELATION &&
            rel->rd_rel->relkind != RELKIND_MATVIEW &&
            rel->rd_rel->relkind != RELKIND_TOASTVALUE)
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is not a table, materialized view, or TOAST table",
                        RelationGetRelationName(rel))));

    nblocks = RelationGetNumberOfBlocks(rel);

    /*
     * Guess an initial array size. We don't expect many corrupted tuples, so
     * start with a small array.  This function uses the "next" field to track
     * the next offset where we can store an item (which is the same thing as
     * the number of items found so far) and the "count" field to track the
     * number of entries allocated.  We'll repurpose these fields before
     * returning.
     */
    items = palloc0(sizeof(corrupt_items));
    items->next = 0;
    items->count = 64;
    items->tids = palloc(items->count * sizeof(ItemPointerData));

    /* Loop over every block in the relation. */
    for (blkno = 0; blkno < nblocks; ++blkno)
    {
        bool		check_frozen = false;
        bool		check_visible = false;
        Buffer		buffer;
        Page		page;
        OffsetNumber offnum,
                     maxoff;

        /* Make sure we are interruptible. */
        CHECK_FOR_INTERRUPTS();

        /* Use the visibility map to decide whether to check this page. */
        if (all_frozen && VM_ALL_FROZEN(rel, blkno, &vmbuffer))
            check_frozen = true;
        if (all_visible && VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
            check_visible = true;
        if (!check_visible && !check_frozen)
            continue;

        /* Read and lock the page. */
        buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                                    bstrategy);
        LockBuffer(buffer, BUFFER_LOCK_SHARE);

        page = BufferGetPage(buffer);
        maxoff = PageGetMaxOffsetNumber(page);

        /*
         * The visibility map bits might have changed while we were acquiring
         * the page lock.  Recheck to avoid returning spurious results.
         */
        if (check_frozen && !VM_ALL_FROZEN(rel, blkno, &vmbuffer))
            check_frozen = false;
        if (check_visible && !VM_ALL_VISIBLE(rel, blkno, &vmbuffer))
            check_visible = false;
        if (!check_visible && !check_frozen)
        {
            UnlockReleaseBuffer(buffer);
            continue;
        }

        /* Iterate over each tuple on the page. */
        for (offnum = FirstOffsetNumber;
                offnum <= maxoff;
                offnum = OffsetNumberNext(offnum))
        {
            HeapTupleData tuple;
            ItemId		itemid;

            itemid = PageGetItemId(page, offnum);

            /* Unused or redirect line pointers are of no interest. */
            if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
                continue;

            /* Dead line pointers are neither all-visible nor frozen. */
            if (ItemIdIsDead(itemid))
            {
                ItemPointerSet(&(tuple.t_self), blkno, offnum);
                record_corrupt_item(items, &tuple.t_self);
                continue;
            }

            /* Initialize a HeapTupleData structure for checks below. */
            ItemPointerSet(&(tuple.t_self), blkno, offnum);
            tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
            tuple.t_len = ItemIdGetLength(itemid);
            tuple.t_tableOid = relid;

            /*
             * If we're checking whether the page is all-visible, we expect
             * the tuple to be all-visible.
             */
            if (check_visible &&
                    !tuple_all_visible(&tuple, OldestXmin, buffer))
            {
                TransactionId RecomputedOldestXmin;

                /*
                 * Time has passed since we computed OldestXmin, so it's
                 * possible that this tuple is all-visible in reality even
                 * though it doesn't appear so based on our
                 * previously-computed value.  Let's compute a new value so we
                 * can be certain whether there is a problem.
                 *
                 * From a concurrency point of view, it sort of sucks to
                 * retake ProcArrayLock here while we're holding the buffer
                 * exclusively locked, but it should be safe against
                 * deadlocks, because surely GetOldestXmin() should never take
                 * a buffer lock. And this shouldn't happen often, so it's
                 * worth being careful so as to avoid false positives.
                 */
                RecomputedOldestXmin = GetOldestXmin(NULL, true);

                if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
                    record_corrupt_item(items, &tuple.t_self);
                else
                {
                    OldestXmin = RecomputedOldestXmin;
                    if (!tuple_all_visible(&tuple, OldestXmin, buffer))
                        record_corrupt_item(items, &tuple.t_self);
                }
            }

            /*
             * If we're checking whether the page is all-frozen, we expect the
             * tuple to be in a state where it will never need freezing.
             */
            if (check_frozen)
            {
                if (heap_tuple_needs_eventual_freeze(tuple.t_data))
                    record_corrupt_item(items, &tuple.t_self);
            }
        }

        UnlockReleaseBuffer(buffer);
    }

    /* Clean up. */
    if (vmbuffer != InvalidBuffer)
        ReleaseBuffer(vmbuffer);
    relation_close(rel, AccessShareLock);

    /*
     * Before returning, repurpose the fields to match caller's expectations.
     * next is now the next item that should be read (rather than written) and
     * count is now the number of items we wrote (rather than the number we
     * allocated).
     */
    items->count = items->next;
    items->next = 0;

    return items;
}
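
record_corrupt_item() is called above but not included in this excerpt. Given the allocation scheme described in the comments (next tracks the write position, count the number of allocated slots), a sketch consistent with that scheme is shown below; it is an approximation, not necessarily the exact upstream function.

/* Sketch only: append a TID, doubling the array when it fills up. */
static void
record_corrupt_item(corrupt_items *items, ItemPointer tid)
{
    if (items->next >= items->count)
    {
        items->count *= 2;
        items->tids = repalloc(items->tids,
                               items->count * sizeof(ItemPointerData));
    }

    items->tids[items->next++] = *tid;
}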
Example #5
0
/*
 * Send hot standby feedback message to primary, plus the current time,
 * in case they don't have a watch.
 */
static void
XLogWalRcvSendHSFeedback(void)
{
	char		buf[sizeof(StandbyHSFeedbackMessage) + 1];
	TimestampTz now;
	TransactionId nextXid;
	uint32		nextEpoch;
	TransactionId xmin;

	/*
	 * If the user doesn't want status to be reported to the master, be sure
	 * to exit before doing anything at all.
	 */
	if (wal_receiver_status_interval <= 0 || !hot_standby_feedback)
		return;

	/* Get current timestamp. */
	now = GetCurrentTimestamp();

	/*
	 * Send feedback at most once per wal_receiver_status_interval.
	 */
	if (!TimestampDifferenceExceeds(feedback_message.sendTime, now,
									wal_receiver_status_interval * 1000))
		return;

	/*
	 * If Hot Standby is not yet active there is nothing to send. Check this
	 * after the interval has expired to reduce number of calls.
	 */
	if (!HotStandbyActive())
		return;

	/*
	 * Make the expensive call to get the oldest xmin once we are certain
	 * everything else has been checked.
	 */
	xmin = GetOldestXmin(true, false);

	/*
	 * Get epoch and adjust it if nextXid and oldestXmin are on different
	 * sides of the epoch boundary.
	 */
	GetNextXidAndEpoch(&nextXid, &nextEpoch);
	if (nextXid < xmin)
		nextEpoch--;

	/*
	 * Always send feedback message.
	 */
	feedback_message.sendTime = now;
	feedback_message.xmin = xmin;
	feedback_message.epoch = nextEpoch;

	elog(DEBUG2, "sending hot standby feedback xmin %u epoch %u",
		 feedback_message.xmin,
		 feedback_message.epoch);

	/* Prepend with the message type and send it. */
	buf[0] = 'h';
	memcpy(&buf[1], &feedback_message, sizeof(StandbyHSFeedbackMessage));
	walrcv_send(buf, sizeof(StandbyHSFeedbackMessage) + 1);
}
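
A small, hypothetical illustration of the epoch adjustment above: when the xid counter has just wrapped, the oldest xmin can still be numerically larger than nextXid, in which case it belongs to the previous epoch.

#include <stdint.h>

typedef uint32_t TransactionId;

/* Hypothetical helper restating the adjustment in the function above. */
static uint32_t
epoch_of_xmin(TransactionId xmin, TransactionId nextXid, uint32_t nextEpoch)
{
	/*
	 * Example values: nextXid = 100, nextEpoch = 5, xmin = 4000000000.
	 * Since nextXid < xmin, xmin predates the wrap and its epoch is 4,
	 * which is exactly what ProcessStandbyHSFeedbackMessage (Example #1)
	 * accepts for a wrapped counter.
	 */
	if (nextXid < xmin)
		nextEpoch--;
	return nextEpoch;
}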