Example #1
/*
 * Returns the network latency in ms, note that this includes any
 * difference in clock settings between the servers, as well as timezone.
 */
int
GetReplicationTransferLatency(void)
{
	WalRcvData *walrcv = WalRcv;

	TimestampTz lastMsgSendTime;
	TimestampTz lastMsgReceiptTime;

	long		secs = 0;
	int			usecs = 0;
	int			ms;

	SpinLockAcquire(&walrcv->mutex);
	lastMsgSendTime = walrcv->lastMsgSendTime;
	lastMsgReceiptTime = walrcv->lastMsgReceiptTime;
	SpinLockRelease(&walrcv->mutex);

	TimestampDifference(lastMsgSendTime,
						lastMsgReceiptTime,
						&secs, &usecs);

	ms = ((int) secs * 1000) + (usecs / 1000);

	return ms;
}
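
The idiom above recurs throughout these examples: TimestampDifference() splits a timestamp delta into whole seconds plus leftover microseconds, and the caller converts that pair into milliseconds. Below is a minimal standalone sketch of the same arithmetic; timestamp_difference_us and the raw epoch-microsecond inputs are illustrative stand-ins, not the PostgreSQL API.

#include <stdio.h>

/*
 * Mimics TimestampDifference() semantics: split stop - start into whole
 * seconds and leftover microseconds, zeroing both when stop precedes start
 * (an assumption matching the PostgreSQL behavior these examples rely on).
 */
static void
timestamp_difference_us(long long start_us, long long stop_us,
						long *secs, int *usecs)
{
	long long	diff = stop_us - start_us;

	if (diff <= 0)
	{
		*secs = 0;
		*usecs = 0;
	}
	else
	{
		*secs = (long) (diff / 1000000);
		*usecs = (int) (diff % 1000000);
	}
}

int
main(void)
{
	long		secs;
	int			usecs;

	timestamp_difference_us(0, 1234567, &secs, &usecs);
	/* Same conversion as the return statement above. */
	printf("%d ms\n", (int) (secs * 1000) + (usecs / 1000));	/* 1234 ms */
	return 0;
}
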
Example #2
/*
 * Returns the replication apply delay in ms
 */
int
GetReplicationApplyDelay(void)
{
	/* use volatile pointer to prevent code rearrangement */
	volatile WalRcvData *walrcv = WalRcv;

	XLogRecPtr	receivePtr;
	XLogRecPtr	replayPtr;

	long		secs;
	int			usecs;

	SpinLockAcquire(&walrcv->mutex);
	receivePtr = walrcv->receivedUpto;
	SpinLockRelease(&walrcv->mutex);

	replayPtr = GetXLogReplayRecPtr(NULL);

	if (XLByteLE(receivePtr, replayPtr))
		return 0;

	TimestampDifference(GetCurrentChunkReplayStartTime(),
						GetCurrentTimestamp(),
						&secs, &usecs);

	return (((int) secs * 1000) + (usecs / 1000));
}
Example #3
/*
 * Returns the replication apply delay in ms or -1
 * if the apply delay info is not available
 */
int
GetReplicationApplyDelay(void)
{
	WalRcvData *walrcv = WalRcv;
	XLogRecPtr	receivePtr;
	XLogRecPtr	replayPtr;

	long		secs;
	int			usecs;

	TimestampTz chunkReplayStartTime;

	SpinLockAcquire(&walrcv->mutex);
	receivePtr = walrcv->receivedUpto;
	SpinLockRelease(&walrcv->mutex);

	replayPtr = GetXLogReplayRecPtr(NULL);

	if (receivePtr == replayPtr)
		return 0;

	chunkReplayStartTime = GetCurrentChunkReplayStartTime();

	if (chunkReplayStartTime == 0)
		return -1;

	TimestampDifference(chunkReplayStartTime,
						GetCurrentTimestamp(),
						&secs, &usecs);

	return (((int) secs * 1000) + (usecs / 1000));
}
Example #4
/*
 * Helper function that actually kicks off the command on the libpq connection.
 */
static void
dispatchCommand(CdbDispatchResult * dispatchResult,
				const char *query_text,
				int query_text_len)
{
	SegmentDatabaseDescriptor *segdbDesc = dispatchResult->segdbDesc;
	TimestampTz beforeSend = 0;
	long secs;
	int	usecs;

	if (DEBUG1 >= log_min_messages)
		beforeSend = GetCurrentTimestamp();

	if (PQisBusy(segdbDesc->conn))
		elog(LOG, "Trying to send to busy connection %s: asyncStatus %d",
				  segdbDesc->whoami,
				  segdbDesc->conn->asyncStatus);

	if (cdbconn_isBadConnection(segdbDesc))
	{
		char *msg = PQerrorMessage(dispatchResult->segdbDesc->conn);
		dispatchResult->stillRunning = false;
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
				 errmsg("Connection lost before dispatch to segment %s: %s",
						 dispatchResult->segdbDesc->whoami, msg ? msg : "unknown error")));
	}

	/*
	 * Submit the command asynchronously.
	 */
	if (PQsendGpQuery_shared(dispatchResult->segdbDesc->conn, (char *) query_text, query_text_len) == 0)
	{
		char *msg = PQerrorMessage(dispatchResult->segdbDesc->conn);
		dispatchResult->stillRunning = false;
		ereport(ERROR,
				(errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
				 errmsg("Command could not be dispatch to segment %s: %s",
						 dispatchResult->segdbDesc->whoami, msg ? msg : "unknown error")));
	}

	if (DEBUG1 >= log_min_messages)
	{
		TimestampDifference(beforeSend, GetCurrentTimestamp(), &secs, &usecs);

		if (secs != 0 || usecs > 1000)	/* Time > 1ms? */
			elog(LOG, "time for PQsendGpQuery_shared %ld.%06d", secs, usecs);
	}

	/*
	 * We'll keep monitoring this QE -- whether or not the command
	 * was dispatched -- in order to check for a lost connection
	 * or any other errors that libpq might have in store for us.
	 */
	dispatchResult->stillRunning = true;
	dispatchResult->hasDispatched = true;

	ELOG_DISPATCHER_DEBUG("Command dispatched to QE (%s)", dispatchResult->segdbDesc->whoami);
}
Example #5
/*
 * Get next block number or InvalidBlockNumber when we're done.
 *
 * Uses linear probing algorithm for picking next block.
 */
Datum
tsm_system_time_nextblock(PG_FUNCTION_ARGS)
{
	TableSampleDesc	   *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
	SystemSamplerData  *sampler = (SystemSamplerData *) tsdesc->tsmdata;

	sampler->lb = (sampler->lb + sampler->step) % sampler->nblocks;
	sampler->doneblocks++;

	/* All blocks have been read, we're done */
	if (sampler->doneblocks > sampler->nblocks)
		PG_RETURN_UINT32(InvalidBlockNumber);

	/*
	 * Update the estimations for time limit at least 10 times per estimated
	 * number of returned blocks to handle variations in block read speed.
	 */
	if (sampler->doneblocks % Max(sampler->estblocks/10, 1) == 0)
	{
		TimestampTz	now = GetCurrentTimestamp();
		long        secs;
		int         usecs;
		int			usecs_remaining;
		int			time_per_block;

		TimestampDifference(sampler->start_time, now, &secs, &usecs);
		usecs += (int) secs * 1000000;

		time_per_block = usecs / sampler->doneblocks;

		/* No time left, end. */
		TimestampDifference(now, sampler->end_time, &secs, &usecs);
		if (secs <= 0 && usecs <= 0)
			PG_RETURN_UINT32(InvalidBlockNumber);

		/* Remaining microseconds */
		usecs_remaining = usecs + (int) secs * 1000000;

		/* Recalculate estimated returned number of blocks */
		if (time_per_block < usecs_remaining && time_per_block > 0)
			sampler->estblocks = sampler->time * time_per_block;
	}

	PG_RETURN_UINT32(sampler->lb);
}
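
The guard "doneblocks % Max(estblocks / 10, 1) == 0" is a throttling idiom: the re-estimation runs only about ten times per estimated total, and Max(..., 1) keeps the modulus from ever reaching zero. A small sketch of just that pattern follows; the loop bound and estimate are made-up values for illustration.

#include <stdio.h>

#define Max(a,b) ((a) > (b) ? (a) : (b))

int
main(void)
{
	long		estblocks = 100;	/* hypothetical estimate */
	long		doneblocks;

	for (doneblocks = 1; doneblocks <= 100; doneblocks++)
	{
		/* Fires on blocks 10, 20, ..., 100 -- about ten times in total. */
		if (doneblocks % Max(estblocks / 10, 1) == 0)
			printf("re-estimate at block %ld\n", doneblocks);
	}
	return 0;
}
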
Example #6
/*
 * Check for statement timeout.  If the timeout time has come,
 * trigger a query-cancel interrupt; if not, reschedule the SIGALRM
 * interrupt to occur at the right time.
 *
 * Returns true if okay, false if failed to set the interrupt.
 */
static bool
CheckStatementTimeout(void)
{
	TimestampTz now;

	if (!statement_timeout_active)
		return true;			/* do nothing if not active */

	/* QD takes care of timeouts for QE. */
	if (Gp_role == GP_ROLE_EXECUTE)
		return true;

	now = GetCurrentTimestamp();

	if (now >= statement_fin_time)
	{
		/* Time to die */
		statement_timeout_active = false;
		cancel_from_timeout = true;
		elog(LOG,"Issuing cancel signal (SIGINT) to my self (pid = %d) for statement timeout.",
			 MyProcPid);
#ifdef HAVE_SETSID
		/* try to signal whole process group */
		kill(-MyProcPid, SIGINT);
#endif
		kill(MyProcPid, SIGINT);
	}
	else
	{
		/* Not time yet, so (re)schedule the interrupt */
		long		secs;
		int			usecs;
		struct itimerval timeval;

		TimestampDifference(now, statement_fin_time,
							&secs, &usecs);

		/*
		 * It's possible that the difference is less than a microsecond;
		 * ensure we don't cancel, rather than set, the interrupt.
		 */
		if (secs == 0 && usecs == 0)
			usecs = 1;
		MemSet(&timeval, 0, sizeof(struct itimerval));
		timeval.it_value.tv_sec = secs;
		timeval.it_value.tv_usec = usecs;
		if (setitimer(ITIMER_REAL, &timeval, NULL))
			return false;
	}

	return true;
}
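
Note the "usecs = 1" fixup: with setitimer(), an it_value of exactly zero disarms the timer instead of scheduling it, so the code rounds a sub-microsecond remainder up to one microsecond. A self-contained POSIX sketch of that guard follows; schedule_alarm_once and the demo main() are illustrative stand-ins, not PostgreSQL code.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>

static volatile sig_atomic_t fired = 0;

static void
handle_alarm(int signo)
{
	(void) signo;
	fired = 1;
}

/*
 * Arm a one-shot ITIMER_REAL for secs/usecs. A zero interval would cancel,
 * rather than set, the interrupt -- hence the round-up to 1 microsecond.
 */
static int
schedule_alarm_once(long secs, int usecs)
{
	struct itimerval timeval;

	if (secs == 0 && usecs == 0)
		usecs = 1;

	memset(&timeval, 0, sizeof(timeval));
	timeval.it_value.tv_sec = secs;
	timeval.it_value.tv_usec = usecs;
	return setitimer(ITIMER_REAL, &timeval, NULL) == 0;
}

int
main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handle_alarm;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGALRM, &sa, NULL);

	if (!schedule_alarm_once(0, 0))		/* rounded up to 1 usec */
		return 1;
	while (!fired)
		pause();						/* wait for SIGALRM */
	printf("alarm fired\n");
	return 0;
}
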
Example #7
/*
 * Helper function to thread_DispatchCommand that actually kicks off the
 * command on the libpq connection.
 *
 * NOTE: since this is called via a thread, the same rules apply as to
 *		 thread_DispatchCommand: absolutely no elog'ing.
 */
static void
dispatchCommand(CdbDispatchResult * dispatchResult,
				const char *query_text, int query_text_len)
{
	SegmentDatabaseDescriptor *segdbDesc = dispatchResult->segdbDesc;
	PGconn *conn = segdbDesc->conn;
	TimestampTz beforeSend = 0;
	long secs;
	int	usecs;

	if (DEBUG1 >= log_min_messages)
		beforeSend = GetCurrentTimestamp();

	/*
	 * Submit the command asynchronously.
	 */
	if (PQsendGpQuery_shared(conn, (char *) query_text, query_text_len) == 0)
	{
		char	   *msg = PQerrorMessage(segdbDesc->conn);

		if (DEBUG3 >= log_min_messages)
			write_log("PQsendMPPQuery_shared error %s %s",
					  segdbDesc->whoami, msg ? msg : "");

		/*
		 * Note the error.
		 */
		cdbdisp_appendMessage(dispatchResult, LOG,
							  ERRCODE_GP_INTERCONNECTION_ERROR,
							  "Command could not be sent to segment db %s;  %s",
							  segdbDesc->whoami, msg ? msg : "");
		PQfinish(conn);
		segdbDesc->conn = NULL;
		dispatchResult->stillRunning = false;
	}

	if (DEBUG1 >= log_min_messages)
	{
		TimestampDifference(beforeSend, GetCurrentTimestamp(), &secs, &usecs);

		if (secs != 0 || usecs > 1000)	/* Time > 1ms? */
			write_log("time for PQsendGpQuery_shared %ld.%06d", secs, usecs);
	}

	dispatchResult->hasDispatched = true;
	/*
	 * We'll keep monitoring this QE -- whether or not the command
	 * was dispatched -- in order to check for a lost connection
	 * or any other errors that libpq might have in store for us.
	 */
}
Example #8
static void
calcCpuUsage(StringInfoData *str,
			 int64 usageBegin, TimestampTz timestampBegin,
			 int64 usageEnd, TimestampTz timestampEnd)
{
	int64 duration;
	long secs;
	int usecs;
	int64 usage;

	usage = usageEnd - usageBegin;

	TimestampDifference(timestampBegin, timestampEnd, &secs, &usecs);

	duration = secs * 1000000 + usecs;

	appendStringInfo(str, "\"%d\":%.2f",
					 GpIdentity.segindex,
					 ResGroupOps_ConvertCpuUsageToPercent(usage, duration));
}
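
ResGroupOps_ConvertCpuUsageToPercent() is not shown in this example; the duration passed to it is the wall-clock window in microseconds. Here is a hypothetical stand-in that illustrates the likely arithmetic, assuming usage is also measured in microseconds of CPU time (an assumption -- the actual unit depends on the cgroup backend).

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical stand-in for ResGroupOps_ConvertCpuUsageToPercent():
 * CPU microseconds consumed divided by wall-clock microseconds, as a percent.
 */
static double
cpu_usage_to_percent(int64_t usage, int64_t duration)
{
	if (duration <= 0)
		return 0.0;
	return (double) usage / (double) duration * 100.0;
}

int
main(void)
{
	/* 1.5 s of CPU burned over a 3 s window -> "0":50.00 */
	printf("\"%d\":%.2f\n", 0, cpu_usage_to_percent(1500000, 3000000));
	return 0;
}
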
Example #9
/*
 * Schedule alarm for the next active timeout, if any
 *
 * We assume the caller has obtained the current time, or a close-enough
 * approximation.
 */
static void
schedule_alarm(TimestampTz now)
{
	if (num_active_timeouts > 0)
	{
		struct itimerval timeval;
		long		secs;
		int			usecs;

		MemSet(&timeval, 0, sizeof(struct itimerval));

		/* Get the time remaining till the nearest pending timeout */
		TimestampDifference(now, active_timeouts[0]->fin_time,
							&secs, &usecs);

		/*
		 * It's possible that the difference is less than a microsecond;
		 * ensure we don't cancel, rather than set, the interrupt.
		 */
		if (secs == 0 && usecs == 0)
			usecs = 1;

		timeval.it_value.tv_sec = secs;
		timeval.it_value.tv_usec = usecs;

		/*
		 * We must enable the signal handler before calling setitimer(); if we
		 * did it in the other order, we'd have a race condition wherein the
		 * interrupt could occur before we can set alarm_enabled, so that the
		 * signal handler would fail to do anything.
		 *
		 * Because we didn't bother to reset the timer in disable_alarm(),
		 * it's possible that a previously-set interrupt will fire between
		 * enable_alarm() and setitimer().  This is safe, however.  There are
		 * two possible outcomes:
		 *
		 * 1. The signal handler finds nothing to do (because the nearest
		 * timeout event is still in the future).  It will re-set the timer
		 * and return.  Then we'll overwrite the timer value with a new one.
		 * This will mean that the timer fires a little later than we
		 * intended, but only by the amount of time it takes for the signal
		 * handler to do nothing useful, which shouldn't be much.
		 *
		 * 2. The signal handler executes and removes one or more timeout
		 * events.  When it returns, either the queue is now empty or the
		 * frontmost event is later than the one we looked at above.  So we'll
		 * overwrite the timer value with one that is too soon (plus or minus
		 * the signal handler's execution time), causing a useless interrupt
		 * to occur.  But the handler will then re-set the timer and
		 * everything will still work as expected.
		 *
		 * Since these cases are of very low probability (the window here
		 * being quite narrow), it's not worth adding cycles to the mainline
		 * code to prevent occasional wasted interrupts.
		 */
		enable_alarm();

		/* Set the alarm timer */
		if (setitimer(ITIMER_REAL, &timeval, NULL) != 0)
			elog(FATAL, "could not enable SIGALRM timer: %m");
	}
}
Example #10
/*
 *	lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indexes, and
 *		updates its relpages and reltuples statistics.
 *
 *		At entry, we have already established a transaction and opened
 *		and locked the relation.
 */
void
lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
				BufferAccessStrategy bstrategy)
{
	LVRelStats *vacrelstats;
	Relation   *Irel;
	int			nindexes;
	BlockNumber possibly_freeable;
	PGRUsage	ru0;
	TimestampTz starttime = 0;
	long		secs;
	int			usecs;
	double		read_rate,
				write_rate;
	bool		scan_all;
	TransactionId freezeTableLimit;
	BlockNumber new_rel_pages;
	double		new_rel_tuples;
	BlockNumber new_rel_allvisible;
	TransactionId new_frozen_xid;

	/* measure elapsed time iff autovacuum logging requires it */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
	{
		pg_rusage_init(&ru0);
		starttime = GetCurrentTimestamp();
	}

	if (vacstmt->options & VACOPT_VERBOSE)
		elevel = INFO;
	else
		elevel = DEBUG2;

	vac_strategy = bstrategy;

	vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
						  onerel->rd_rel->relisshared,
						  &OldestXmin, &FreezeLimit, &freezeTableLimit);
	scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
											 freezeTableLimit);

	vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));

	vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
	vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
	vacrelstats->num_index_scans = 0;

	/* Open all indexes of the relation */
	vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
	vacrelstats->hasindex = (nindexes > 0);

	/* Do the vacuuming */
	lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);

	/* Done with indexes */
	vac_close_indexes(nindexes, Irel, NoLock);

	/*
	 * Optionally truncate the relation.
	 *
	 * Don't even think about it unless we have a shot at releasing a goodly
	 * number of pages.  Otherwise, the time taken isn't worth it.
	 */
	possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
	if (possibly_freeable > 0 &&
		(possibly_freeable >= REL_TRUNCATE_MINIMUM ||
		 possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
		lazy_truncate_heap(onerel, vacrelstats);

	/* Vacuum the Free Space Map */
	FreeSpaceMapVacuum(onerel);

	/*
	 * Update statistics in pg_class.
	 *
	 * A corner case here is that if we scanned no pages at all because every
	 * page is all-visible, we should not update relpages/reltuples, because
	 * we have no new information to contribute.  In particular this keeps
	 * us from replacing relpages=reltuples=0 (which means "unknown tuple
	 * density") with nonzero relpages and reltuples=0 (which means "zero
	 * tuple density") unless there's some actual evidence for the latter.
	 *
	 * We do update relallvisible even in the corner case, since if the
	 * table is all-visible we'd definitely like to know that.  But clamp
	 * the value to be not more than what we're setting relpages to.
	 *
	 * Also, don't change relfrozenxid if we skipped any pages, since then
	 * we don't know for certain that all tuples have a newer xmin.
	 */
	new_rel_pages = vacrelstats->rel_pages;
	new_rel_tuples = vacrelstats->new_rel_tuples;
	if (vacrelstats->scanned_pages == 0 && new_rel_pages > 0)
	{
		new_rel_pages = vacrelstats->old_rel_pages;
		new_rel_tuples = vacrelstats->old_rel_tuples;
	}

	new_rel_allvisible = visibilitymap_count(onerel);
	if (new_rel_allvisible > new_rel_pages)
		new_rel_allvisible = new_rel_pages;

	new_frozen_xid = FreezeLimit;
	if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
		new_frozen_xid = InvalidTransactionId;

	vac_update_relstats(onerel,
						new_rel_pages,
						new_rel_tuples,
						new_rel_allvisible,
						vacrelstats->hasindex,
						new_frozen_xid);

	/* report results to the stats collector, too */
	pgstat_report_vacuum(RelationGetRelid(onerel),
						 onerel->rd_rel->relisshared,
						 new_rel_tuples);

	/* and log the action if appropriate */
	if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
	{
		TimestampTz	endtime = GetCurrentTimestamp();

		if (Log_autovacuum_min_duration == 0 ||
			TimestampDifferenceExceeds(starttime, endtime,
									   Log_autovacuum_min_duration))
		{
			TimestampDifference(starttime, endtime, &secs, &usecs);

			read_rate = 0;
			write_rate = 0;
			if ((secs > 0) || (usecs > 0))
			{
				read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
					(secs + usecs / 1000000.0);
				write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
					(secs + usecs / 1000000.0);
			}
			ereport(LOG,
					(errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
							"pages: %d removed, %d remain\n"
							"tuples: %.0f removed, %.0f remain\n"
							"buffer usage: %d hits, %d misses, %d dirtied\n"
							"avg read rate: %.3f MiB/s, avg write rate: %.3f MiB/s\n"
							"system usage: %s",
							get_database_name(MyDatabaseId),
							get_namespace_name(RelationGetNamespace(onerel)),
							RelationGetRelationName(onerel),
							vacrelstats->num_index_scans,
							vacrelstats->pages_removed,
							vacrelstats->rel_pages,
							vacrelstats->tuples_deleted,
							vacrelstats->new_rel_tuples,
							VacuumPageHit,
							VacuumPageMiss,
							VacuumPageDirty,
							read_rate, write_rate,
							pg_rusage_show(&ru0))));
		}
	}
}
Example #11
/* Main loop of walsender process */
static int
WalSndLoop(void)
{
	char	   *output_message;
	bool		caughtup = false;

	/*
	 * Allocate buffer that will be used for each output message.  We do this
	 * just once to reduce palloc overhead.  The buffer must be made large
	 * enough for maximum-sized messages.
	 */
	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);

	/*
	 * Allocate buffer that will be used for processing reply messages.  As
	 * above, do this just once to reduce palloc overhead.
	 */
	initStringInfo(&reply_message);

	/* Initialize the last reply timestamp */
	last_reply_timestamp = GetCurrentTimestamp();

	/* Loop forever, unless we get an error */
	for (;;)
	{
		/* Clear any already-pending wakeups */
		ResetLatch(&MyWalSnd->latch);

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive())
			exit(1);

		/* Process any requests or signals received recently */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
			SyncRepInitConfig();
		}

		/* Normal exit from the walsender is here */
		if (walsender_shutdown_requested)
		{
			/* Inform the standby that XLOG streaming is done */
			pq_puttextmessage('C', "COPY 0");
			pq_flush();

			proc_exit(0);
		}

		/* Check for input from the client */
		ProcessRepliesIfAny();

		/*
		 * If we don't have any pending data in the output buffer, try to send
		 * some more.  If there is some, we don't bother to call XLogSend
		 * again until we've flushed it ... but we'd better assume we are not
		 * caught up.
		 */
		if (!pq_is_send_pending())
			XLogSend(output_message, &caughtup);
		else
			caughtup = false;

		/* Try to flush pending output to the client */
		if (pq_flush_if_writable() != 0)
			break;

		/* If nothing remains to be sent right now ... */
		if (caughtup && !pq_is_send_pending())
		{
			/*
			 * If we're in catchup state, move to streaming.  This is an
			 * important state change for users to know about, since before
			 * this point data loss might occur if the primary dies and we
			 * need to failover to the standby. The state change is also
			 * important for synchronous replication, since commits that
			 * started to wait at that point might wait for some time.
			 */
			if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
			{
				ereport(DEBUG1,
						(errmsg("standby \"%s\" has now caught up with primary",
								application_name)));
				WalSndSetState(WALSNDSTATE_STREAMING);
			}

			/*
			 * When SIGUSR2 arrives, we send any outstanding logs up to the
			 * shutdown checkpoint record (i.e., the latest record) and exit.
			 * This may be a normal termination at shutdown, or a promotion,
			 * the walsender is not sure which.
			 */
			if (walsender_ready_to_stop)
			{
				/* ... let's just be real sure we're caught up ... */
				XLogSend(output_message, &caughtup);
				if (caughtup && !pq_is_send_pending())
				{
					walsender_shutdown_requested = true;
					continue;		/* don't want to wait more */
				}
			}
		}

		/*
		 * We don't block if not caught up, unless there is unsent data
		 * pending in which case we'd better block until the socket is
		 * write-ready.  This test is only needed for the case where XLogSend
		 * loaded a subset of the available data but then pq_flush_if_writable
		 * flushed it all --- we should immediately try to send more.
		 */
		if (caughtup || pq_is_send_pending())
		{
			TimestampTz finish_time = 0;
			long		sleeptime = -1;
			int			wakeEvents;

			wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH |
				WL_SOCKET_READABLE;
			if (pq_is_send_pending())
				wakeEvents |= WL_SOCKET_WRITEABLE;

			/* Determine time until replication timeout */
			if (replication_timeout > 0)
			{
				long		secs;
				int			usecs;

				finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
														  replication_timeout);
				TimestampDifference(GetCurrentTimestamp(),
									finish_time, &secs, &usecs);
				sleeptime = secs * 1000 + usecs / 1000;
				/* Avoid Assert in WaitLatchOrSocket if timeout is past */
				if (sleeptime < 0)
					sleeptime = 0;
				wakeEvents |= WL_TIMEOUT;
			}

			/* Sleep until something happens or replication timeout */
			WaitLatchOrSocket(&MyWalSnd->latch, wakeEvents,
							  MyProcPort->sock, sleeptime);

			/*
			 * Check for replication timeout.  Note we ignore the corner case
			 * possibility that the client replied just as we reached the
			 * timeout ... he's supposed to reply *before* that.
			 */
			if (replication_timeout > 0 &&
				GetCurrentTimestamp() >= finish_time)
			{
				/*
				 * Since typically expiration of replication timeout means
				 * communication problem, we don't send the error message to
				 * the standby.
				 */
				ereport(COMMERROR,
						(errmsg("terminating walsender process due to replication timeout")));
				break;
			}
		}
	}

	/*
	 * Get here on send failure.  Clean up and exit.
	 *
	 * Reset whereToSendOutput to prevent ereport from attempting to send any
	 * more messages to the standby.
	 */
	if (whereToSendOutput == DestRemote)
		whereToSendOutput = DestNone;

	proc_exit(0);
	return 1;					/* keep the compiler quiet */
}
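
The sleep computation above is a deadline pattern: add replication_timeout milliseconds to the last reply time, diff against now, and clamp at zero so an already-past deadline never produces a negative wait (the clamp that avoids the WaitLatchOrSocket assertion). A standalone sketch of the same math using epoch microseconds; current_us and the 60-second timeout are illustrative stand-ins.

#include <stdio.h>
#include <sys/time.h>

static long long
current_us(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return (long long) tv.tv_sec * 1000000 + tv.tv_usec;
}

/* Milliseconds from now until the deadline, clamped at 0 when it is past. */
static long
sleeptime_until(long long now_us, long long deadline_us)
{
	long long	diff = deadline_us - now_us;

	if (diff < 0)
		diff = 0;
	return (long) (diff / 1000);
}

int
main(void)
{
	long long	now = current_us();
	long long	deadline = now + 60 * 1000000LL;	/* 60 s timeout */

	printf("sleep %ld ms\n", sleeptime_until(now, deadline));	/* ~60000 */
	return 0;
}
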
Example #12
/*
 * ProcSleep -- put a process to sleep on the specified lock
 *
 * Caller must have set MyProc->heldLocks to reflect locks already held
 * on the lockable object by this process (under all XIDs).
 *
 * The lock table's partition lock must be held at entry, and will be held
 * at exit.
 *
 * Result: STATUS_OK if we acquired the lock, STATUS_ERROR if not (deadlock).
 *
 * ASSUME: that no one will fiddle with the queue until after
 *		we release the partition lock.
 *
 * NOTES: The process queue is now a priority queue for locking.
 */
int
ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
{
	LOCKMODE	lockmode = locallock->tag.mode;
	LOCK	   *lock = locallock->lock;
	PROCLOCK   *proclock = locallock->proclock;
	uint32		hashcode = locallock->hashcode;
	LWLock	   *partitionLock = LockHashPartitionLock(hashcode);
	PROC_QUEUE *waitQueue = &(lock->waitProcs);
	LOCKMASK	myHeldLocks = MyProc->heldLocks;
	bool		early_deadlock = false;
	bool		allow_autovacuum_cancel = true;
	int			myWaitStatus;
	PGPROC	   *proc;
	int			i;

	/*
	 * Determine where to add myself in the wait queue.
	 *
	 * Normally I should go at the end of the queue.  However, if I already
	 * hold locks that conflict with the request of any previous waiter, put
	 * myself in the queue just in front of the first such waiter. This is not
	 * a necessary step, since deadlock detection would move me to before that
	 * waiter anyway; but it's relatively cheap to detect such a conflict
	 * immediately, and avoid delaying till deadlock timeout.
	 *
	 * Special case: if I find I should go in front of some waiter, check to
	 * see if I conflict with already-held locks or the requests before that
	 * waiter.  If not, then just grant myself the requested lock immediately.
	 * This is the same as the test for immediate grant in LockAcquire, except
	 * we are only considering the part of the wait queue before my insertion
	 * point.
	 */
	if (myHeldLocks != 0)
	{
		LOCKMASK	aheadRequests = 0;

		proc = (PGPROC *) waitQueue->links.next;
		for (i = 0; i < waitQueue->size; i++)
		{
			/* Must he wait for me? */
			if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks)
			{
				/* Must I wait for him ? */
				if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
				{
					/*
					 * Yes, so we have a deadlock.  Easiest way to clean up
					 * correctly is to call RemoveFromWaitQueue(), but we
					 * can't do that until we are *on* the wait queue. So, set
					 * a flag to check below, and break out of loop.  Also,
					 * record deadlock info for later message.
					 */
					RememberSimpleDeadLock(MyProc, lockmode, lock, proc);
					early_deadlock = true;
					break;
				}
				/* I must go before this waiter.  Check special case. */
				if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
					LockCheckConflicts(lockMethodTable,
									   lockmode,
									   lock,
									   proclock) == STATUS_OK)
				{
					/* Skip the wait and just grant myself the lock. */
					GrantLock(lock, proclock, lockmode);
					GrantAwaitedLock();
					return STATUS_OK;
				}
				/* Break out of loop to put myself before him */
				break;
			}
			/* Nope, so advance to next waiter */
			aheadRequests |= LOCKBIT_ON(proc->waitLockMode);
			proc = (PGPROC *) proc->links.next;
		}

		/*
		 * If we fall out of loop normally, proc points to waitQueue head, so
		 * we will insert at tail of queue as desired.
		 */
	}
	else
	{
		/* I hold no locks, so I can't push in front of anyone. */
		proc = (PGPROC *) &(waitQueue->links);
	}

	/*
	 * Insert self into queue, ahead of the given proc (or at tail of queue).
	 */
	SHMQueueInsertBefore(&(proc->links), &(MyProc->links));
	waitQueue->size++;

	lock->waitMask |= LOCKBIT_ON(lockmode);

	/* Set up wait information in PGPROC object, too */
	MyProc->waitLock = lock;
	MyProc->waitProcLock = proclock;
	MyProc->waitLockMode = lockmode;

	MyProc->waitStatus = STATUS_WAITING;

	/*
	 * If we detected deadlock, give up without waiting.  This must agree with
	 * CheckDeadLock's recovery code, except that we shouldn't release the
	 * semaphore since we haven't tried to lock it yet.
	 */
	if (early_deadlock)
	{
		RemoveFromWaitQueue(MyProc, hashcode);
		return STATUS_ERROR;
	}

	/* mark that we are waiting for a lock */
	lockAwaited = locallock;

	/*
	 * Release the lock table's partition lock.
	 *
	 * NOTE: this may also cause us to exit critical-section state, possibly
	 * allowing a cancel/die interrupt to be accepted. This is OK because we
	 * have recorded the fact that we are waiting for a lock, and so
	 * LockErrorCleanup will clean up if cancel/die happens.
	 */
	LWLockRelease(partitionLock);

	/*
	 * Also, now that we will successfully clean up after an ereport, it's
	 * safe to check to see if there's a buffer pin deadlock against the
	 * Startup process.  Of course, that's only necessary if we're doing Hot
	 * Standby and are not the Startup process ourselves.
	 */
	if (RecoveryInProgress() && !InRecovery)
		CheckRecoveryConflictDeadlock();

	/* Reset deadlock_state before enabling the timeout handler */
	deadlock_state = DS_NOT_YET_CHECKED;
	got_deadlock_timeout = false;

	/*
	 * Set timer so we can wake up after awhile and check for a deadlock. If a
	 * deadlock is detected, the handler releases the process's semaphore and
	 * sets MyProc->waitStatus = STATUS_ERROR, allowing us to know that we
	 * must report failure rather than success.
	 *
	 * By delaying the check until we've waited for a bit, we can avoid
	 * running the rather expensive deadlock-check code in most cases.
	 *
	 * If LockTimeout is set, also enable the timeout for that.  We can save a
	 * few cycles by enabling both timeout sources in one call.
	 */
	if (LockTimeout > 0)
	{
		EnableTimeoutParams timeouts[2];

		timeouts[0].id = DEADLOCK_TIMEOUT;
		timeouts[0].type = TMPARAM_AFTER;
		timeouts[0].delay_ms = DeadlockTimeout;
		timeouts[1].id = LOCK_TIMEOUT;
		timeouts[1].type = TMPARAM_AFTER;
		timeouts[1].delay_ms = LockTimeout;
		enable_timeouts(timeouts, 2);
	}
	else
		enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);

	/*
	 * If somebody wakes us between LWLockRelease and WaitLatch, the latch
	 * will not wait. But a set latch does not necessarily mean that the lock
	 * is free now, as there are many other sources for latch sets than
	 * somebody releasing the lock.
	 *
	 * We process interrupts whenever the latch has been set, so cancel/die
	 * interrupts are processed quickly. This means we must not mind losing
	 * control to a cancel/die interrupt here.  We don't, because we have no
	 * shared-state-change work to do after being granted the lock (the
	 * grantor did it all).  We do have to worry about canceling the deadlock
	 * timeout and updating the locallock table, but if we lose control to an
	 * error, LockErrorCleanup will fix that up.
	 */
	do
	{
		WaitLatch(MyLatch, WL_LATCH_SET, 0);
		ResetLatch(MyLatch);
		/* check for deadlocks first, as that's probably log-worthy */
		if (got_deadlock_timeout)
		{
			CheckDeadLock();
			got_deadlock_timeout = false;
		}
		CHECK_FOR_INTERRUPTS();

		/*
		 * waitStatus could change from STATUS_WAITING to something else
		 * asynchronously.  Read it just once per loop to prevent surprising
		 * behavior (such as missing log messages).
		 */
		myWaitStatus = *((volatile int *) &MyProc->waitStatus);

		/*
		 * If we are not deadlocked, but are waiting on an autovacuum-induced
		 * task, send a signal to interrupt it.
		 */
		if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel)
		{
			PGPROC	   *autovac = GetBlockingAutoVacuumPgproc();
			PGXACT	   *autovac_pgxact = &ProcGlobal->allPgXact[autovac->pgprocno];

			LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

			/*
			 * Only do it if the worker is not working to protect against Xid
			 * wraparound.
			 */
			if ((autovac_pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) &&
				!(autovac_pgxact->vacuumFlags & PROC_VACUUM_FOR_WRAPAROUND))
			{
				int			pid = autovac->pid;
				StringInfoData locktagbuf;
				StringInfoData logbuf;	/* errdetail for server log */

				initStringInfo(&locktagbuf);
				initStringInfo(&logbuf);
				DescribeLockTag(&locktagbuf, &lock->tag);
				appendStringInfo(&logbuf,
								 _("Process %d waits for %s on %s."),
								 MyProcPid,
							  GetLockmodeName(lock->tag.locktag_lockmethodid,
											  lockmode),
								 locktagbuf.data);

				/* release lock as quickly as possible */
				LWLockRelease(ProcArrayLock);

				ereport(LOG,
					  (errmsg("sending cancel to blocking autovacuum PID %d",
							  pid),
					   errdetail_log("%s", logbuf.data)));

				pfree(logbuf.data);
				pfree(locktagbuf.data);

				/* send the autovacuum worker Back to Old Kent Road */
				if (kill(pid, SIGINT) < 0)
				{
					/* Just a warning to allow multiple callers */
					ereport(WARNING,
							(errmsg("could not send signal to process %d: %m",
									pid)));
				}
			}
			else
				LWLockRelease(ProcArrayLock);

			/* prevent signal from being resent more than once */
			allow_autovacuum_cancel = false;
		}

		/*
		 * If awoken after the deadlock check interrupt has run, and
		 * log_lock_waits is on, then report about the wait.
		 */
		if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED)
		{
			StringInfoData buf,
						lock_waiters_sbuf,
						lock_holders_sbuf;
			const char *modename;
			long		secs;
			int			usecs;
			long		msecs;
			SHM_QUEUE  *procLocks;
			PROCLOCK   *proclock;
			bool		first_holder = true,
						first_waiter = true;
			int			lockHoldersNum = 0;

			initStringInfo(&buf);
			initStringInfo(&lock_waiters_sbuf);
			initStringInfo(&lock_holders_sbuf);

			DescribeLockTag(&buf, &locallock->tag.lock);
			modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid,
									   lockmode);
			TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT),
								GetCurrentTimestamp(),
								&secs, &usecs);
			msecs = secs * 1000 + usecs / 1000;
			usecs = usecs % 1000;

			/*
			 * we loop over the lock's procLocks to gather a list of all
			 * holders and waiters. Thus we will be able to provide more
			 * detailed information for lock debugging purposes.
			 *
			 * lock->procLocks contains all processes which hold or wait for
			 * this lock.
			 */

			LWLockAcquire(partitionLock, LW_SHARED);

			procLocks = &(lock->procLocks);
			proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
											   offsetof(PROCLOCK, lockLink));

			while (proclock)
			{
				/*
				 * we are a waiter if myProc->waitProcLock == proclock; we are
				 * a holder if it is NULL or something different
				 */
				if (proclock->tag.myProc->waitProcLock == proclock)
				{
					if (first_waiter)
					{
						appendStringInfo(&lock_waiters_sbuf, "%d",
										 proclock->tag.myProc->pid);
						first_waiter = false;
					}
					else
						appendStringInfo(&lock_waiters_sbuf, ", %d",
										 proclock->tag.myProc->pid);
				}
				else
				{
					if (first_holder)
					{
						appendStringInfo(&lock_holders_sbuf, "%d",
										 proclock->tag.myProc->pid);
						first_holder = false;
					}
					else
						appendStringInfo(&lock_holders_sbuf, ", %d",
										 proclock->tag.myProc->pid);

					lockHoldersNum++;
				}

				proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
											   offsetof(PROCLOCK, lockLink));
			}

			LWLockRelease(partitionLock);

			if (deadlock_state == DS_SOFT_DEADLOCK)
				ereport(LOG,
						(errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
						   "Processes holding the lock: %s. Wait queue: %s.",
											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			else if (deadlock_state == DS_HARD_DEADLOCK)
			{
				/*
				 * This message is a bit redundant with the error that will be
				 * reported subsequently, but in some cases the error report
				 * might not make it to the log (eg, if it's caught by an
				 * exception handler), and we want to ensure all long-wait
				 * events get logged.
				 */
				ereport(LOG,
						(errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
						   "Processes holding the lock: %s. Wait queue: %s.",
											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			}

			if (myWaitStatus == STATUS_WAITING)
				ereport(LOG,
						(errmsg("process %d still waiting for %s on %s after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
						 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
						   "Processes holding the lock: %s. Wait queue: %s.",
											   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			else if (myWaitStatus == STATUS_OK)
				ereport(LOG,
					(errmsg("process %d acquired %s on %s after %ld.%03d ms",
							MyProcPid, modename, buf.data, msecs, usecs)));
			else
			{
				Assert(myWaitStatus == STATUS_ERROR);

				/*
				 * Currently, the deadlock checker always kicks its own
				 * process, which means that we'll only see STATUS_ERROR when
				 * deadlock_state == DS_HARD_DEADLOCK, and there's no need to
				 * print redundant messages.  But for completeness and
				 * future-proofing, print a message if it looks like someone
				 * else kicked us off the lock.
				 */
				if (deadlock_state != DS_HARD_DEADLOCK)
					ereport(LOG,
							(errmsg("process %d failed to acquire %s on %s after %ld.%03d ms",
								MyProcPid, modename, buf.data, msecs, usecs),
							 (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
						   "Processes holding the lock: %s. Wait queue: %s.",
												   lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
			}

			/*
			 * At this point we might still need to wait for the lock. Reset
			 * state so we don't print the above messages again.
			 */
			deadlock_state = DS_NO_DEADLOCK;

			pfree(buf.data);
			pfree(lock_holders_sbuf.data);
			pfree(lock_waiters_sbuf.data);
		}
	} while (myWaitStatus == STATUS_WAITING);

	/*
	 * Disable the timers, if they are still running.  As in LockErrorCleanup,
	 * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has
	 * already caused QueryCancelPending to become set, we want the cancel to
	 * be reported as a lock timeout, not a user cancel.
	 */
	if (LockTimeout > 0)
	{
		DisableTimeoutParams timeouts[2];

		timeouts[0].id = DEADLOCK_TIMEOUT;
		timeouts[0].keep_indicator = false;
		timeouts[1].id = LOCK_TIMEOUT;
		timeouts[1].keep_indicator = true;
		disable_timeouts(timeouts, 2);
	}
	else
		disable_timeout(DEADLOCK_TIMEOUT, false);

	/*
	 * Re-acquire the lock table's partition lock.  We have to do this to hold
	 * off cancel/die interrupts before we can mess with lockAwaited (else we
	 * might have a missed or duplicated locallock update).
	 */
	LWLockAcquire(partitionLock, LW_EXCLUSIVE);

	/*
	 * We no longer want LockErrorCleanup to do anything.
	 */
	lockAwaited = NULL;

	/*
	 * If we got the lock, be sure to remember it in the locallock table.
	 */
	if (MyProc->waitStatus == STATUS_OK)
		GrantAwaitedLock();

	/*
	 * We don't have to do anything else, because the awaker did all the
	 * necessary update of the lock table and MyProc.
	 */
	return MyProc->waitStatus;
}
Example #13
/* Main loop of walsender process */
static int
WalSndLoop(void)
{
    char	   *output_message;
    bool		caughtup = false;

    /*
     * Allocate buffer that will be used for each output message.  We do this
     * just once to reduce palloc overhead.  The buffer must be made large
     * enough for maximum-sized messages.
     */
    output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);

    /*
     * Allocate buffer that will be used for processing reply messages.  As
     * above, do this just once to reduce palloc overhead.
     */
    initStringInfo(&reply_message);

    /* Initialize the last reply timestamp */
    last_reply_timestamp = GetCurrentTimestamp();

    /* Loop forever, unless we get an error */
    for (;;)
    {
        /*
         * Emergency bailout if postmaster has died.  This is to avoid the
         * necessity for manual cleanup of all postmaster children.
         */
        if (!PostmasterIsAlive(true))
            exit(1);

        /* Process any requests or signals received recently */
        if (got_SIGHUP)
        {
            got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);
            SyncRepInitConfig();
        }

        /* Normal exit from the walsender is here */
        if (walsender_shutdown_requested)
        {
            /* Inform the standby that XLOG streaming was done */
            pq_puttextmessage('C', "COPY 0");
            pq_flush();

            proc_exit(0);
        }

        /*
         * If we don't have any pending data in the output buffer, try to send
         * some more.
         */
        if (!pq_is_send_pending())
        {
            XLogSend(output_message, &caughtup);

            /*
             * Even if we wrote all the WAL that was available when we started
             * sending, more might have arrived while we were sending this
             * batch. We had the latch set while sending, so we have not
             * received any signals from that time. Let's arm the latch again,
             * and after that check that we're still up-to-date.
             */
            if (caughtup && !pq_is_send_pending())
            {
                ResetLatch(&MyWalSnd->latch);

                XLogSend(output_message, &caughtup);
            }
        }

        /* Flush pending output to the client */
        if (pq_flush_if_writable() != 0)
            break;

        /*
         * When SIGUSR2 arrives, we send any outstanding logs up to the
         * shutdown checkpoint record (i.e., the latest record) and exit.
         */
        if (walsender_ready_to_stop && !pq_is_send_pending())
        {
            XLogSend(output_message, &caughtup);
            ProcessRepliesIfAny();
            if (caughtup && !pq_is_send_pending())
                walsender_shutdown_requested = true;
        }

        if ((caughtup || pq_is_send_pending()) &&
                !got_SIGHUP &&
                !walsender_shutdown_requested)
        {
            TimestampTz finish_time = 0;
            long		sleeptime;

            /* Reschedule replication timeout */
            if (replication_timeout > 0)
            {
                long		secs;
                int			usecs;

                finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
                              replication_timeout);
                TimestampDifference(GetCurrentTimestamp(),
                                    finish_time, &secs, &usecs);
                sleeptime = secs * 1000 + usecs / 1000;
                if (WalSndDelay < sleeptime)
                    sleeptime = WalSndDelay;
            }
            else
            {
                /*
                 * XXX: Without timeout, we don't really need the periodic
                 * wakeups anymore, WaitLatchOrSocket should reliably wake up
                 * as soon as something interesting happens.
                 */
                sleeptime = WalSndDelay;
            }

            /* Sleep */
            WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
                              true, pq_is_send_pending(),
                              sleeptime);

            /* Check for replication timeout */
            if (replication_timeout > 0 &&
                    GetCurrentTimestamp() >= finish_time)
            {
                /*
                 * Since typically expiration of replication timeout means
                 * communication problem, we don't send the error message to
                 * the standby.
                 */
                ereport(COMMERROR,
                        (errmsg("terminating walsender process due to replication timeout")));
                break;
            }
        }

        /*
         * If we're in catchup state, see if it's time to move to streaming.
         * This is an important state change for users, since before this
         * point data loss might occur if the primary dies and we need to
         * failover to the standby. The state change is also important for
         * synchronous replication, since commits that started to wait at that
         * point might wait for some time.
         */
        if (MyWalSnd->state == WALSNDSTATE_CATCHUP && caughtup)
        {
            ereport(DEBUG1,
                    (errmsg("standby \"%s\" has now caught up with primary",
                            application_name)));
            WalSndSetState(WALSNDSTATE_STREAMING);
        }

        ProcessRepliesIfAny();
    }

    /*
     * Get here on send failure.  Clean up and exit.
     *
     * Reset whereToSendOutput to prevent ereport from attempting to send any
     * more messages to the standby.
     */
    if (whereToSendOutput == DestRemote)
        whereToSendOutput = DestNone;

    proc_exit(0);
    return 1;					/* keep the compiler quiet */
}
Example #14
File: autostats.c  Project: 50wu/gpdb
/*
 * This method decides whether to run analyze, based on the query, the number of
 * modified tuples, and the policy set via gp_autostats_mode. The following modes
 * are currently supported:
 * none			:	no automatic analyzes are issued; simply return.
 * on_change	:	if the number of modified tuples > gp_onchange_threshold, an automatic analyze is issued.
 * on_no_stats	:	if the operation is a CTAS/insert-select and there are no stats on the modified table,
 *					an automatic analyze is issued.
 */
void
auto_stats(AutoStatsCmdType cmdType, Oid relationOid, uint64 ntuples, bool inFunction)
{
	TimestampTz start;
	bool		policyCheck = false;

	start = GetCurrentTimestamp();

	if (Gp_role != GP_ROLE_DISPATCH || relationOid == InvalidOid || rel_is_partitioned(relationOid))
	{
		return;
	}

	Assert(relationOid != InvalidOid);
	Assert(cmdType >= 0 && cmdType <= AUTOSTATS_CMDTYPE_SENTINEL);		/* it is a valid command
																		 * as per auto-stats */

	GpAutoStatsModeValue actual_gp_autostats_mode;

	if (inFunction)
	{
		actual_gp_autostats_mode = gp_autostats_mode_in_functions;
	}
	else
	{
		actual_gp_autostats_mode = gp_autostats_mode;
	}

	switch (actual_gp_autostats_mode)
	{
		case GP_AUTOSTATS_ON_CHANGE:
			policyCheck = autostats_on_change_check(cmdType, ntuples);
			break;
		case GP_AUTOSTATS_ON_NO_STATS:
			policyCheck = autostats_on_no_stats_check(cmdType, relationOid);
			break;
		default:
			Assert(actual_gp_autostats_mode == GP_AUTOSTATS_NONE);
			policyCheck = false;
			break;
	}

	if (!policyCheck)
	{
		elog(DEBUG3, "In mode %s, command %s on (dboid,tableoid)=(%d,%d) modifying " UINT64_FORMAT " tuples did not issue Auto-ANALYZE.",
			 gpvars_show_gp_autostats_mode(),
			 autostats_cmdtype_to_string(cmdType),
			 MyDatabaseId,
			 relationOid,
			 ntuples);

		return;
	}

	if (log_autostats)
	{
		const char *autostats_mode;

		if (inFunction)
		{
			autostats_mode = gpvars_show_gp_autostats_mode_in_functions();
		}
		else
		{
			autostats_mode = gpvars_show_gp_autostats_mode();
		}
		elog(LOG, "In mode %s, command %s on (dboid,tableoid)=(%d,%d) modifying " UINT64_FORMAT " tuples caused Auto-ANALYZE.",
			 autostats_mode,
			 autostats_cmdtype_to_string(cmdType),
			 MyDatabaseId,
			 relationOid,
			 ntuples);
	}

	autostats_issue_analyze(relationOid);

	if (log_duration)
	{
		long		secs;
		int			usecs;
		int			msecs;

		TimestampDifference(start, GetCurrentTimestamp(), &secs, &usecs);
		msecs = usecs / 1000;
		elog(LOG, "duration: %ld.%03d ms Auto-ANALYZE", secs * 1000 + msecs, usecs % 1000);
	}
}
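
The closing elog() shows the standard duration-formatting idiom: whole milliseconds before the decimal point, the sub-millisecond microsecond remainder after it. A minimal sketch of just that formatting; the sample inputs are arbitrary.

#include <stdio.h>

/* Format a (secs, usecs) difference as "duration: <ms>.<usec-remainder> ms". */
static void
print_duration_ms(long secs, int usecs)
{
	int			msecs = usecs / 1000;

	printf("duration: %ld.%03d ms\n", secs * 1000 + msecs, usecs % 1000);
}

int
main(void)
{
	print_duration_ms(1, 234567);	/* prints "duration: 1234.567 ms" */
	return 0;
}
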
Example #15
File: cfs.c  Project: knizhnik/postgresql
/*
 * Perform garbage collection (if required) on a file.
 * @param map_path path to the file's map file (*.map).
 */
static bool cfs_gc_file(char* map_path)
{
	int md = open(map_path, O_RDWR|PG_BINARY, 0);
	FileMap* map;
	uint32 physSize;
	uint32 usedSize;
	uint32 virtSize;
	int suf = strlen(map_path)-4;
	int fd = -1, fd2 = -1, md2 = -1;
	bool succeed = true;

	if (md < 0) { 
		elog(LOG, "Failed to open map file %s: %m", map_path);
		return false;
	}
	map = cfs_mmap(md);
	if (map == MAP_FAILED) {
		elog(LOG, "Failed to map file %s: %m", map_path);
		close(md);
		return false;
	}
	usedSize = pg_atomic_read_u32(&map->usedSize);
	physSize = pg_atomic_read_u32(&map->physSize);
	virtSize = pg_atomic_read_u32(&map->virtSize);
		
	if ((physSize - usedSize)*100 > physSize*cfs_gc_threshold) /* do we need to perform defragmentation? */
	{ 
		long delay = CFS_LOCK_MIN_TIMEOUT;		
		char* file_path = (char*)palloc(suf+1);
		char* map_bck_path = (char*)palloc(suf+10);
		char* file_bck_path = (char*)palloc(suf+5);
		FileMap* newMap = (FileMap*)palloc0(sizeof(FileMap));
		uint32 newSize = 0;
		inode_t** inodes = (inode_t**)palloc(RELSEG_SIZE*sizeof(inode_t*));
		bool remove_backups = true;
		int n_pages = virtSize / BLCKSZ;
		TimestampTz startTime, endTime;
		long secs;
		int usecs;
		int i;
		
		startTime = GetCurrentTimestamp();

		memcpy(file_path, map_path, suf);
		file_path[suf] = '\0';
		strcat(strcpy(map_bck_path, map_path), ".bck");
		strcat(strcpy(file_bck_path, file_path), ".bck");

		while (true) { 
			uint32 access_count = 0;
			if (pg_atomic_compare_exchange_u32(&map->lock, &access_count, CFS_GC_LOCK)) {				
				break;
			}
			if (access_count >= CFS_GC_LOCK) { 
				/* Uhhh... looks like the last GC was interrupted.
				 * Try to recover the file.
				 */
				if (access(file_bck_path, R_OK) != 0) {
					/* There is no backup file: new map should be constructed */					
					md2 = open(map_bck_path, O_RDWR|PG_BINARY, 0);
					if (md2 >= 0) { 
						/* Recover map */
						if (!cfs_read_file(md2, newMap, sizeof(FileMap))) { 
							elog(LOG, "Failed to read file %s: %m", map_bck_path);
							goto Cleanup;
						}
						close(md2);
						md2 = -1;
						newSize = pg_atomic_read_u32(&newMap->usedSize);
						remove_backups = false;
						goto ReplaceMap;
					}
				} else { 
					/* Presence of backup file means that we still have unchanged data and map files.
					 * Just remove backup files, grab lock and continue processing
					 */
					unlink(file_bck_path);
					unlink(map_bck_path);
					break;
				}
			}
			pg_usleep(delay);
			if (delay < CFS_LOCK_MAX_TIMEOUT) { 
				delay *= 2;
			}
		}				 			
		md2 = open(map_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600);
		if (md2 < 0) { 
			goto Cleanup;
		}
		for (i = 0; i < n_pages; i++) { 
			newMap->inodes[i] = map->inodes[i];
			inodes[i] = &newMap->inodes[i];
		}
		/* sort inodes by offset to improve read locality */
		qsort(inodes, n_pages, sizeof(inode_t*), cfs_cmp_page_offs);
		
		fd = open(file_path, O_RDWR|PG_BINARY, 0);
		if (fd < 0) { 
			goto Cleanup;
		}
		
		fd2 = open(file_bck_path, O_CREAT|O_RDWR|PG_BINARY|O_TRUNC, 0600);
		if (fd2 < 0) { 
			goto Cleanup;
		}
		
		for (i = 0; i < n_pages; i++) { 
			int size = CFS_INODE_SIZE(*inodes[i]);
			if (size != 0) { 
				char block[BLCKSZ];
				off_t rc PG_USED_FOR_ASSERTS_ONLY;
				uint32 offs = CFS_INODE_OFFS(*inodes[i]);
				Assert(size <= BLCKSZ);	
				rc = lseek(fd, offs, SEEK_SET);
				Assert(rc == offs);
				
				if (!cfs_read_file(fd, block, size)) { 
					elog(LOG, "Failed to read file %s: %m", file_path);
					goto Cleanup;
				}
				
				if (!cfs_write_file(fd2, block, size)) { 
					elog(LOG, "Failed to write file %s: %m", file_bck_path);
					goto Cleanup;
				}
				offs = newSize;
				newSize += size;
				*inodes[i] = CFS_INODE(size, offs);
			}
		}
		pg_atomic_write_u32(&map->usedSize, newSize);

		if (close(fd) < 0) { 
			elog(LOG, "Failed to close file %s: %m", file_path);
			goto Cleanup;
		}
		fd = -1;

		/* Persist copy of data file */
		if (pg_fsync(fd2) < 0) { 
			elog(LOG, "Failed to sync file %s: %m", file_bck_path);
			goto Cleanup;
		}
		if (close(fd2) < 0) { 
			elog(LOG, "Failed to close file %s: %m", file_bck_path);
			goto Cleanup;
		}
		fd2 = -1;

		/* Persist copy of map file */
		if (!cfs_write_file(md2, newMap, sizeof(FileMap))) {
			elog(LOG, "Failed to write file %s: %m", map_bck_path);
			goto Cleanup;
		}
		if (pg_fsync(md2) < 0) { 
			elog(LOG, "Failed to sync file %s: %m", map_bck_path);
			goto Cleanup;
		}
		if (close(md2) < 0) { 
			elog(LOG, "Failed to close file %s: %m", map_bck_path);
			goto Cleanup;
		}
		md2 = -1;

		/* Persist map with CFS_GC_LOCK set: in case of crash we will know that map may be changed by GC */
		if (cfs_msync(map) < 0) {
			elog(LOG, "Failed to sync map %s: %m", map_path);
			goto Cleanup;
		}
		if (pg_fsync(md) < 0) { 
			elog(LOG, "Failed to sync file %s: %m", map_path);
			goto Cleanup;
		}
		
		/* 
		 * Now all information necessary for recovery is stored.
		 * We are ready to replace the existing file with the defragmented one.
		 * Use rename() and rely on the file system to make this operation atomic.
		 */
		remove_backups = false;
		if (rename(file_bck_path, file_path) < 0) { 
			elog(LOG, "Failed to rename file %s: %m", file_path);
			goto Cleanup;
		}
	  ReplaceMap:
		/* At this moment the defragmented file version is stored. We can perform an in-place update of the map.
		 * If a crash happens at this point, the map can be recovered from the backup file. */
		memcpy(map->inodes, newMap->inodes, n_pages * sizeof(inode_t));
		pg_atomic_write_u32(&map->usedSize, newSize);
		pg_atomic_write_u32(&map->physSize, newSize);
		map->generation += 1; /* force all backends to reopen the file */
		
		/* Before removing backup files and releasing locks we need to flush updated map file */
		if (cfs_msync(map) < 0) {
			elog(LOG, "Failed to sync map %s: %m", map_path);
			goto Cleanup;
		}
		if (pg_fsync(md) < 0) { 
			elog(LOG, "Failed to sync file %s: %m", map_path);
		  Cleanup:
			if (fd >= 0) close(fd);
			if (fd2 >= 0) close(fd2);
			if (md2 >= 0) close(md2);
			if (remove_backups) { 
				unlink(file_bck_path);
				unlink(map_bck_path);		
				remove_backups = false;
			}	
			succeed = false;
		} else { 
			remove_backups = true; /* backups are no longer needed */
		}
		pg_atomic_fetch_sub_u32(&map->lock, CFS_GC_LOCK); /* release lock */

		/* remove map backup file */
		if (remove_backups && unlink(map_bck_path)) {
			elog(LOG, "Failed to unlink file %s: %m", map_bck_path);
			succeed = false;
		}
		
		endTime = GetCurrentTimestamp();
		TimestampDifference(startTime, endTime, &secs, &usecs);

		elog(LOG, "%d: defragment file %s: old size %d, new size %d, logical size %d, used %d, compression ratio %f, time %ld usec",
			 MyProcPid, file_path, physSize, newSize, virtSize, usedSize, (double)virtSize/newSize,
			 secs*USECS_PER_SEC + usecs);

		pfree(file_path);
		pfree(file_bck_path);
		pfree(map_bck_path);
		pfree(inodes);
		pfree(newMap);
		
		if (cfs_gc_delay != 0) { 
			int rc = WaitLatch(MyLatch,
							   WL_TIMEOUT | WL_POSTMASTER_DEATH,
							   cfs_gc_delay /* ms */ );
			if (rc & WL_POSTMASTER_DEATH) {
				exit(1);
			}
		}
	} else if (cfs_state->max_iterations == 1) { 
		elog(LOG, "%d: file %.*s: physical size %d, logical size %d, used %d, compression ratio %f",
			 MyProcPid, suf, map_path, physSize, virtSize, usedSize, (double)virtSize/physSize);
	}
	
	if (cfs_munmap(map) < 0) { 
		elog(LOG, "Failed to unmap file %s: %m", map_path);
		succeed = false;
	}
	if (close(md) < 0) { 
		elog(LOG, "Failed to close file %s: %m", map_path);
		succeed = false;
	}
	return succeed;
}
Example #16
/*
 * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
 * to resolve conflicts with other backends holding buffer pins.
 *
 * We either resolve conflicts immediately or set a SIGALRM to wake us at
 * the limit of our patience. The sleep in LockBufferForCleanup() is
 * performed here, for code clarity.
 *
 * Resolve conflict by sending a SIGUSR1 reason to all backends to check if
 * they hold one of the buffer pins that is blocking Startup process. If so,
 * backends will take an appropriate error action, ERROR or FATAL.
 *
 * We also check for deadlocks before we wait, though applications that cause
 * these will be extremely rare.  Deadlocks occur because if queries
 * wait on a lock, that must be behind an AccessExclusiveLock, which can only
 * be cleared if the Startup process replays a transaction completion record.
 * If Startup process is also waiting then that is a deadlock. The deadlock
 * can occur if the query is waiting and then the Startup sleeps, or if
 * Startup is sleeping and the query waits on a lock. We protect against
 * only the former sequence here, the latter sequence is checked prior to
 * the query sleeping, in CheckRecoveryConflictDeadlock().
 */
void
ResolveRecoveryConflictWithBufferPin(void)
{
	bool		sig_alarm_enabled = false;

	Assert(InHotStandby);

	if (MaxStandbyDelay == 0)
	{
		/*
		 * We don't want to wait, so just tell everybody holding the pin to
		 * get out of town.
		 */
		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
	}
	else if (MaxStandbyDelay < 0)
	{
		/*
		 * Send out a request to check for buffer pin deadlocks before we
		 * wait. This is fairly cheap, so no need to wait for deadlock timeout
		 * before trying to send it out.
		 */
		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
	}
	else
	{
		TimestampTz then = GetLatestXLogTime();
		TimestampTz now = GetCurrentTimestamp();

		/* Are we past max_standby_delay? */
		if (TimestampDifferenceExceeds(then, now, MaxStandbyDelay))
		{
			/*
			 * We're already behind, so clear a path as quickly as possible.
			 */
			SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
		}
		else
		{
			TimestampTz fin_time;		/* Expected wake-up time by timer */
			long		timer_delay_secs;		/* Amount of time we set timer
												 * for */
			int			timer_delay_usecs;

			/*
			 * Send out a request to check for buffer pin deadlocks before we
			 * wait. This is fairly cheap, so no need to wait for deadlock
			 * timeout before trying to send it out.
			 */
			SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);

			/*
			 * How much longer should we wait?
			 */
			fin_time = TimestampTzPlusMilliseconds(then, MaxStandbyDelay);

			TimestampDifference(now, fin_time,
								&timer_delay_secs, &timer_delay_usecs);

			/*
			 * It's possible that the difference is less than a microsecond;
			 * ensure we don't cancel, rather than set, the interrupt.
			 */
			if (timer_delay_secs == 0 && timer_delay_usecs == 0)
				timer_delay_usecs = 1;

			if (enable_standby_sig_alarm(timer_delay_secs, timer_delay_usecs, fin_time))
				sig_alarm_enabled = true;
			else
				elog(FATAL, "could not set timer for process wakeup");
		}
	}

	/* Wait to be signaled by UnpinBuffer() */
	ProcWaitForSignal();

	if (sig_alarm_enabled)
	{
		if (!disable_standby_sig_alarm())
			elog(FATAL, "could not disable timer for process wakeup");
	}
}