Exemplo n.º 1
0
/*
 * Compute the deadline after which conflicting transactions should start
 * being canceled.
 *
 * The deadline is the last WAL data receipt time plus whichever delay
 * variable applies to where that WAL came from (streaming or not).  A
 * negative delay means "wait forever", which is reported as zero, i.e. a
 * time safely in the past.
 */
static TimestampTz
GetStandbyLimitTime(void)
{
	TimestampTz receipt_time;
	bool		streamed;
	int			delay_ms;

	GetXLogReceiptTime(&receipt_time, &streamed);

	/* Pick the delay variable matching the origin of the WAL data. */
	delay_ms = streamed ? max_standby_streaming_delay : max_standby_archive_delay;

	/* Delay of -1 (any negative value) means wait forever. */
	if (delay_ms < 0)
		return 0;

	return TimestampTzPlusMilliseconds(receipt_time, delay_ms);
}
Exemplo n.º 2
0
/*
 * Arrange for timeout "id" to fire delay_ms milliseconds from now.
 */
void
enable_timeout_after(TimeoutId id, int delay_ms)
{
	TimestampTz start;

	/* Timeout interrupts are disabled first, for safety. */
	disable_alarm();

	/* Queue the timeout so that it is due delay_ms after the current time. */
	start = GetCurrentTimestamp();
	enable_timeout(id, start, TimestampTzPlusMilliseconds(start, delay_ms));

	/* Finally, set the timer interrupt. */
	schedule_alarm(start);
}
Exemplo n.º 3
0
/*
 * Put the sampler back into its starting state (called by ReScan).
 *
 * The seed, block count and time limit remembered in the sampler state are
 * reused; everything derived from them is recomputed.
 */
Datum
tsm_system_time_reset(PG_FUNCTION_ARGS)
{
	TableSampleDesc	   *desc = (TableSampleDesc *) PG_GETARG_POINTER(0);
	SystemSamplerData  *state = (SystemSamplerData *) desc->tsmdata;

	/* Forget all scan progress. */
	state->lt = InvalidOffsetNumber;
	state->estblocks = 2;
	state->doneblocks = 0;

	/* The time window starts over from the current time. */
	state->start_time = GetCurrentTimestamp();
	state->end_time = TimestampTzPlusMilliseconds(state->start_time,
												  state->time);

	/* Re-seed the random state, then pick a new step and start position. */
	sampler_random_init_state(state->seed, state->randstate);
	state->step = random_relative_prime(state->nblocks, state->randstate);
	state->lb = sampler_random_fract(state->randstate) * (state->nblocks / state->step);

	PG_RETURN_VOID();
}
Exemplo n.º 4
0
/*
 * Set up the sampler state for a new scan.
 *
 * Argument 0 is the TableSampleDesc, argument 1 the random seed and
 * argument 2 the time limit.  A NULL or non-positive time limit is rejected
 * with an error.  The freshly allocated state is stored in the descriptor's
 * tsmdata field.
 */
Datum
tsm_system_time_init(PG_FUNCTION_ARGS)
{
	TableSampleDesc	   *desc = (TableSampleDesc *) PG_GETARG_POINTER(0);
	uint32				seed = PG_GETARG_UINT32(1);
	int32				limit;
	SystemSamplerData  *state;

	/* A NULL limit is mapped to -1 so that the range check below rejects it. */
	if (PG_ARGISNULL(2))
		limit = -1;
	else
		limit = PG_GETARG_INT32(2);

	if (limit < 1)
	{
		ereport(ERROR,
				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
				 errmsg("invalid time limit"),
				 errhint("Time limit must be positive integer value.")));
	}

	state = palloc0(sizeof(SystemSamplerData));

	/* Remember initial values for reinit */
	state->seed = seed;
	state->nblocks = desc->heapScan->rs_nblocks;
	state->time = limit;

	/* No scan progress yet. */
	state->lt = InvalidOffsetNumber;
	state->estblocks = 2;
	state->doneblocks = 0;

	/* The time window runs from now until the limit has elapsed. */
	state->start_time = GetCurrentTimestamp();
	state->end_time = TimestampTzPlusMilliseconds(state->start_time,
												  state->time);

	sampler_random_init_state(state->seed, state->randstate);

	/* Find relative prime as step size for linear probing. */
	state->step = random_relative_prime(state->nblocks, state->randstate);

	/*
	 * Randomize start position so that blocks close to step size don't have
	 * higher probability of being chosen on very short scan.
	 */
	state->lb = sampler_random_fract(state->randstate) * (state->nblocks / state->step);

	desc->tsmdata = (void *) state;

	PG_RETURN_VOID();
}
Exemplo n.º 5
0
/*
 * Enable several timeouts in one go.
 *
 * Each of the "count" entries in "timeouts" is handled as if
 * enable_timeout_after() (TMPARAM_AFTER) or enable_timeout_at()
 * (TMPARAM_AT) had been called for it, but the current time is fetched and
 * the alarm is scheduled only once for the whole batch.  Any other entry
 * type raises an error.
 */
void
enable_timeouts(const EnableTimeoutParams *timeouts, int count)
{
	TimestampTz now;
	int			idx;

	/* Disable timeout interrupts for safety. */
	disable_alarm();

	/* All entries are queued relative to the same "now". */
	now = GetCurrentTimestamp();

	for (idx = 0; idx < count; idx++)
	{
		const EnableTimeoutParams *req = &timeouts[idx];

		if (req->type == TMPARAM_AFTER)
		{
			/* Relative request: due delay_ms after now. */
			enable_timeout(req->id, now,
						   TimestampTzPlusMilliseconds(now, req->delay_ms));
		}
		else if (req->type == TMPARAM_AT)
		{
			/* Absolute request: the caller supplied the finish time. */
			enable_timeout(req->id, now, req->fin_time);
		}
		else
		{
			elog(ERROR, "unrecognized timeout type %d",
				 (int) req->type);
		}
	}

	/* Set the timer interrupt. */
	schedule_alarm(now);
}
Exemplo n.º 6
0
/*
 * Main loop of walsender process
 *
 * Allocates the output and reply buffers once, then loops: clears the latch,
 * bails out if the postmaster has died, rereads the configuration on SIGHUP,
 * exits via proc_exit(0) once shutdown has been requested, processes client
 * replies, sends and flushes WAL, and sleeps while caught up or while output
 * is still pending.
 *
 * The loop is broken out of only when pq_flush_if_writable() returns nonzero
 * or the replication timeout has expired; in both cases the function ends in
 * proc_exit(0).  The trailing "return 1" exists only to keep the compiler
 * quiet.
 */
static int
WalSndLoop(void)
{
	char	   *output_message;
	bool		caughtup = false;

	/*
	 * Allocate buffer that will be used for each output message.  We do this
	 * just once to reduce palloc overhead.  The buffer must be made large
	 * enough for maximum-sized messages.
	 */
	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);

	/*
	 * Allocate buffer that will be used for processing reply messages.  As
	 * above, do this just once to reduce palloc overhead.
	 */
	initStringInfo(&reply_message);

	/* Initialize the last reply timestamp */
	last_reply_timestamp = GetCurrentTimestamp();

	/* Loop forever, unless we get an error */
	for (;;)
	{
		/* Clear any already-pending wakeups */
		ResetLatch(&MyWalSnd->latch);

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive())
			exit(1);

		/* Process any requests or signals received recently */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
			SyncRepInitConfig();
		}

		/* Normal exit from the walsender is here */
		if (walsender_shutdown_requested)
		{
			/* Inform the standby that XLOG streaming is done */
			pq_puttextmessage('C', "COPY 0");
			pq_flush();

			proc_exit(0);
		}

		/* Check for input from the client */
		ProcessRepliesIfAny();

		/*
		 * If we don't have any pending data in the output buffer, try to send
		 * some more.  If there is some, we don't bother to call XLogSend
		 * again until we've flushed it ... but we'd better assume we are not
		 * caught up.
		 */
		if (!pq_is_send_pending())
			XLogSend(output_message, &caughtup);
		else
			caughtup = false;

		/*
		 * Try to flush pending output to the client.  A nonzero result is
		 * treated as a send failure and ends the loop.
		 */
		if (pq_flush_if_writable() != 0)
			break;

		/* If nothing remains to be sent right now ... */
		if (caughtup && !pq_is_send_pending())
		{
			/*
			 * If we're in catchup state, move to streaming.  This is an
			 * important state change for users to know about, since before
			 * this point data loss might occur if the primary dies and we
			 * need to failover to the standby. The state change is also
			 * important for synchronous replication, since commits that
			 * started to wait at that point might wait for some time.
			 */
			if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
			{
				ereport(DEBUG1,
						(errmsg("standby \"%s\" has now caught up with primary",
								application_name)));
				WalSndSetState(WALSNDSTATE_STREAMING);
			}

			/*
			 * When SIGUSR2 arrives, we send any outstanding logs up to the
			 * shutdown checkpoint record (i.e., the latest record) and exit.
			 * This may be a normal termination at shutdown, or a promotion,
			 * the walsender is not sure which.
			 */
			if (walsender_ready_to_stop)
			{
				/* ... let's just be real sure we're caught up ... */
				XLogSend(output_message, &caughtup);
				if (caughtup && !pq_is_send_pending())
				{
					/*
					 * The actual exit happens at the top of the next
					 * iteration, in the walsender_shutdown_requested check.
					 */
					walsender_shutdown_requested = true;
					continue;		/* don't want to wait more */
				}
			}
		}

		/*
		 * We don't block if not caught up, unless there is unsent data
		 * pending in which case we'd better block until the socket is
		 * write-ready.  This test is only needed for the case where XLogSend
		 * loaded a subset of the available data but then pq_flush_if_writable
		 * flushed it all --- we should immediately try to send more.
		 */
		if (caughtup || pq_is_send_pending())
		{
			TimestampTz finish_time = 0;
			long		sleeptime = -1;	/* stays -1 unless WL_TIMEOUT is requested below */
			int			wakeEvents;

			wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH |
				WL_SOCKET_READABLE;
			/* Only wait for write-readiness when there is something to write */
			if (pq_is_send_pending())
				wakeEvents |= WL_SOCKET_WRITEABLE;

			/* Determine time until replication timeout */
			if (replication_timeout > 0)
			{
				long		secs;
				int			usecs;

				finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
														  replication_timeout);
				TimestampDifference(GetCurrentTimestamp(),
									finish_time, &secs, &usecs);
				/* Convert the secs/usecs pair to milliseconds */
				sleeptime = secs * 1000 + usecs / 1000;
				/* Avoid Assert in WaitLatchOrSocket if timeout is past */
				if (sleeptime < 0)
					sleeptime = 0;
				wakeEvents |= WL_TIMEOUT;
			}

			/* Sleep until something happens or replication timeout */
			WaitLatchOrSocket(&MyWalSnd->latch, wakeEvents,
							  MyProcPort->sock, sleeptime);

			/*
			 * Check for replication timeout.  Note we ignore the corner case
			 * possibility that the client replied just as we reached the
			 * timeout ... he's supposed to reply *before* that.
			 */
			if (replication_timeout > 0 &&
				GetCurrentTimestamp() >= finish_time)
			{
				/*
				 * Since typically expiration of replication timeout means
				 * communication problem, we don't send the error message to
				 * the standby.
				 */
				ereport(COMMERROR,
						(errmsg("terminating walsender process due to replication timeout")));
				break;
			}
		}
	}

	/*
	 * Get here on send failure.  Clean up and exit.
	 *
	 * Reset whereToSendOutput to prevent ereport from attempting to send any
	 * more messages to the standby.
	 */
	if (whereToSendOutput == DestRemote)
		whereToSendOutput = DestNone;

	proc_exit(0);
	return 1;					/* keep the compiler quiet */
}
Exemplo n.º 7
0
/*
 * Main loop of walsender process
 *
 * Allocates the output and reply buffers once, then loops: bails out if the
 * postmaster has died, rereads the configuration on SIGHUP, exits via
 * proc_exit(0) once shutdown has been requested, sends WAL when no output is
 * pending, flushes, handles a pending stop request, sleeps on the latch and
 * socket, switches from catchup to streaming state when caught up, and
 * finally processes client replies.
 *
 * The loop is broken out of only when pq_flush_if_writable() returns nonzero
 * or the replication timeout has expired; in both cases the function ends in
 * proc_exit(0).  The trailing "return 1" exists only to keep the compiler
 * quiet.
 */
static int
WalSndLoop(void)
{
    char	   *output_message;
    bool		caughtup = false;

    /*
     * Allocate buffer that will be used for each output message.  We do this
     * just once to reduce palloc overhead.  The buffer must be made large
     * enough for maximum-sized messages.
     */
    output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);

    /*
     * Allocate buffer that will be used for processing reply messages.  As
     * above, do this just once to reduce palloc overhead.
     */
    initStringInfo(&reply_message);

    /* Initialize the last reply timestamp */
    last_reply_timestamp = GetCurrentTimestamp();

    /* Loop forever, unless we get an error */
    for (;;)
    {
        /*
         * Emergency bailout if postmaster has died.  This is to avoid the
         * necessity for manual cleanup of all postmaster children.
         */
        if (!PostmasterIsAlive(true))
            exit(1);

        /* Process any requests or signals received recently */
        if (got_SIGHUP)
        {
            got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);
            SyncRepInitConfig();
        }

        /* Normal exit from the walsender is here */
        if (walsender_shutdown_requested)
        {
            /* Inform the standby that XLOG streaming was done */
            pq_puttextmessage('C', "COPY 0");
            pq_flush();

            proc_exit(0);
        }

        /*
         * If we don't have any pending data in the output buffer, try to send
         * some more.
         */
        if (!pq_is_send_pending())
        {
            XLogSend(output_message, &caughtup);

            /*
             * Even if we wrote all the WAL that was available when we started
             * sending, more might have arrived while we were sending this
             * batch. We had the latch set while sending, so we have not
             * received any signals from that time. Let's arm the latch again,
             * and after that check that we're still up-to-date.
             */
            if (caughtup && !pq_is_send_pending())
            {
                ResetLatch(&MyWalSnd->latch);

                XLogSend(output_message, &caughtup);
            }
        }

        /*
         * Flush pending output to the client.  A nonzero result is treated
         * as a send failure and ends the loop.
         */
        if (pq_flush_if_writable() != 0)
            break;

        /*
         * When SIGUSR2 arrives, we send any outstanding logs up to the
         * shutdown checkpoint record (i.e., the latest record) and exit.
         * The exit itself happens at the top of a later iteration, once
         * walsender_shutdown_requested has been set here.
         */
        if (walsender_ready_to_stop && !pq_is_send_pending())
        {
            XLogSend(output_message, &caughtup);
            ProcessRepliesIfAny();
            if (caughtup && !pq_is_send_pending())
                walsender_shutdown_requested = true;
        }

        /*
         * Sleep only when there is nothing to send right now or output is
         * still pending, and no SIGHUP or shutdown request is waiting to be
         * handled at the top of the loop.
         */
        if ((caughtup || pq_is_send_pending()) &&
                !got_SIGHUP &&
                !walsender_shutdown_requested)
        {
            TimestampTz finish_time = 0;
            long		sleeptime;

            /* Reschedule replication timeout */
            if (replication_timeout > 0)
            {
                long		secs;
                int			usecs;

                finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
                              replication_timeout);
                TimestampDifference(GetCurrentTimestamp(),
                                    finish_time, &secs, &usecs);
                /* Convert the secs/usecs pair to milliseconds */
                sleeptime = secs * 1000 + usecs / 1000;
                /* Never sleep longer than WalSndDelay */
                if (WalSndDelay < sleeptime)
                    sleeptime = WalSndDelay;
            }
            else
            {
                /*
                 * XXX: Without timeout, we don't really need the periodic
                 * wakeups anymore, WaitLatchOrSocket should reliably wake up
                 * as soon as something interesting happens.
                 */
                sleeptime = WalSndDelay;
            }

            /*
             * Sleep, waiting for the socket to become readable and, if
             * output is pending, writable.
             *
             * NOTE(review): sleeptime is computed in milliseconds above;
             * confirm that this is the unit this WaitLatchOrSocket() variant
             * expects for its timeout argument.
             */
            WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
                              true, pq_is_send_pending(),
                              sleeptime);

            /* Check for replication timeout */
            if (replication_timeout > 0 &&
                    GetCurrentTimestamp() >= finish_time)
            {
                /*
                 * Since typically expiration of replication timeout means
                 * communication problem, we don't send the error message to
                 * the standby.
                 */
                ereport(COMMERROR,
                        (errmsg("terminating walsender process due to replication timeout")));
                break;
            }
        }

        /*
         * If we're in catchup state, see if its time to move to streaming.
         * This is an important state change for users, since before this
         * point data loss might occur if the primary dies and we need to
         * failover to the standby. The state change is also important for
         * synchronous replication, since commits that started to wait at that
         * point might wait for some time.
         */
        if (MyWalSnd->state == WALSNDSTATE_CATCHUP && caughtup)
        {
            ereport(DEBUG1,
                    (errmsg("standby \"%s\" has now caught up with primary",
                            application_name)));
            WalSndSetState(WALSNDSTATE_STREAMING);
        }

        /* Check for input from the client before starting the next cycle */
        ProcessRepliesIfAny();
    }

    /*
     * Get here on send failure.  Clean up and exit.
     *
     * Reset whereToSendOutput to prevent ereport from attempting to send any
     * more messages to the standby.
     */
    if (whereToSendOutput == DestRemote)
        whereToSendOutput = DestNone;

    proc_exit(0);
    return 1;					/* keep the compiler quiet */
}
Exemplo n.º 8
0
/*
 * Receive a log stream starting at the specified position.
 *
 * If sysidentifier is specified, validate that both the system
 * identifier and the timeline matches the specified ones
 * (by sending an extra IDENTIFY_SYSTEM command)
 *
 * All received segments will be written to the directory
 * specified by basedir.
 *
 * The stream_stop callback will be called every time data
 * is received, and whenever a segment is completed. If it returns
 * true, the streaming will stop and the function
 * return. As long as it returns false, streaming will continue
 * indefinitely.
 *
 * standby_message_timeout controls how often we send a message
 * back to the master letting it know our progress.  A value that is
 * not positive disables the status messages.  This message will only
 * contain the write location, and never flush or replay.
 * NOTE(review): the interval was documented as being in seconds, but the
 * value is passed to TimestampTzPlusMilliseconds() below -- confirm the
 * unit against the callers.
 *
 * rename_partial is passed on to close_walfile() when streaming is stopped
 * by the callback while a segment is still open.
 *
 * Returns true if streaming ended because the callback asked for it or the
 * server ended the stream cleanly after the stop point was reached; returns
 * false (after printing a message to stderr) on any error.
 *
 * Note: The log position *must* be at a log segment start!
 */
bool
ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline,
				  char *sysidentifier, char *basedir,
				  stream_stop_callback stream_stop,
				  int standby_message_timeout, bool rename_partial)
{
	char		query[128];
	char		current_walfile_name[MAXPGPATH];
	PGresult   *res;
	char	   *copybuf = NULL;
	int64		last_status = -1;
	XLogRecPtr	blockpos = InvalidXLogRecPtr;

	if (sysidentifier != NULL)
	{
		/* Validate system identifier and timeline hasn't changed */
		res = PQexec(conn, "IDENTIFY_SYSTEM");
		if (PQresultStatus(res) != PGRES_TUPLES_OK)
		{
			fprintf(stderr,
					_("%s: could not send replication command \"%s\": %s"),
					progname, "IDENTIFY_SYSTEM", PQerrorMessage(conn));
			PQclear(res);
			return false;
		}
		if (PQnfields(res) != 3 || PQntuples(res) != 1)
		{
			fprintf(stderr,
					_("%s: could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n"),
					progname, PQntuples(res), PQnfields(res), 1, 3);
			PQclear(res);
			return false;
		}
		if (strcmp(sysidentifier, PQgetvalue(res, 0, 0)) != 0)
		{
			fprintf(stderr,
					_("%s: system identifier does not match between base backup and streaming connection\n"),
					progname);
			PQclear(res);
			return false;
		}
		if (timeline != atoi(PQgetvalue(res, 0, 1)))
		{
			fprintf(stderr,
					_("%s: timeline does not match between base backup and streaming connection\n"),
					progname);
			PQclear(res);
			return false;
		}
		PQclear(res);
	}

	/* Initiate the replication stream at specified location */
	snprintf(query, sizeof(query), "START_REPLICATION %X/%X", startpos.xlogid, startpos.xrecoff);
	res = PQexec(conn, query);
	if (PQresultStatus(res) != PGRES_COPY_BOTH)
	{
		fprintf(stderr, _("%s: could not send replication command \"%s\": %s"),
				progname, "START_REPLICATION", PQresultErrorMessage(res));
		PQclear(res);
		return false;
	}
	PQclear(res);

	/*
	 * Receive the actual xlog data
	 */
	while (1)
	{
		int			r;
		int			xlogoff;
		int			bytes_left;
		int			bytes_written;
		int64		now;

		/* Release the buffer of the previous iteration, if any */
		if (copybuf != NULL)
		{
			PQfreemem(copybuf);
			copybuf = NULL;
		}

		/*
		 * Check if we should continue streaming, or abort at this point.
		 */
		if (stream_stop && stream_stop(blockpos, timeline, false))
		{
			if (walfile != -1 && !close_walfile(basedir, current_walfile_name,
												rename_partial))
				/* Potential error message is written by close_walfile */
				goto error;
			return true;
		}

		/*
		 * Potentially send a status message to the master
		 */
		now = localGetCurrentTimestamp();
		if (standby_message_timeout > 0 &&
			localTimestampDifferenceExceeds(last_status, now,
											standby_message_timeout))
		{
			/* Time to send feedback! */
			char		replybuf[sizeof(StandbyReplyMessage) + 1];
			StandbyReplyMessage *replymsg;

			/* Byte 0 carries the message type, the struct follows it */
			replymsg = (StandbyReplyMessage *) (replybuf + 1);
			replymsg->write = blockpos;
			replymsg->flush = InvalidXLogRecPtr;
			replymsg->apply = InvalidXLogRecPtr;
			replymsg->sendTime = now;
			replybuf[0] = 'r';

			if (PQputCopyData(conn, replybuf, sizeof(replybuf)) <= 0 ||
				PQflush(conn))
			{
				fprintf(stderr, _("%s: could not send feedback packet: %s"),
						progname, PQerrorMessage(conn));
				goto error;
			}

			last_status = now;
		}

		r = PQgetCopyData(conn, &copybuf, 1);
		if (r == 0)
		{
			/*
			 * In async mode, and no data available. We block on reading but
			 * not more than the specified timeout, so that we can send a
			 * response back to the server.
			 */
			fd_set		input_mask;
			struct timeval timeout;
			struct timeval *timeoutptr;

			FD_ZERO(&input_mask);
			FD_SET(PQsocket(conn), &input_mask);
			if (standby_message_timeout)
			{
				TimestampTz targettime;
				long		secs;
				int			usecs;

				targettime = TimestampTzPlusMilliseconds(last_status,
												standby_message_timeout - 1);
				localTimestampDifference(now,
										 targettime,
										 &secs,
										 &usecs);
				if (secs <= 0)
					timeout.tv_sec = 1; /* Always sleep at least 1 sec */
				else
					timeout.tv_sec = secs;
				timeout.tv_usec = usecs;
				timeoutptr = &timeout;
			}
			else
				timeoutptr = NULL;	/* no status messages: block indefinitely */

			r = select(PQsocket(conn) + 1, &input_mask, NULL, NULL, timeoutptr);
			if (r == 0 || (r < 0 && errno == EINTR))
			{
				/*
				 * Got a timeout or signal. Continue the loop and either
				 * deliver a status packet to the server or just go back into
				 * blocking.
				 */
				continue;
			}
			else if (r < 0)
			{
				fprintf(stderr, _("%s: select() failed: %s\n"),
						progname, strerror(errno));
				goto error;
			}
			/* Else there is actually data on the socket */
			if (PQconsumeInput(conn) == 0)
			{
				fprintf(stderr,
						_("%s: could not receive data from WAL stream: %s"),
						progname, PQerrorMessage(conn));
				goto error;
			}
			continue;
		}
		if (r == -1)
			/* End of copy stream */
			break;
		if (r == -2)
		{
			fprintf(stderr, _("%s: could not read COPY data: %s"),
					progname, PQerrorMessage(conn));
			goto error;
		}
		if (copybuf[0] == 'k')
		{
			/*
			 * keepalive message, sent in 9.2 and newer. We just ignore this
			 * message completely, but need to skip past it in the stream.
			 */
			if (r != STREAMING_KEEPALIVE_SIZE)
			{
				fprintf(stderr,
						_("%s: keepalive message has incorrect size %d\n"),
						progname, r);
				goto error;
			}
			continue;
		}
		else if (copybuf[0] != 'w')
		{
			fprintf(stderr, _("%s: unrecognized streaming header: \"%c\"\n"),
					progname, copybuf[0]);
			goto error;
		}
		if (r < STREAMING_HEADER_SIZE + 1)
		{
			fprintf(stderr, _("%s: streaming header too small: %d\n"),
					progname, r);
			goto error;
		}

		/* Extract WAL location for this block */
		memcpy(&blockpos, copybuf + 1, 8);
		xlogoff = blockpos.xrecoff % XLOG_SEG_SIZE;

		/*
		 * Verify that the initial location in the stream matches where we
		 * think we are.
		 */
		if (walfile == -1)
		{
			/* No file open yet */
			if (xlogoff != 0)
			{
				fprintf(stderr,
						_("%s: received transaction log record for offset %u with no file open\n"),
						progname, xlogoff);
				goto error;
			}
		}
		else
		{
			/* More data in existing segment */
			/* XXX: store seek value don't reseek all the time */
			if (lseek(walfile, 0, SEEK_CUR) != xlogoff)
			{
				fprintf(stderr,
						_("%s: got WAL data offset %08x, expected %08x\n"),
						progname, xlogoff, (int) lseek(walfile, 0, SEEK_CUR));
				goto error;
			}
		}

		bytes_left = r - STREAMING_HEADER_SIZE;
		bytes_written = 0;

		while (bytes_left)
		{
			int			bytes_to_write;

			/*
			 * If crossing a WAL boundary, only write up until we reach
			 * XLOG_SEG_SIZE.
			 */
			if (xlogoff + bytes_left > XLOG_SEG_SIZE)
				bytes_to_write = XLOG_SEG_SIZE - xlogoff;
			else
				bytes_to_write = bytes_left;

			if (walfile == -1)
			{
				walfile = open_walfile(blockpos, timeline,
									   basedir, current_walfile_name);
				if (walfile == -1)
					/* Error logged by open_walfile */
					goto error;
			}

			if (write(walfile,
					  copybuf + STREAMING_HEADER_SIZE + bytes_written,
					  bytes_to_write) != bytes_to_write)
			{
				fprintf(stderr,
				  _("%s: could not write %u bytes to WAL file \"%s\": %s\n"),
						progname, bytes_to_write, current_walfile_name,
						strerror(errno));
				goto error;
			}

			/* Write was successful, advance our position */
			bytes_written += bytes_to_write;
			bytes_left -= bytes_to_write;
			XLByteAdvance(blockpos, bytes_to_write);
			xlogoff += bytes_to_write;

			/* Did we reach the end of a WAL segment? */
			if (blockpos.xrecoff % XLOG_SEG_SIZE == 0)
			{
				if (!close_walfile(basedir, current_walfile_name, false))
					/* Error message written in close_walfile() */
					goto error;

				xlogoff = 0;

				if (stream_stop != NULL)
				{
					/*
					 * Callback when the segment finished, and return if it
					 * told us to.
					 */
					if (stream_stop(blockpos, timeline, true))
					{
						/* Don't leak the copy buffer still being processed */
						PQfreemem(copybuf);
						return true;
					}
				}
			}
		}
		/* No more data left to write, start receiving next copy packet */
	}

	/*
	 * The only way to get out of the loop is if the server shut down the
	 * replication stream. If it's a controlled shutdown, the server will send
	 * a shutdown message, and we'll return the latest xlog location that has
	 * been streamed.
	 */

	res = PQgetResult(conn);
	if (PQresultStatus(res) != PGRES_COMMAND_OK)
	{
		fprintf(stderr,
				_("%s: unexpected termination of replication stream: %s"),
				progname, PQresultErrorMessage(res));
		/* The message has been printed; release the result before bailing out */
		PQclear(res);
		goto error;
	}
	PQclear(res);

	/* Complain if we've not reached stop point yet */
	if (stream_stop != NULL && !stream_stop(blockpos, timeline, false))
	{
		fprintf(stderr, _("%s: replication stream was terminated before stop point\n"),
				progname);
		goto error;
	}

	if (copybuf != NULL)
		PQfreemem(copybuf);
	if (walfile != -1 && close(walfile) != 0)
		fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
				progname, current_walfile_name, strerror(errno));
	walfile = -1;
	return true;

error:
	/* Common failure exit: release the copy buffer and any open segment */
	if (copybuf != NULL)
		PQfreemem(copybuf);
	if (walfile != -1 && close(walfile) != 0)
		fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
				progname, current_walfile_name, strerror(errno));
	walfile = -1;
	return false;
}
Exemplo n.º 9
0
/*
 * Main entry point for bgwriter process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 */
void
BackgroundWriterMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext bgwriter_context;
	bool		prev_hibernate;

	/*
	 * Properly accept or ignore signals the postmaster might send us.
	 *
	 * bgwriter doesn't participate in ProcSignal signalling, but a SIGUSR1
	 * handler is still needed for latch wakeups.
	 */
	pqsignal(SIGHUP, BgSigHupHandler);	/* set flag to read config file */
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, ReqShutdownHandler);		/* shutdown */
	pqsignal(SIGQUIT, bg_quickdie);		/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, bgwriter_sigusr1_handler);
	pqsignal(SIGUSR2, SIG_IGN);

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/*
	 * Create a resource owner to keep track of our resources (currently only
	 * buffer pins).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer");

	/*
	 * We just started, assume there has been either a shutdown or
	 * end-of-recovery snapshot.
	 */
	last_snapshot_ts = GetCurrentTimestamp();

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	bgwriter_context = AllocSetContextCreate(TopMemoryContext,
											 "Background Writer",
											 ALLOCSET_DEFAULT_MINSIZE,
											 ALLOCSET_DEFAULT_INITSIZE,
											 ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(bgwriter_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().  We don't have very many resources to worry
		 * about in bgwriter, but we do have LWLocks, buffers, and temp files.
		 */
		LWLockReleaseAll();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_SMgr();
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(bgwriter_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(bgwriter_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();

		/* Report wait end here, when there is no further possibility of wait */
		pgstat_report_wait_end();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Reset hibernation state after any error.
	 */
	prev_hibernate = false;

	/*
	 * Loop forever
	 */
	for (;;)
	{
		bool		can_hibernate;
		int			rc;

		/* Clear any already-pending wakeups */
		ResetLatch(MyLatch);

		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
		if (shutdown_requested)
		{
			/*
			 * From here on, elog(ERROR) should end with exit(1), not send
			 * control back to the sigsetjmp block above
			 */
			ExitOnAnyError = true;
			/* Normal exit from the bgwriter is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Do one cycle of dirty-buffer writing.
		 */
		can_hibernate = BgBufferSync();

		/*
		 * Send off activity statistics to the stats collector
		 */
		pgstat_send_bgwriter();

		if (FirstCallSinceLastCheckpoint())
		{
			/*
			 * After any checkpoint, close all smgr files.  This is so we
			 * won't hang onto smgr references to deleted files indefinitely.
			 */
			smgrcloseall();
		}

		/*
		 * Log a new xl_running_xacts every now and then so replication can
		 * get into a consistent state faster (think of suboverflowed
		 * snapshots) and clean up resources (locks, KnownXids*) more
		 * frequently. The costs of this are relatively low, so doing it 4
		 * times (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine.
		 *
		 * We assume the interval for writing xl_running_xacts is
		 * significantly bigger than BgWriterDelay, so we don't complicate the
		 * overall timeout handling but just assume we're going to get called
		 * often enough even if hibernation mode is active. It's not that
		 * important that log_snap_interval_ms is met strictly. To make sure
		 * we're not waking the disk up unnecessarily on an idle system we
		 * check whether there has been any WAL inserted since the last time
		 * we've logged a running xacts.
		 *
		 * We do this logging in the bgwriter as its the only process that is
		 * run regularly and returns to its mainloop all the time. E.g.
		 * Checkpointer, when active, is barely ever in its mainloop and thus
		 * makes it hard to log regularly.
		 */
		if (XLogStandbyInfoActive() && !RecoveryInProgress())
		{
			TimestampTz timeout = 0;
			TimestampTz now = GetCurrentTimestamp();

			timeout = TimestampTzPlusMilliseconds(last_snapshot_ts,
												  LOG_SNAPSHOT_INTERVAL_MS);

			/*
			 * only log if enough time has passed and some xlog record has
			 * been inserted.
			 */
			if (now >= timeout &&
				last_snapshot_lsn != GetXLogInsertRecPtr())
			{
				last_snapshot_lsn = LogStandbySnapshot();
				last_snapshot_ts = now;
			}
		}

		/*
		 * Sleep until we are signaled or BgWriterDelay has elapsed.
		 *
		 * Note: the feedback control loop in BgBufferSync() expects that we
		 * will call it every BgWriterDelay msec.  While it's not critical for
		 * correctness that that be exact, the feedback loop might misbehave
		 * if we stray too far from that.  Hence, avoid loading this process
		 * down with latch events that are likely to happen frequently during
		 * normal operation.
		 */
		rc = WaitLatch(MyLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   BgWriterDelay /* ms */ );

		/*
		 * If no latch event and BgBufferSync says nothing's happening, extend
		 * the sleep in "hibernation" mode, where we sleep for much longer
		 * than bgwriter_delay says.  Fewer wakeups save electricity.  When a
		 * backend starts using buffers again, it will wake us up by setting
		 * our latch.  Because the extra sleep will persist only as long as no
		 * buffer allocations happen, this should not distort the behavior of
		 * BgBufferSync's control loop too badly; essentially, it will think
		 * that the system-wide idle interval didn't exist.
		 *
		 * There is a race condition here, in that a backend might allocate a
		 * buffer between the time BgBufferSync saw the alloc count as zero
		 * and the time we call StrategyNotifyBgWriter.  While it's not
		 * critical that we not hibernate anyway, we try to reduce the odds of
		 * that by only hibernating when BgBufferSync says nothing's happening
		 * for two consecutive cycles.  Also, we mitigate any possible
		 * consequences of a missed wakeup by not hibernating forever.
		 */
		if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate)
		{
			/* Ask for notification at next buffer allocation */
			StrategyNotifyBgWriter(MyProc->pgprocno);
			/* Sleep ... */
			rc = WaitLatch(MyLatch,
						   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
						   BgWriterDelay * HIBERNATE_FACTOR);
			/* Reset the notification request in case we timed out */
			StrategyNotifyBgWriter(-1);
		}

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (rc & WL_POSTMASTER_DEATH)
			exit(1);

		prev_hibernate = can_hibernate;
	}
}
Exemplo n.º 10
0
/*
 * ResolveRecoveryConflictWithBufferPin
 *
 * Called from LockBufferForCleanup() when other backends hold pins on the
 * buffer the Startup process wants to clean up.  The sleep that logically
 * belongs to LockBufferForCleanup() is done in this function, for clarity.
 *
 * Depending on MaxStandbyDelay we either resolve the conflict right away or
 * arm a SIGALRM that wakes us once our patience has run out:
 *
 *	 == 0	cancel pin holders immediately;
 *	 <	0	wait indefinitely (only a deadlock check is requested);
 *	 >	0	wait until the latest WAL time plus MaxStandbyDelay, unless that
 *			moment is already in the past, in which case cancel immediately.
 *
 * Conflicts are resolved by sending a SIGUSR1 reason to all backends so that
 * each one checks whether it holds a pin blocking the Startup process; a
 * backend that does takes the appropriate error action, ERROR or FATAL.
 *
 * Before waiting we also ask for a deadlock check, although applications
 * that trigger one will be extremely rare.  A deadlock arises because a
 * query waiting on a lock must be queued behind an AccessExclusiveLock,
 * which is released only when the Startup process replays a transaction
 * completion record; if the Startup process is itself waiting, nobody can
 * make progress.  That can happen when the query waits first and the
 * Startup process then sleeps, or when the Startup process is already
 * sleeping and the query then waits.  Only the first ordering is covered
 * here; the second is checked before the query sleeps, in
 * CheckRecoveryConflictDeadlock().
 */
void
ResolveRecoveryConflictWithBufferPin(void)
{
	bool		timer_armed = false;

	Assert(InHotStandby);

	if (MaxStandbyDelay == 0)
	{
		/* No patience at all: evict every pin holder right now. */
		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
	}
	else if (MaxStandbyDelay < 0)
	{
		/*
		 * Request a buffer-pin deadlock check before waiting.  It is cheap
		 * enough that there is no point in waiting for the deadlock timeout
		 * first.
		 */
		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
	}
	else
	{
		TimestampTz last_xlog_time = GetLatestXLogTime();
		TimestampTz current_time = GetCurrentTimestamp();

		/* Is there still time left before max_standby_delay expires? */
		if (!TimestampDifferenceExceeds(last_xlog_time, current_time,
										MaxStandbyDelay))
		{
			TimestampTz wakeup_time;	/* when the timer should fire */
			long		wait_secs;		/* timer setting, whole seconds */
			int			wait_usecs;		/* timer setting, microseconds part */

			/*
			 * Request a buffer-pin deadlock check before waiting.  It is
			 * cheap enough that there is no point in waiting for the
			 * deadlock timeout first.
			 */
			SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);

			/* Work out how much longer we are allowed to wait. */
			wakeup_time = TimestampTzPlusMilliseconds(last_xlog_time,
													  MaxStandbyDelay);
			TimestampDifference(current_time, wakeup_time,
								&wait_secs, &wait_usecs);

			/*
			 * The remaining time can come out as less than a microsecond.
			 * Bump it so that we set the interrupt rather than cancel it.
			 */
			if (wait_secs == 0 && wait_usecs == 0)
				wait_usecs = 1;

			timer_armed = enable_standby_sig_alarm(wait_secs, wait_usecs,
												   wakeup_time);
			if (!timer_armed)
				elog(FATAL, "could not set timer for process wakeup");
		}
		else
		{
			/* Already past the limit, so clear a path as fast as we can. */
			SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
		}
	}

	/* Sleep until UnpinBuffer() signals us */
	ProcWaitForSignal();

	/* Tear down the wakeup timer if we armed one above. */
	if (timer_armed && !disable_standby_sig_alarm())
		elog(FATAL, "could not disable timer for process wakeup");
}
/*
 * grab_ExecutorEnd
 *
 * Records one row describing the query that is finishing in the table
 * EXTENSION_SCHEMA.EXTENSION_LOG_TABLE, then hands control to
 * prev_ExecutorEnd if one is set, or to standard_ExecutorEnd() otherwise.
 *
 * Nothing is logged when either the schema or the table cannot be found;
 * the chained ExecutorEnd call still happens in that case.
 *
 * Columns written, in order:
 *	 0	current transaction id
 *	 1	current command id
 *	 2	MyProcPid
 *	 3	current user id
 *	 4	current timestamp minus the total run time (NULL without timing)
 *	 5	total run time as float8 (NULL without timing)
 *	 6	queryDesc->operation
 *	 7	query source text
 *	 8	parameter values rendered through their output functions, as a
 *		TEXTOID array (NULL when the query has no parameter list)
 *	 9	parameter types, as a REGTYPEOID array (NULL when the query has no
 *		parameter list)
 */
static void
grab_ExecutorEnd(QueryDesc * queryDesc)
{
	Datum           values[10];
	bool            nulls[10] = {false, false, false, false, false, false, false, false, false, false};
	Relation        dump_heap;
	RangeVar       *dump_table_rv;
	HeapTuple       tuple;
	Oid             namespaceId;

	/* lookup schema */
	namespaceId = GetSysCacheOid1(NAMESPACENAME, CStringGetDatum(EXTENSION_SCHEMA));
	if (OidIsValid(namespaceId)) {
		/* lookup table */
		if (OidIsValid(get_relname_relid(EXTENSION_LOG_TABLE, namespaceId))) {

			/* get table heap */
			dump_table_rv = makeRangeVar(EXTENSION_SCHEMA, EXTENSION_LOG_TABLE, -1);
			dump_heap = heap_openrv(dump_table_rv, RowExclusiveLock);

			/* transaction info */
			values[0] = Int32GetDatum(GetCurrentTransactionId());
			values[1] = Int32GetDatum(GetCurrentCommandId(false));
			values[2] = Int32GetDatum(MyProcPid);
			values[3] = Int32GetDatum(GetUserId());

			/* query timing */
			if (queryDesc->totaltime != NULL) {
				InstrEndLoop(queryDesc->totaltime);

				/*
				 * Derive the start time by stepping back from "now" by the
				 * total run time; total is scaled by 1000 to obtain the
				 * milliseconds the macro expects.
				 */
				values[4] = TimestampGetDatum(
							      TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
				  (queryDesc->totaltime->total * -1000.0)));
				values[5] = Float8GetDatum(queryDesc->totaltime->total);
			} else {
				nulls[4] = true;
				nulls[5] = true;
			}

			/* query command type */
			values[6] = Int32GetDatum(queryDesc->operation);

			/* query text (a text value, hence a pointer datum) */
			values[7] = PointerGetDatum(
				    cstring_to_text(queryDesc->sourceText));

			/* query params */
			if (queryDesc->params != NULL) {
				int             numParams = queryDesc->params->numParams;
				Oid             out_func_oid, ptype;
				Datum           pvalue;
				bool            isvarlena;
				FmgrInfo       *out_functions;

				size_t          arr_nelems = (size_t) numParams;
				Datum          *arr_val_elems = palloc(sizeof(Datum) * arr_nelems);
				Datum          *arr_typ_elems = palloc(sizeof(Datum) * arr_nelems);

				/*
				 * The null flags are palloc'd rather than kept in a
				 * variable-length array: a VLA of length zero is undefined
				 * behaviour and a large parameter count would consume an
				 * unbounded amount of stack.
				 */
				bool           *arr_nulls = palloc(sizeof(bool) * arr_nelems);
				char            elem_val_byval, elem_val_align, elem_typ_byval,
				                elem_typ_align;
				int16           elem_val_len, elem_typ_len;
				int             elem_dims[1], elem_lbs[1];

				int paramno;

				/* init */
				out_functions = (FmgrInfo *) palloc(
					    (numParams) * sizeof(FmgrInfo));
				get_typlenbyvalalign(TEXTOID, &elem_val_len, &elem_val_byval, &elem_val_align);
				get_typlenbyvalalign(REGTYPEOID, &elem_typ_len, &elem_typ_byval, &elem_typ_align);
				elem_dims[0] = numParams;
				elem_lbs[0] = 1;

				for (paramno = 0; paramno < numParams; paramno++) {
					pvalue = queryDesc->params->params[paramno].value;
					ptype = queryDesc->params->params[paramno].ptype;
					getTypeOutputInfo(ptype, &out_func_oid, &isvarlena);
					fmgr_info(out_func_oid, &out_functions[paramno]);

					/* the type OID is stored directly as the element datum */
					arr_typ_elems[paramno] = ptype;

					/*
					 * For a NULL parameter only the null flag is set; the
					 * matching slot of arr_val_elems stays unassigned.
					 */
					arr_nulls[paramno] = true;
					if (!queryDesc->params->params[paramno].isnull) {
						arr_nulls[paramno] = false;
						arr_val_elems[paramno] = PointerGetDatum(
							    cstring_to_text(
									    OutputFunctionCall(&out_functions[paramno], pvalue)));
					}
				}

				/* parameter values: one-dimensional array that may hold NULLs */
				values[8] = PointerGetDatum(
							 construct_md_array(
							      arr_val_elems,
								  arr_nulls,
									  1,
								  elem_dims,
								   elem_lbs,
								    TEXTOID,
							       elem_val_len,
							     elem_val_byval,
							   elem_val_align));

				/* parameter types: never NULL, so the plain constructor suffices */
				values[9] = PointerGetDatum(
							    construct_array(
							      arr_typ_elems,
								  numParams,
								 REGTYPEOID,
							       elem_typ_len,
							     elem_typ_byval,
							   elem_typ_align));

				/* release every scratch buffer allocated above */
				pfree(out_functions);
				pfree(arr_val_elems);
				pfree(arr_typ_elems);
				pfree(arr_nulls);

			} else {
				nulls[8] = true;
				nulls[9] = true;
			}

			/* insert */
			tuple = heap_form_tuple(dump_heap->rd_att, values, nulls);
			simple_heap_insert(dump_heap, tuple);
			heap_close(dump_heap, RowExclusiveLock);
		}
	}

	/* always chain to the next ExecutorEnd implementation */
	if (prev_ExecutorEnd)
		prev_ExecutorEnd(queryDesc);
	else
		standard_ExecutorEnd(queryDesc);
}
Exemplo n.º 12
0
/*
 * enable_sig_alarm -- arm the SIGALRM interrupt to fire after a delay
 *
 * delayms is the delay in milliseconds.  The caller must make sure a SIGALRM
 * signal handler has been installed before calling this.
 *
 * is_statement_timeout selects which timeout is being started: true for the
 * statement-level timeout, false for the deadlock timeout.  A deadlock
 * timeout nested inside an active statement timeout is handled properly.
 *
 * Returns TRUE if okay, FALSE on failure.
 */
bool
enable_sig_alarm(int delayms, bool is_statement_timeout)
{
	struct itimerval timer;

	if (is_statement_timeout)
	{
		/*
		 * Starting the statement-level timeout.
		 *
		 * statement_fin_time is measured from statement_timestamp, yet the
		 * timer below is armed with the full, uncorrected delay: whatever
		 * time has passed since statement_timestamp was set is ignored.
		 * Usually that interval is tiny and the difference is irrelevant.
		 * In corner cases (multi-statement query strings containing COMMIT
		 * or ROLLBACK) the statement timeout may be re-initialized long
		 * after the message first arrived, and enforcement then becomes a
		 * bit inconsistent.  That annoyance is judged not worth the cost of
		 * an extra gettimeofday() here.
		 */
		Assert(!deadlock_timeout_active);
		statement_fin_time =
			TimestampTzPlusMilliseconds(GetCurrentStatementStartTimestamp(),
										delayms);
		cancel_from_timeout = false;
		statement_timeout_active = true;
	}
	else
	{
		if (statement_timeout_active)
		{
			/*
			 * Starting the deadlock timeout while a statement-level timeout
			 * is already running.
			 *
			 * We want the interrupt at whichever deadline comes first.  When
			 * the deadlock deadline is not earlier than statement_fin_time
			 * the existing timer setting can stay as it is; otherwise the
			 * timer is re-armed for the deadlock timeout.
			 *
			 * NOTE: the previously-set timer alarm may interrupt this
			 * routine.  That is okay, because the signal handler acts
			 * strictly according to the state variables.  The deadlock
			 * checker might run earlier than normal, which does no harm.
			 */
			TimestampTz deadlock_fin_time =
				TimestampTzPlusMilliseconds(GetCurrentTimestamp(), delayms);

			deadlock_timeout_active = true;
			if (deadlock_fin_time >= statement_fin_time)
				return true;
		}
		else
		{
			/* Starting the deadlock timeout with no statement timeout */
			deadlock_timeout_active = true;
		}
	}

	/* Getting here means the timer interrupt has to be (re)armed */
	MemSet(&timer, 0, sizeof(struct itimerval));
	timer.it_value.tv_sec = delayms / 1000;
	timer.it_value.tv_usec = (delayms % 1000) * 1000;

	return setitimer(ITIMER_REAL, &timer, NULL) == 0;
}