/* * Determine the cutoff time at which we want to start canceling conflicting * transactions. Returns zero (a time safely in the past) if we are willing * to wait forever. */ static TimestampTz GetStandbyLimitTime(void) { TimestampTz rtime; bool fromStream; /* * The cutoff time is the last WAL data receipt time plus the appropriate * delay variable. Delay of -1 means wait forever. */ GetXLogReceiptTime(&rtime, &fromStream); if (fromStream) { if (max_standby_streaming_delay < 0) return 0; /* wait forever */ return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay); } else { if (max_standby_archive_delay < 0) return 0; /* wait forever */ return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay); } }
/* * Enable the specified timeout to fire after the specified delay. * * Delay is given in milliseconds. */ void enable_timeout_after(TimeoutId id, int delay_ms) { TimestampTz now; TimestampTz fin_time; /* Disable timeout interrupts for safety. */ disable_alarm(); /* Queue the timeout at the appropriate time. */ now = GetCurrentTimestamp(); fin_time = TimestampTzPlusMilliseconds(now, delay_ms); enable_timeout(id, now, fin_time); /* Set the timer interrupt. */ schedule_alarm(now); }
/* * Reset state (called by ReScan). */ Datum tsm_system_time_reset(PG_FUNCTION_ARGS) { TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata; sampler->lt = InvalidOffsetNumber; sampler->start_time = GetCurrentTimestamp(); sampler->end_time = TimestampTzPlusMilliseconds(sampler->start_time, sampler->time); sampler->estblocks = 2; sampler->doneblocks = 0; sampler_random_init_state(sampler->seed, sampler->randstate); sampler->step = random_relative_prime(sampler->nblocks, sampler->randstate); sampler->lb = sampler_random_fract(sampler->randstate) * (sampler->nblocks / sampler->step); PG_RETURN_VOID(); }
/* * Initializes the state. */ Datum tsm_system_time_init(PG_FUNCTION_ARGS) { TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0); uint32 seed = PG_GETARG_UINT32(1); int32 time = PG_ARGISNULL(2) ? -1 : PG_GETARG_INT32(2); HeapScanDesc scan = tsdesc->heapScan; SystemSamplerData *sampler; if (time < 1) ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("invalid time limit"), errhint("Time limit must be positive integer value."))); sampler = palloc0(sizeof(SystemSamplerData)); /* Remember initial values for reinit */ sampler->seed = seed; sampler->nblocks = scan->rs_nblocks; sampler->lt = InvalidOffsetNumber; sampler->estblocks = 2; sampler->doneblocks = 0; sampler->time = time; sampler->start_time = GetCurrentTimestamp(); sampler->end_time = TimestampTzPlusMilliseconds(sampler->start_time, sampler->time); sampler_random_init_state(sampler->seed, sampler->randstate); /* Find relative prime as step size for linear probing. */ sampler->step = random_relative_prime(sampler->nblocks, sampler->randstate); /* * Randomize start position so that blocks close to step size don't have * higher probability of being chosen on very short scan. */ sampler->lb = sampler_random_fract(sampler->randstate) * (sampler->nblocks / sampler->step); tsdesc->tsmdata = (void *) sampler; PG_RETURN_VOID(); }
/* * Enable multiple timeouts at once. * * This works like calling enable_timeout_after() and/or enable_timeout_at() * multiple times. Use this to reduce the number of GetCurrentTimestamp() * and setitimer() calls needed to establish multiple timeouts. */ void enable_timeouts(const EnableTimeoutParams *timeouts, int count) { TimestampTz now; int i; /* Disable timeout interrupts for safety. */ disable_alarm(); /* Queue the timeout(s) at the appropriate times. */ now = GetCurrentTimestamp(); for (i = 0; i < count; i++) { TimeoutId id = timeouts[i].id; TimestampTz fin_time; switch (timeouts[i].type) { case TMPARAM_AFTER: fin_time = TimestampTzPlusMilliseconds(now, timeouts[i].delay_ms); enable_timeout(id, now, fin_time); break; case TMPARAM_AT: enable_timeout(id, now, timeouts[i].fin_time); break; default: elog(ERROR, "unrecognized timeout type %d", (int) timeouts[i].type); break; } } /* Set the timer interrupt. */ schedule_alarm(now); }
/* Main loop of walsender process */ static int WalSndLoop(void) { char *output_message; bool caughtup = false; /* * Allocate buffer that will be used for each output message. We do this * just once to reduce palloc overhead. The buffer must be made large * enough for maximum-sized messages. */ output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE); /* * Allocate buffer that will be used for processing reply messages. As * above, do this just once to reduce palloc overhead. */ initStringInfo(&reply_message); /* Initialize the last reply timestamp */ last_reply_timestamp = GetCurrentTimestamp(); /* Loop forever, unless we get an error */ for (;;) { /* Clear any already-pending wakeups */ ResetLatch(&MyWalSnd->latch); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive()) exit(1); /* Process any requests or signals received recently */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); SyncRepInitConfig(); } /* Normal exit from the walsender is here */ if (walsender_shutdown_requested) { /* Inform the standby that XLOG streaming is done */ pq_puttextmessage('C', "COPY 0"); pq_flush(); proc_exit(0); } /* Check for input from the client */ ProcessRepliesIfAny(); /* * If we don't have any pending data in the output buffer, try to send * some more. If there is some, we don't bother to call XLogSend * again until we've flushed it ... but we'd better assume we are not * caught up. */ if (!pq_is_send_pending()) XLogSend(output_message, &caughtup); else caughtup = false; /* Try to flush pending output to the client */ if (pq_flush_if_writable() != 0) break; /* If nothing remains to be sent right now ... */ if (caughtup && !pq_is_send_pending()) { /* * If we're in catchup state, move to streaming. This is an * important state change for users to know about, since before * this point data loss might occur if the primary dies and we * need to failover to the standby. The state change is also * important for synchronous replication, since commits that * started to wait at that point might wait for some time. */ if (MyWalSnd->state == WALSNDSTATE_CATCHUP) { ereport(DEBUG1, (errmsg("standby \"%s\" has now caught up with primary", application_name))); WalSndSetState(WALSNDSTATE_STREAMING); } /* * When SIGUSR2 arrives, we send any outstanding logs up to the * shutdown checkpoint record (i.e., the latest record) and exit. * This may be a normal termination at shutdown, or a promotion, * the walsender is not sure which. */ if (walsender_ready_to_stop) { /* ... let's just be real sure we're caught up ... */ XLogSend(output_message, &caughtup); if (caughtup && !pq_is_send_pending()) { walsender_shutdown_requested = true; continue; /* don't want to wait more */ } } } /* * We don't block if not caught up, unless there is unsent data * pending in which case we'd better block until the socket is * write-ready. This test is only needed for the case where XLogSend * loaded a subset of the available data but then pq_flush_if_writable * flushed it all --- we should immediately try to send more. */ if (caughtup || pq_is_send_pending()) { TimestampTz finish_time = 0; long sleeptime = -1; int wakeEvents; wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE; if (pq_is_send_pending()) wakeEvents |= WL_SOCKET_WRITEABLE; /* Determine time until replication timeout */ if (replication_timeout > 0) { long secs; int usecs; finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp, replication_timeout); TimestampDifference(GetCurrentTimestamp(), finish_time, &secs, &usecs); sleeptime = secs * 1000 + usecs / 1000; /* Avoid Assert in WaitLatchOrSocket if timeout is past */ if (sleeptime < 0) sleeptime = 0; wakeEvents |= WL_TIMEOUT; } /* Sleep until something happens or replication timeout */ WaitLatchOrSocket(&MyWalSnd->latch, wakeEvents, MyProcPort->sock, sleeptime); /* * Check for replication timeout. Note we ignore the corner case * possibility that the client replied just as we reached the * timeout ... he's supposed to reply *before* that. */ if (replication_timeout > 0 && GetCurrentTimestamp() >= finish_time) { /* * Since typically expiration of replication timeout means * communication problem, we don't send the error message to * the standby. */ ereport(COMMERROR, (errmsg("terminating walsender process due to replication timeout"))); break; } } } /* * Get here on send failure. Clean up and exit. * * Reset whereToSendOutput to prevent ereport from attempting to send any * more messages to the standby. */ if (whereToSendOutput == DestRemote) whereToSendOutput = DestNone; proc_exit(0); return 1; /* keep the compiler quiet */ }
/* Main loop of walsender process */ static int WalSndLoop(void) { char *output_message; bool caughtup = false; /* * Allocate buffer that will be used for each output message. We do this * just once to reduce palloc overhead. The buffer must be made large * enough for maximum-sized messages. */ output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE); /* * Allocate buffer that will be used for processing reply messages. As * above, do this just once to reduce palloc overhead. */ initStringInfo(&reply_message); /* Initialize the last reply timestamp */ last_reply_timestamp = GetCurrentTimestamp(); /* Loop forever, unless we get an error */ for (;;) { /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive(true)) exit(1); /* Process any requests or signals received recently */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); SyncRepInitConfig(); } /* Normal exit from the walsender is here */ if (walsender_shutdown_requested) { /* Inform the standby that XLOG streaming was done */ pq_puttextmessage('C', "COPY 0"); pq_flush(); proc_exit(0); } /* * If we don't have any pending data in the output buffer, try to send * some more. */ if (!pq_is_send_pending()) { XLogSend(output_message, &caughtup); /* * Even if we wrote all the WAL that was available when we started * sending, more might have arrived while we were sending this * batch. We had the latch set while sending, so we have not * received any signals from that time. Let's arm the latch again, * and after that check that we're still up-to-date. */ if (caughtup && !pq_is_send_pending()) { ResetLatch(&MyWalSnd->latch); XLogSend(output_message, &caughtup); } } /* Flush pending output to the client */ if (pq_flush_if_writable() != 0) break; /* * When SIGUSR2 arrives, we send any outstanding logs up to the * shutdown checkpoint record (i.e., the latest record) and exit. */ if (walsender_ready_to_stop && !pq_is_send_pending()) { XLogSend(output_message, &caughtup); ProcessRepliesIfAny(); if (caughtup && !pq_is_send_pending()) walsender_shutdown_requested = true; } if ((caughtup || pq_is_send_pending()) && !got_SIGHUP && !walsender_shutdown_requested) { TimestampTz finish_time = 0; long sleeptime; /* Reschedule replication timeout */ if (replication_timeout > 0) { long secs; int usecs; finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp, replication_timeout); TimestampDifference(GetCurrentTimestamp(), finish_time, &secs, &usecs); sleeptime = secs * 1000 + usecs / 1000; if (WalSndDelay < sleeptime) sleeptime = WalSndDelay; } else { /* * XXX: Without timeout, we don't really need the periodic * wakeups anymore, WaitLatchOrSocket should reliably wake up * as soon as something interesting happens. */ sleeptime = WalSndDelay; } /* Sleep */ WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock, true, pq_is_send_pending(), sleeptime); /* Check for replication timeout */ if (replication_timeout > 0 && GetCurrentTimestamp() >= finish_time) { /* * Since typically expiration of replication timeout means * communication problem, we don't send the error message to * the standby. */ ereport(COMMERROR, (errmsg("terminating walsender process due to replication timeout"))); break; } } /* * If we're in catchup state, see if its time to move to streaming. * This is an important state change for users, since before this * point data loss might occur if the primary dies and we need to * failover to the standby. The state change is also important for * synchronous replication, since commits that started to wait at that * point might wait for some time. */ if (MyWalSnd->state == WALSNDSTATE_CATCHUP && caughtup) { ereport(DEBUG1, (errmsg("standby \"%s\" has now caught up with primary", application_name))); WalSndSetState(WALSNDSTATE_STREAMING); } ProcessRepliesIfAny(); } /* * Get here on send failure. Clean up and exit. * * Reset whereToSendOutput to prevent ereport from attempting to send any * more messages to the standby. */ if (whereToSendOutput == DestRemote) whereToSendOutput = DestNone; proc_exit(0); return 1; /* keep the compiler quiet */ }
/* * Receive a log stream starting at the specified position. * * If sysidentifier is specified, validate that both the system * identifier and the timeline matches the specified ones * (by sending an extra IDENTIFY_SYSTEM command) * * All received segments will be written to the directory * specified by basedir. * * The stream_stop callback will be called every time data * is received, and whenever a segment is completed. If it returns * true, the streaming will stop and the function * return. As long as it returns false, streaming will continue * indefinitely. * * standby_message_timeout controls how often we send a message * back to the master letting it know our progress, in seconds. * This message will only contain the write location, and never * flush or replay. * * Note: The log position *must* be at a log segment start! */ bool ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysidentifier, char *basedir, stream_stop_callback stream_stop, int standby_message_timeout, bool rename_partial) { char query[128]; char current_walfile_name[MAXPGPATH]; PGresult *res; char *copybuf = NULL; int64 last_status = -1; XLogRecPtr blockpos = InvalidXLogRecPtr; if (sysidentifier != NULL) { /* Validate system identifier and timeline hasn't changed */ res = PQexec(conn, "IDENTIFY_SYSTEM"); if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, _("%s: could not send replication command \"%s\": %s"), progname, "IDENTIFY_SYSTEM", PQerrorMessage(conn)); PQclear(res); return false; } if (PQnfields(res) != 3 || PQntuples(res) != 1) { fprintf(stderr, _("%s: could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n"), progname, PQntuples(res), PQnfields(res), 1, 3); PQclear(res); return false; } if (strcmp(sysidentifier, PQgetvalue(res, 0, 0)) != 0) { fprintf(stderr, _("%s: system identifier does not match between base backup and streaming connection\n"), progname); PQclear(res); return false; } if (timeline != atoi(PQgetvalue(res, 0, 1))) { fprintf(stderr, _("%s: timeline does not match between base backup and streaming connection\n"), progname); PQclear(res); return false; } PQclear(res); } /* Initiate the replication stream at specified location */ snprintf(query, sizeof(query), "START_REPLICATION %X/%X", startpos.xlogid, startpos.xrecoff); res = PQexec(conn, query); if (PQresultStatus(res) != PGRES_COPY_BOTH) { fprintf(stderr, _("%s: could not send replication command \"%s\": %s"), progname, "START_REPLICATION", PQresultErrorMessage(res)); PQclear(res); return false; } PQclear(res); /* * Receive the actual xlog data */ while (1) { int r; int xlogoff; int bytes_left; int bytes_written; int64 now; if (copybuf != NULL) { PQfreemem(copybuf); copybuf = NULL; } /* * Check if we should continue streaming, or abort at this point. */ if (stream_stop && stream_stop(blockpos, timeline, false)) { if (walfile != -1 && !close_walfile(basedir, current_walfile_name, rename_partial)) /* Potential error message is written by close_walfile */ goto error; return true; } /* * Potentially send a status message to the master */ now = localGetCurrentTimestamp(); if (standby_message_timeout > 0 && localTimestampDifferenceExceeds(last_status, now, standby_message_timeout)) { /* Time to send feedback! */ char replybuf[sizeof(StandbyReplyMessage) + 1]; StandbyReplyMessage *replymsg; replymsg = (StandbyReplyMessage *) (replybuf + 1); replymsg->write = blockpos; replymsg->flush = InvalidXLogRecPtr; replymsg->apply = InvalidXLogRecPtr; replymsg->sendTime = now; replybuf[0] = 'r'; if (PQputCopyData(conn, replybuf, sizeof(replybuf)) <= 0 || PQflush(conn)) { fprintf(stderr, _("%s: could not send feedback packet: %s"), progname, PQerrorMessage(conn)); goto error; } last_status = now; } r = PQgetCopyData(conn, ©buf, 1); if (r == 0) { /* * In async mode, and no data available. We block on reading but * not more than the specified timeout, so that we can send a * response back to the client. */ fd_set input_mask; struct timeval timeout; struct timeval *timeoutptr; FD_ZERO(&input_mask); FD_SET(PQsocket(conn), &input_mask); if (standby_message_timeout) { TimestampTz targettime; long secs; int usecs; targettime = TimestampTzPlusMilliseconds(last_status, standby_message_timeout - 1); localTimestampDifference(now, targettime, &secs, &usecs); if (secs <= 0) timeout.tv_sec = 1; /* Always sleep at least 1 sec */ else timeout.tv_sec = secs; timeout.tv_usec = usecs; timeoutptr = &timeout; } else timeoutptr = NULL; r = select(PQsocket(conn) + 1, &input_mask, NULL, NULL, timeoutptr); if (r == 0 || (r < 0 && errno == EINTR)) { /* * Got a timeout or signal. Continue the loop and either * deliver a status packet to the server or just go back into * blocking. */ continue; } else if (r < 0) { fprintf(stderr, _("%s: select() failed: %s\n"), progname, strerror(errno)); goto error; } /* Else there is actually data on the socket */ if (PQconsumeInput(conn) == 0) { fprintf(stderr, _("%s: could not receive data from WAL stream: %s"), progname, PQerrorMessage(conn)); goto error; } continue; } if (r == -1) /* End of copy stream */ break; if (r == -2) { fprintf(stderr, _("%s: could not read COPY data: %s"), progname, PQerrorMessage(conn)); goto error; } if (copybuf[0] == 'k') { /* * keepalive message, sent in 9.2 and newer. We just ignore this * message completely, but need to skip past it in the stream. */ if (r != STREAMING_KEEPALIVE_SIZE) { fprintf(stderr, _("%s: keepalive message has incorrect size %d\n"), progname, r); goto error; } continue; } else if (copybuf[0] != 'w') { fprintf(stderr, _("%s: unrecognized streaming header: \"%c\"\n"), progname, copybuf[0]); goto error; } if (r < STREAMING_HEADER_SIZE + 1) { fprintf(stderr, _("%s: streaming header too small: %d\n"), progname, r); goto error; } /* Extract WAL location for this block */ memcpy(&blockpos, copybuf + 1, 8); xlogoff = blockpos.xrecoff % XLOG_SEG_SIZE; /* * Verify that the initial location in the stream matches where we * think we are. */ if (walfile == -1) { /* No file open yet */ if (xlogoff != 0) { fprintf(stderr, _("%s: received transaction log record for offset %u with no file open\n"), progname, xlogoff); goto error; } } else { /* More data in existing segment */ /* XXX: store seek value don't reseek all the time */ if (lseek(walfile, 0, SEEK_CUR) != xlogoff) { fprintf(stderr, _("%s: got WAL data offset %08x, expected %08x\n"), progname, xlogoff, (int) lseek(walfile, 0, SEEK_CUR)); goto error; } } bytes_left = r - STREAMING_HEADER_SIZE; bytes_written = 0; while (bytes_left) { int bytes_to_write; /* * If crossing a WAL boundary, only write up until we reach * XLOG_SEG_SIZE. */ if (xlogoff + bytes_left > XLOG_SEG_SIZE) bytes_to_write = XLOG_SEG_SIZE - xlogoff; else bytes_to_write = bytes_left; if (walfile == -1) { walfile = open_walfile(blockpos, timeline, basedir, current_walfile_name); if (walfile == -1) /* Error logged by open_walfile */ goto error; } if (write(walfile, copybuf + STREAMING_HEADER_SIZE + bytes_written, bytes_to_write) != bytes_to_write) { fprintf(stderr, _("%s: could not write %u bytes to WAL file \"%s\": %s\n"), progname, bytes_to_write, current_walfile_name, strerror(errno)); goto error; } /* Write was successful, advance our position */ bytes_written += bytes_to_write; bytes_left -= bytes_to_write; XLByteAdvance(blockpos, bytes_to_write); xlogoff += bytes_to_write; /* Did we reach the end of a WAL segment? */ if (blockpos.xrecoff % XLOG_SEG_SIZE == 0) { if (!close_walfile(basedir, current_walfile_name, false)) /* Error message written in close_walfile() */ goto error; xlogoff = 0; if (stream_stop != NULL) { /* * Callback when the segment finished, and return if it * told us to. */ if (stream_stop(blockpos, timeline, true)) return true; } } } /* No more data left to write, start receiving next copy packet */ } /* * The only way to get out of the loop is if the server shut down the * replication stream. If it's a controlled shutdown, the server will send * a shutdown message, and we'll return the latest xlog location that has * been streamed. */ res = PQgetResult(conn); if (PQresultStatus(res) != PGRES_COMMAND_OK) { fprintf(stderr, _("%s: unexpected termination of replication stream: %s"), progname, PQresultErrorMessage(res)); goto error; } PQclear(res); /* Complain if we've not reached stop point yet */ if (stream_stop != NULL && !stream_stop(blockpos, timeline, false)) { fprintf(stderr, _("%s: replication stream was terminated before stop point\n"), progname); goto error; } if (copybuf != NULL) PQfreemem(copybuf); if (walfile != -1 && close(walfile) != 0) fprintf(stderr, _("%s: could not close file \"%s\": %s\n"), progname, current_walfile_name, strerror(errno)); walfile = -1; return true; error: if (copybuf != NULL) PQfreemem(copybuf); if (walfile != -1 && close(walfile) != 0) fprintf(stderr, _("%s: could not close file \"%s\": %s\n"), progname, current_walfile_name, strerror(errno)); walfile = -1; return false; }
/* * Main entry point for bgwriter process * * This is invoked from AuxiliaryProcessMain, which has already created the * basic execution environment, but not enabled signals yet. */ void BackgroundWriterMain(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; bool prev_hibernate; /* * Properly accept or ignore signals the postmaster might send us. * * bgwriter doesn't participate in ProcSignal signalling, but a SIGUSR1 * handler is still needed for latch wakeups. */ pqsignal(SIGHUP, BgSigHupHandler); /* set flag to read config file */ pqsignal(SIGINT, SIG_IGN); pqsignal(SIGTERM, ReqShutdownHandler); /* shutdown */ pqsignal(SIGQUIT, bg_quickdie); /* hard crash time */ pqsignal(SIGALRM, SIG_IGN); pqsignal(SIGPIPE, SIG_IGN); pqsignal(SIGUSR1, bgwriter_sigusr1_handler); pqsignal(SIGUSR2, SIG_IGN); /* * Reset some signals that are accepted by postmaster but not here */ pqsignal(SIGCHLD, SIG_DFL); pqsignal(SIGTTIN, SIG_DFL); pqsignal(SIGTTOU, SIG_DFL); pqsignal(SIGCONT, SIG_DFL); pqsignal(SIGWINCH, SIG_DFL); /* We allow SIGQUIT (quickdie) at all times */ sigdelset(&BlockSig, SIGQUIT); /* * Create a resource owner to keep track of our resources (currently only * buffer pins). */ CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer"); /* * We just started, assume there has been either a shutdown or * end-of-recovery snapshot. */ last_snapshot_ts = GetCurrentTimestamp(); /* * Create a memory context that we will do all our work in. We do this so * that we can reset the context during error recovery and thereby avoid * possible memory leaks. Formerly this code just ran in * TopMemoryContext, but resetting that would be a really bad idea. */ bgwriter_context = AllocSetContextCreate(TopMemoryContext, "Background Writer", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(bgwriter_context); /* * If an exception is encountered, processing resumes here. * * See notes in postgres.c about the design of this coding. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) { /* Since not using PG_TRY, must reset error stack by hand */ error_context_stack = NULL; /* Prevent interrupts while cleaning up */ HOLD_INTERRUPTS(); /* Report the error to the server log */ EmitErrorReport(); /* * These operations are really just a minimal subset of * AbortTransaction(). We don't have very many resources to worry * about in bgwriter, but we do have LWLocks, buffers, and temp files. */ LWLockReleaseAll(); AbortBufferIO(); UnlockBuffers(); /* buffer pins are released here: */ ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true); /* we needn't bother with the other ResourceOwnerRelease phases */ AtEOXact_Buffers(false); AtEOXact_SMgr(); AtEOXact_Files(); AtEOXact_HashTables(false); /* * Now return to normal top-level context and clear ErrorContext for * next time. */ MemoryContextSwitchTo(bgwriter_context); FlushErrorState(); /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(bgwriter_context); /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); /* * Sleep at least 1 second after any error. A write error is likely * to be repeated, and we don't want to be filling the error logs as * fast as we can. */ pg_usleep(1000000L); /* * Close all open files after any error. This is helpful on Windows, * where holding deleted files open causes various strange errors. * It's not clear we need it elsewhere, but shouldn't hurt. */ smgrcloseall(); /* Report wait end here, when there is no further possibility of wait */ pgstat_report_wait_end(); } /* We can now handle ereport(ERROR) */ PG_exception_stack = &local_sigjmp_buf; /* * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); /* * Reset hibernation state after any error. */ prev_hibernate = false; /* * Loop forever */ for (;;) { bool can_hibernate; int rc; /* Clear any already-pending wakeups */ ResetLatch(MyLatch); if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (shutdown_requested) { /* * From here on, elog(ERROR) should end with exit(1), not send * control back to the sigsetjmp block above */ ExitOnAnyError = true; /* Normal exit from the bgwriter is here */ proc_exit(0); /* done */ } /* * Do one cycle of dirty-buffer writing. */ can_hibernate = BgBufferSync(); /* * Send off activity statistics to the stats collector */ pgstat_send_bgwriter(); if (FirstCallSinceLastCheckpoint()) { /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. */ smgrcloseall(); } /* * Log a new xl_running_xacts every now and then so replication can * get into a consistent state faster (think of suboverflowed * snapshots) and clean up resources (locks, KnownXids*) more * frequently. The costs of this are relatively low, so doing it 4 * times (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine. * * We assume the interval for writing xl_running_xacts is * significantly bigger than BgWriterDelay, so we don't complicate the * overall timeout handling but just assume we're going to get called * often enough even if hibernation mode is active. It's not that * important that log_snap_interval_ms is met strictly. To make sure * we're not waking the disk up unnecessarily on an idle system we * check whether there has been any WAL inserted since the last time * we've logged a running xacts. * * We do this logging in the bgwriter as its the only process that is * run regularly and returns to its mainloop all the time. E.g. * Checkpointer, when active, is barely ever in its mainloop and thus * makes it hard to log regularly. */ if (XLogStandbyInfoActive() && !RecoveryInProgress()) { TimestampTz timeout = 0; TimestampTz now = GetCurrentTimestamp(); timeout = TimestampTzPlusMilliseconds(last_snapshot_ts, LOG_SNAPSHOT_INTERVAL_MS); /* * only log if enough time has passed and some xlog record has * been inserted. */ if (now >= timeout && last_snapshot_lsn != GetXLogInsertRecPtr()) { last_snapshot_lsn = LogStandbySnapshot(); last_snapshot_ts = now; } } /* * Sleep until we are signaled or BgWriterDelay has elapsed. * * Note: the feedback control loop in BgBufferSync() expects that we * will call it every BgWriterDelay msec. While it's not critical for * correctness that that be exact, the feedback loop might misbehave * if we stray too far from that. Hence, avoid loading this process * down with latch events that are likely to happen frequently during * normal operation. */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, BgWriterDelay /* ms */ ); /* * If no latch event and BgBufferSync says nothing's happening, extend * the sleep in "hibernation" mode, where we sleep for much longer * than bgwriter_delay says. Fewer wakeups save electricity. When a * backend starts using buffers again, it will wake us up by setting * our latch. Because the extra sleep will persist only as long as no * buffer allocations happen, this should not distort the behavior of * BgBufferSync's control loop too badly; essentially, it will think * that the system-wide idle interval didn't exist. * * There is a race condition here, in that a backend might allocate a * buffer between the time BgBufferSync saw the alloc count as zero * and the time we call StrategyNotifyBgWriter. While it's not * critical that we not hibernate anyway, we try to reduce the odds of * that by only hibernating when BgBufferSync says nothing's happening * for two consecutive cycles. Also, we mitigate any possible * consequences of a missed wakeup by not hibernating forever. */ if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate) { /* Ask for notification at next buffer allocation */ StrategyNotifyBgWriter(MyProc->pgprocno); /* Sleep ... */ rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, BgWriterDelay * HIBERNATE_FACTOR); /* Reset the notification request in case we timed out */ StrategyNotifyBgWriter(-1); } /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (rc & WL_POSTMASTER_DEATH) exit(1); prev_hibernate = can_hibernate; } }
/* * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup() * to resolve conflicts with other backends holding buffer pins. * * We either resolve conflicts immediately or set a SIGALRM to wake us at * the limit of our patience. The sleep in LockBufferForCleanup() is * performed here, for code clarity. * * Resolve conflict by sending a SIGUSR1 reason to all backends to check if * they hold one of the buffer pins that is blocking Startup process. If so, * backends will take an appropriate error action, ERROR or FATAL. * * We also check for deadlocks before we wait, though applications that cause * these will be extremely rare. Deadlocks occur because if queries * wait on a lock, that must be behind an AccessExclusiveLock, which can only * be cleared if the Startup process replays a transaction completion record. * If Startup process is also waiting then that is a deadlock. The deadlock * can occur if the query is waiting and then the Startup sleeps, or if * Startup is sleeping and the query waits on a lock. We protect against * only the former sequence here, the latter sequence is checked prior to * the query sleeping, in CheckRecoveryConflictDeadlock(). */ void ResolveRecoveryConflictWithBufferPin(void) { bool sig_alarm_enabled = false; Assert(InHotStandby); if (MaxStandbyDelay == 0) { /* * We don't want to wait, so just tell everybody holding the pin to * get out of town. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); } else if (MaxStandbyDelay < 0) { /* * Send out a request to check for buffer pin deadlocks before we * wait. This is fairly cheap, so no need to wait for deadlock timeout * before trying to send it out. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); } else { TimestampTz then = GetLatestXLogTime(); TimestampTz now = GetCurrentTimestamp(); /* Are we past max_standby_delay? */ if (TimestampDifferenceExceeds(then, now, MaxStandbyDelay)) { /* * We're already behind, so clear a path as quickly as possible. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); } else { TimestampTz fin_time; /* Expected wake-up time by timer */ long timer_delay_secs; /* Amount of time we set timer * for */ int timer_delay_usecs; /* * Send out a request to check for buffer pin deadlocks before we * wait. This is fairly cheap, so no need to wait for deadlock * timeout before trying to send it out. */ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); /* * How much longer we should wait? */ fin_time = TimestampTzPlusMilliseconds(then, MaxStandbyDelay); TimestampDifference(now, fin_time, &timer_delay_secs, &timer_delay_usecs); /* * It's possible that the difference is less than a microsecond; * ensure we don't cancel, rather than set, the interrupt. */ if (timer_delay_secs == 0 && timer_delay_usecs == 0) timer_delay_usecs = 1; if (enable_standby_sig_alarm(timer_delay_secs, timer_delay_usecs, fin_time)) sig_alarm_enabled = true; else elog(FATAL, "could not set timer for process wakeup"); } } /* Wait to be signaled by UnpinBuffer() */ ProcWaitForSignal(); if (sig_alarm_enabled) { if (!disable_standby_sig_alarm()) elog(FATAL, "could not disable timer for process wakeup"); } }
static void grab_ExecutorEnd(QueryDesc * queryDesc) { Datum values[10]; bool nulls[10] = {false, false, false, false, false, false, false, false, false, false}; Relation dump_heap; RangeVar *dump_table_rv; HeapTuple tuple; Oid namespaceId; /* lookup schema */ namespaceId = GetSysCacheOid1(NAMESPACENAME, CStringGetDatum(EXTENSION_SCHEMA)); if (OidIsValid(namespaceId)) { /* lookup table */ if (OidIsValid(get_relname_relid(EXTENSION_LOG_TABLE, namespaceId))) { /* get table heap */ dump_table_rv = makeRangeVar(EXTENSION_SCHEMA, EXTENSION_LOG_TABLE, -1); dump_heap = heap_openrv(dump_table_rv, RowExclusiveLock); /* transaction info */ values[0] = Int32GetDatum(GetCurrentTransactionId()); values[1] = Int32GetDatum(GetCurrentCommandId(false)); values[2] = Int32GetDatum(MyProcPid); values[3] = Int32GetDatum(GetUserId()); /* query timing */ if (queryDesc->totaltime != NULL) { InstrEndLoop(queryDesc->totaltime); values[4] = TimestampGetDatum( TimestampTzPlusMilliseconds(GetCurrentTimestamp(), (queryDesc->totaltime->total * -1000.0))); values[5] = Float8GetDatum(queryDesc->totaltime->total); } else { nulls[4] = true; nulls[5] = true; } /* query command type */ values[6] = Int32GetDatum(queryDesc->operation); /* query text */ values[7] = CStringGetDatum( cstring_to_text(queryDesc->sourceText)); /* query params */ if (queryDesc->params != NULL) { int numParams = queryDesc->params->numParams; Oid out_func_oid, ptype; Datum pvalue; bool isvarlena; FmgrInfo *out_functions; bool arr_nulls[numParams]; size_t arr_nelems = (size_t) numParams; Datum *arr_val_elems = palloc(sizeof(Datum) * arr_nelems); Datum *arr_typ_elems = palloc(sizeof(Datum) * arr_nelems); char elem_val_byval, elem_val_align, elem_typ_byval, elem_typ_align; int16 elem_val_len, elem_typ_len; int elem_dims[1], elem_lbs[1]; int paramno; /* init */ out_functions = (FmgrInfo *) palloc( (numParams) * sizeof(FmgrInfo)); get_typlenbyvalalign(TEXTOID, &elem_val_len, &elem_val_byval, &elem_val_align); get_typlenbyvalalign(REGTYPEOID, &elem_typ_len, &elem_typ_byval, &elem_typ_align); elem_dims[0] = arr_nelems; elem_lbs[0] = 1; for (paramno = 0; paramno < numParams; paramno++) { pvalue = queryDesc->params->params[paramno].value; ptype = queryDesc->params->params[paramno].ptype; getTypeOutputInfo(ptype, &out_func_oid, &isvarlena); fmgr_info(out_func_oid, &out_functions[paramno]); arr_typ_elems[paramno] = ptype; arr_nulls[paramno] = true; if (!queryDesc->params->params[paramno].isnull) { arr_nulls[paramno] = false; arr_val_elems[paramno] = PointerGetDatum( cstring_to_text( OutputFunctionCall(&out_functions[paramno], pvalue))); } } values[8] = PointerGetDatum( construct_md_array( arr_val_elems, arr_nulls, 1, elem_dims, elem_lbs, TEXTOID, elem_val_len, elem_val_byval, elem_val_align)); values[9] = PointerGetDatum( construct_array( arr_typ_elems, arr_nelems, REGTYPEOID, elem_typ_len, elem_typ_byval, elem_typ_align)); pfree(out_functions); pfree(arr_val_elems); } else { nulls[8] = true; nulls[9] = true; } /* insert */ tuple = heap_form_tuple(dump_heap->rd_att, values, nulls); simple_heap_insert(dump_heap, tuple); heap_close(dump_heap, RowExclusiveLock); } } if (prev_ExecutorEnd) prev_ExecutorEnd(queryDesc); else standard_ExecutorEnd(queryDesc); }
/* * Enable the SIGALRM interrupt to fire after the specified delay * * Delay is given in milliseconds. Caller should be sure a SIGALRM * signal handler is installed before this is called. * * This code properly handles nesting of deadlock timeout alarms within * statement timeout alarms. * * Returns TRUE if okay, FALSE on failure. */ bool enable_sig_alarm(int delayms, bool is_statement_timeout) { TimestampTz fin_time; struct itimerval timeval; if (is_statement_timeout) { /* * Begin statement-level timeout * * Note that we compute statement_fin_time with reference to the * statement_timestamp, but apply the specified delay without any * correction; that is, we ignore whatever time has elapsed since * statement_timestamp was set. In the normal case only a small * interval will have elapsed and so this doesn't matter, but there * are corner cases (involving multi-statement query strings with * embedded COMMIT or ROLLBACK) where we might re-initialize the * statement timeout long after initial receipt of the message. In * such cases the enforcement of the statement timeout will be a bit * inconsistent. This annoyance is judged not worth the cost of * performing an additional gettimeofday() here. */ Assert(!deadlock_timeout_active); fin_time = GetCurrentStatementStartTimestamp(); fin_time = TimestampTzPlusMilliseconds(fin_time, delayms); statement_fin_time = fin_time; cancel_from_timeout = false; statement_timeout_active = true; } else if (statement_timeout_active) { /* * Begin deadlock timeout with statement-level timeout active * * Here, we want to interrupt at the closer of the two timeout times. * If fin_time >= statement_fin_time then we need not touch the * existing timer setting; else set up to interrupt at the deadlock * timeout time. * * NOTE: in this case it is possible that this routine will be * interrupted by the previously-set timer alarm. This is okay * because the signal handler will do only what it should do according * to the state variables. The deadlock checker may get run earlier * than normal, but that does no harm. */ fin_time = GetCurrentTimestamp(); fin_time = TimestampTzPlusMilliseconds(fin_time, delayms); deadlock_timeout_active = true; if (fin_time >= statement_fin_time) return true; } else { /* Begin deadlock timeout with no statement-level timeout */ deadlock_timeout_active = true; } /* If we reach here, okay to set the timer interrupt */ MemSet(&timeval, 0, sizeof(struct itimerval)); timeval.it_value.tv_sec = delayms / 1000; timeval.it_value.tv_usec = (delayms % 1000) * 1000; if (setitimer(ITIMER_REAL, &timeval, NULL)) return false; return true; }